/* target/arm/tcg/vec_helper64.c - qemu - Git at Google */

 /*
  * ARM AdvSIMD / SVE Vector Operations
  *
  * Copyright (c) 2026 Linaro
  *
  * SPDX-License-Identifier: GPL-2.0-or-later
  */

 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "helper.h"
 #include "helper-a64.h"
 #include "helper-sme.h"
 #include "helper-sve.h"
 #include "tcg/tcg-gvec-desc.h"
 #include "fpu/softfloat.h"
 #include "qemu/int128.h"
 #include "crypto/clmul.h"
 #include "vec_internal.h"

 /*
  * Three-operand vector helper instantiations.
  *
  * DO_3OP is not defined in this file (presumably it comes from
  * vec_internal.h -- confirm).  Each invocation passes, in order: the
  * helper name to create, the function applied to the elements, and the
  * element type.
  */

 /* Division, one helper per float width. */
 DO_3OP(gvec_fdiv_h, float16_div, float16)
 DO_3OP(gvec_fdiv_s, float32_div, float32)
 DO_3OP(gvec_fdiv_d, float64_div, float64)

 /* FMULX, built on the existing scalar mulx helpers. */
 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

 /* Reciprocal step, built on the helper_recpsf_* functions. */
 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

 /* Reciprocal square-root step, built on the helper_rsqrtsf_* functions. */
 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

 /*
  * The "_ah" variants below use the *_ah_* per-element functions
  * (presumably the FPCR.AH alternate-handling behaviour -- confirm
  * against the helper definitions).
  */
 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

 /* bfloat16 max/min/maxnum/minnum, plus their "_ah" max/min forms. */
 DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
 DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
 DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
 DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
 DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
 DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)

 /*
  * Indexed FMULX helpers.
  *
  * DO_FMUL_IDX is defined outside this file; its second argument is a
  * three-parameter macro.  "nop" discards its first and third arguments
  * and yields the second unchanged.  It is only needed for these three
  * instantiations, so it is undefined again immediately afterwards.
  */
 #define nop(N, M, S) (M)

 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

 #undef nop

 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int shift = simd_data(desc) * 8;
     intptr_t i, opr_sz = simd_oprsz(desc);
     uint64_t *d = vd, *n = vn, *m = vm;

     for (i = 0; i < opr_sz / 8; ++i) {
         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
     }
 }

 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t sel = H4(simd_data(desc));
     intptr_t i, opr_sz = simd_oprsz(desc);
     uint32_t *n = vn, *m = vm;
     uint64_t *d = vd;

     for (i = 0; i < opr_sz / 8; ++i) {
         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
     }
 }

 /*
  * Pairwise "_ah" max/min helper instantiations.
  *
  * DO_3OP_PAIR is defined outside this file.  Compared with DO_3OP it
  * takes a fourth argument: H2 for float16 and H4 for float32.  For the
  * float64 forms the argument is intentionally empty, written as an
  * empty comment so the macro still receives four arguments.
  */
 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, /**/)

 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, /**/)

 /*
  * AdvSIMD TBL/TBX table lookup.
  *
  * vd:   destination register (also the initial contents for TBX)
  * vm:   register holding one byte index per result byte
  * env:  CPU state, used to reach the table registers
  * desc: simd descriptor; its data field packs, from the low bits up:
  *       5 bits   - number of the first table register
  *       1 bit    - set for TBX, clear for TBL
  *       the rest - table length in bytes
  */
 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
 {
     union {
         uint8_t b[16];
         uint64_t d[2];
     } acc;
     const uint8_t *idx_bytes = vm;
     size_t nbytes = simd_oprsz(desc);
     uint32_t first_reg = extract32(desc, SIMD_DATA_SHIFT, 5);
     bool keep_dest = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
     uint32_t table_bytes = desc >> (SIMD_DATA_SHIFT + 6);
     size_t pos;

     /*
      * The result is assembled in a local buffer because the
      * destination may be one of the table registers.  TBX starts from
      * the current destination contents, TBL from zero.  A full 16
      * bytes is seeded either way; for an 8-byte operation the upper
      * half is cleared by clear_tail() at the end.
      */
     if (!keep_dest) {
         memset(&acc, 0, 16);
     } else {
         memcpy(&acc, vd, 16);
     }

     for (pos = 0; pos < nbytes; pos++) {
         uint32_t sel = idx_bytes[H1(pos)];
         const uint8_t *reg_bytes;

         /* Out-of-range indices leave the seeded byte untouched. */
         if (sel >= table_bytes) {
             continue;
         }

         /*
          * sel is a byte offset into the concatenation of 128-bit
          * table registers: the upper bits pick the register (wrapping
          * from V31 back to V0), the low four bits pick the byte.
          */
         reg_bytes = (const uint8_t *)
             aa64_vfp_qreg(env, (first_reg + (sel / 16)) % 32);
         acc.b[H1(pos)] = reg_bytes[H1(sel & 15)];
     }

     memcpy(vd, &acc, 16);
     clear_tail(vd, nbytes, simd_maxsz(desc));
 }
	/*
	* ARM AdvSIMD / SVE Vector Operations
	*
	* Copyright (c) 2026 Linaro
	*
	* SPDX-License-Identifier: GPL-2.0-or-later
	*/

	#include "qemu/osdep.h"
	#include "cpu.h"
	#include "helper.h"
	#include "helper-a64.h"
	#include "helper-sme.h"
	#include "helper-sve.h"
	#include "tcg/tcg-gvec-desc.h"
	#include "fpu/softfloat.h"
	#include "qemu/int128.h"
	#include "crypto/clmul.h"
	#include "vec_internal.h"

	/*
	 * Instantiate three-operand vector helpers with DO_3OP, which is
	 * defined outside this file.  Arguments are: new helper name,
	 * per-element function, element type.  Groups below cover fdiv,
	 * fmulx, recps, rsqrts, the "_ah" variants of recps/rsqrts/fmax/fmin,
	 * and the bfloat16 max/min family.
	 */
	DO_3OP(gvec_fdiv_h, float16_div, float16)
	DO_3OP(gvec_fdiv_s, float32_div, float32)
	DO_3OP(gvec_fdiv_d, float64_div, float64)

	DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
	DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
	DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

	DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
	DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
	DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

	DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
	DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
	DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

	DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
	DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
	DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

	DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
	DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
	DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

	DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
	DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
	DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

	DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
	DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
	DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

	DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
	DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
	DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
	DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
	DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
	DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)

	/*
	 * "nop" ignores its first and third arguments and expands to its
	 * second.  It is passed as the second argument of DO_FMUL_IDX
	 * (defined elsewhere) for the indexed FMULX helpers, then removed.
	 */
	#define nop(N, M, S) (M)

	DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
	DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
	DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

	#undef nop

	/*
	 * SVE2 PMULL producing halfword results.
	 *
	 * Each 64-bit word of vn and vm is shifted right by
	 * simd_data(desc) * 8 bits and the pair is passed to
	 * clmul_8x4_even(); the result is written to the same word of vd.
	 * simd_oprsz(desc) / 8 words are processed.
	 *
	 * vd, vn and vm are pointers to vector registers.  The pointer
	 * declarators had been lost from this copy of the function, leaving
	 * "void" parameters and integer locals that were then indexed.
	 */
	void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
	{
	    int shift = simd_data(desc) * 8;
	    intptr_t i, opr_sz = simd_oprsz(desc);
	    uint64_t *d = vd, *n = vn, *m = vm;

	    for (i = 0; i < opr_sz / 8; ++i) {
	        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
	    }
	}

	/*
	 * SVE2 PMULL producing doubleword results.
	 *
	 * vn and vm are read as 32-bit elements and vd is written as 64-bit
	 * elements.  Destination element i receives
	 * clmul_32(n[2 * i + sel], m[2 * i + sel]), with
	 * sel = H4(simd_data(desc)).  simd_oprsz(desc) / 8 elements are
	 * written.
	 *
	 * vd, vn and vm are pointers to vector registers.  The pointer
	 * declarators had been lost from this copy of the function, leaving
	 * "void" parameters and integer locals that were then indexed.
	 */
	void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
	{
	    intptr_t sel = H4(simd_data(desc));
	    intptr_t i, opr_sz = simd_oprsz(desc);
	    uint32_t *n = vn, *m = vm;
	    uint64_t *d = vd;

	    for (i = 0; i < opr_sz / 8; ++i) {
	        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
	    }
	}

	/*
	 * Pairwise "_ah" fmax/fmin helpers via DO_3OP_PAIR (defined
	 * elsewhere).  The last argument is H2 or H4 for the 16- and 32-bit
	 * types and is deliberately left empty for float64.
	 */
	DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
	DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
	DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, /**/)

	DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
	DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
	DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, /**/)

	/*
	 * AdvSIMD TBL/TBX table lookup.
	 *
	 * vd:   destination register; for TBX also the starting value
	 * vm:   register of byte indices, one per result byte
	 * env:  CPU state, used to fetch the table registers
	 * desc: simd descriptor.  Starting at SIMD_DATA_SHIFT it holds the
	 *       first table register number (5 bits), the TBX flag (1 bit)
	 *       and, in the remaining bits, the table length in bytes.
	 *
	 * The pointer declarators on vd, vm and the local "table" had been
	 * lost from this copy of the function; they are restored here.
	 */
	void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
	{
	    const uint8_t *indices = vm;
	    size_t oprsz = simd_oprsz(desc);
	    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
	    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
	    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
	    union {
	        uint8_t b[16];
	        uint64_t d[2];
	    } result;

	    /*
	     * We must construct the final result in a temp, lest the output
	     * overlaps the input table.  For TBL, begin with zero; for TBX,
	     * begin with the original register contents.  Note that we always
	     * copy 16 bytes here to avoid an extra branch; clearing the high
	     * bits of the register for oprsz == 8 is handled below.
	     */
	    if (is_tbx) {
	        memcpy(&result, vd, 16);
	    } else {
	        memset(&result, 0, 16);
	    }

	    for (size_t i = 0; i < oprsz; ++i) {
	        uint32_t index = indices[H1(i)];

	        if (index < table_len) {
	            /*
	             * Convert index (a byte offset into the virtual table
	             * which is a series of 128-bit vectors concatenated)
	             * into the correct register element, bearing in mind
	             * that the table can wrap around from V31 to V0.
	             */
	            const uint8_t *table = (const uint8_t *)
	                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
	            result.b[H1(i)] = table[H1(index % 16)];
	        }
	    }

	    memcpy(vd, &result, 16);
	    clear_tail(vd, oprsz, simd_maxsz(desc));
	}