blob: 249a257177e1c396cedf052e0c89db6e6bd4046f [file] [log] [blame]
/*
* ARM AdvSIMD / SVE Vector Operations
*
* Copyright (c) 2026 Linaro
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "helper.h"
#include "helper-a64.h"
#include "helper-sme.h"
#include "helper-sve.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)
DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
#define nop(N, M, S) (M)
DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
#undef nop
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
int shift = simd_data(desc) * 8;
intptr_t i, opr_sz = simd_oprsz(desc);
uint64_t *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; ++i) {
d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
}
}
void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
intptr_t sel = H4(simd_data(desc));
intptr_t i, opr_sz = simd_oprsz(desc);
uint32_t *n = vn, *m = vm;
uint64_t *d = vd;
for (i = 0; i < opr_sz / 8; ++i) {
d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
}
}
DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, /**/)
DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, /**/)
void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
{
const uint8_t *indices = vm;
size_t oprsz = simd_oprsz(desc);
uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
union {
uint8_t b[16];
uint64_t d[2];
} result;
/*
* We must construct the final result in a temp, lest the output
* overlaps the input table. For TBL, begin with zero; for TBX,
* begin with the original register contents. Note that we always
* copy 16 bytes here to avoid an extra branch; clearing the high
* bits of the register for oprsz == 8 is handled below.
*/
if (is_tbx) {
memcpy(&result, vd, 16);
} else {
memset(&result, 0, 16);
}
for (size_t i = 0; i < oprsz; ++i) {
uint32_t index = indices[H1(i)];
if (index < table_len) {
/*
* Convert index (a byte offset into the virtual table
* which is a series of 128-bit vectors concatenated)
* into the correct register element, bearing in mind
* that the table can wrap around from V31 to V0.
*/
const uint8_t *table = (const uint8_t *)
aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
result.b[H1(i)] = table[H1(index % 16)];
}
}
memcpy(vd, &result, 16);
clear_tail(vd, oprsz, simd_maxsz(desc));
}