/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

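/*
 * For example, with round set and neg clear, do_sqrdmlah_b(-128, -128, 0)
 * computes ((-128 * -128) + (1 << 6)) >> 7 = 128, which does not fit in
 * int8_t and so saturates to INT8_MAX.  The SVE2 helpers below apply this
 * scalar primitive to each byte element of the vector.
 */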
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

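/*
 * The Neon scalar helpers below operate on two int16_t values packed into
 * a uint32_t, and record saturation in the sticky QC flag (env->vfp.qc[0])
 * instead of returning it.
 */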
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

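/*
 * In the by-element (_idx) helpers, the index selects one element of m
 * within each 128-bit segment, and that element is multiplied against
 * every element of n in the same segment.  eltspersegment is clamped to
 * the element count so that 8-byte vectors are handled correctly.
 */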
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
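/*
 * Saturate a 128-bit value to 64 bits: the value overflows int64_t exactly
 * when the high half is not the sign-extension of the low half.
 */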
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

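/*
 * Each output lane thus computes, with the products widened to TYPED:
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 */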
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

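/*
 * For the indexed forms, the index selects one group of four m elements;
 * that group is loaded once per segment and reused for the output lanes
 * of that segment.  The MIN on the first segment allows for 8-byte
 * AdvSIMD vectors, which are shorter than a full 16-byte segment.
 */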
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

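/*
 * FCADD: the vectors hold pairs of (real, imaginary) elements.  The single
 * desc bit selects whether the real or the imaginary input taken from m is
 * negated before the add, distinguishing the two rotations of the
 * instruction.
 */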
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

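/*
 * FCMLA: fused complex multiply-add.  'flip' selects whether the real or
 * the imaginary element of n feeds both products, while neg_real/neg_imag
 * optionally negate the m element used for the real and imaginary result
 * respectively; together these encode the four rotations.
 */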
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

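/*
 * Compare-against-zero forms.  The REV wrappers swap the operands, so that,
 * for example, "less than zero" reuses the greater-than primitive as
 * 0 > op.
 */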
#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
    {                                                        \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);        \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
    {                                                        \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);        \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)               \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)               \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)  \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
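/* That is, 2.0 - (op1 * op2), with the 0 * Inf case returning exactly 2.0
 * rather than the default NaN. */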
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}
1466
1467static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1468 float_status *stat)
1469{
1470 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1471}
1472
1473static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1474 float_status *stat)
1475{
1476 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1477}
1478
Peter Maydellcf722d72020-08-28 19:33:40 +01001479/* Fused versions; these have the semantics Neon VFMA/VFMS want */
1480static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1481 float_status *stat)
1482{
1483 return float16_muladd(op1, op2, dest, 0, stat);
1484}
1485
1486static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1487 float_status *stat)
1488{
1489 return float32_muladd(op1, op2, dest, 0, stat);
1490}
1491
Richard Henderson2d558ef2024-05-24 16:20:38 -07001492static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1493 float_status *stat)
1494{
1495 return float64_muladd(op1, op2, dest, 0, stat);
1496}
1497
Peter Maydellcf722d72020-08-28 19:33:40 +01001498static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1499 float_status *stat)
1500{
1501 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1502}
1503
1504static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1505 float_status *stat)
1506{
1507 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1508}
1509
Richard Henderson2d558ef2024-05-24 16:20:38 -07001510static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1511 float_status *stat)
1512{
1513 return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1514}
1515
Peter Maydellcf722d72020-08-28 19:33:40 +01001516#define DO_MULADD(NAME, FUNC, TYPE) \
Peter Maydelle5adc702020-08-28 19:33:39 +01001517void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1518{ \
1519 intptr_t i, oprsz = simd_oprsz(desc); \
1520 TYPE *d = vd, *n = vn, *m = vm; \
1521 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1522 d[i] = FUNC(d[i], n[i], m[i], stat); \
1523 } \
1524 clear_tail(d, oprsz, simd_maxsz(desc)); \
1525}
1526
1527DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1528DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1529
1530DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1531DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1532
Peter Maydellcf722d72020-08-28 19:33:40 +01001533DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1534DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
Richard Henderson2d558ef2024-05-24 16:20:38 -07001535DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
Peter Maydellcf722d72020-08-28 19:33:40 +01001536
1537DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1538DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
Richard Henderson2d558ef2024-05-24 16:20:38 -07001539DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
Peter Maydellcf722d72020-08-28 19:33:40 +01001540
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001541/* For the indexed ops, SVE applies the index per 128-bit vector segment.
1542 * For AdvSIMD, there is of course only one such vector segment.
1543 */
1544
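/*
 * As a concrete example: with 32-bit elements a segment is 4 lanes.
 * For a 256-bit SVE vector and idx == 1, result lanes 0..3 use m[1] as
 * the multiplier and lanes 4..7 use m[5]; the index is re-applied
 * within each 16-byte segment.  A 128-bit AdvSIMD operation has a
 * single segment, so every lane uses m[idx].
 */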
1545#define DO_MUL_IDX(NAME, TYPE, H) \
Richard Henderson2e5a2652020-08-28 10:02:50 +01001546void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1547{ \
Peter Maydelld7ce81e2020-08-28 19:33:51 +01001548 intptr_t i, j, oprsz = simd_oprsz(desc); \
1549 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
Richard Henderson2e5a2652020-08-28 10:02:50 +01001550 intptr_t idx = simd_data(desc); \
1551 TYPE *d = vd, *n = vn, *m = vm; \
1552 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1553 TYPE mm = m[H(i + idx)]; \
1554 for (j = 0; j < segment; j++) { \
1555 d[i + j] = n[i + j] * mm; \
1556 } \
1557 } \
1558 clear_tail(d, oprsz, simd_maxsz(desc)); \
1559}
1560
1561DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1562DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
Peter Maydell6e802db2021-06-14 16:09:11 +01001563DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
Richard Henderson2e5a2652020-08-28 10:02:50 +01001564
1565#undef DO_MUL_IDX
1566
Richard Henderson36074402020-08-28 10:02:50 +01001567#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1568void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1569{ \
Peter Maydelld7ce81e2020-08-28 19:33:51 +01001570 intptr_t i, j, oprsz = simd_oprsz(desc); \
1571 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
Richard Henderson36074402020-08-28 10:02:50 +01001572 intptr_t idx = simd_data(desc); \
1573 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1574 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1575 TYPE mm = m[H(i + idx)]; \
1576 for (j = 0; j < segment; j++) { \
1577 d[i + j] = a[i + j] OP n[i + j] * mm; \
1578 } \
1579 } \
1580 clear_tail(d, oprsz, simd_maxsz(desc)); \
1581}
1582
1583DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1584DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
Peter Maydell6e802db2021-06-14 16:09:11 +01001585DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
Richard Henderson36074402020-08-28 10:02:50 +01001586
1587DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1588DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
Peter Maydell6e802db2021-06-14 16:09:11 +01001589DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
Richard Henderson36074402020-08-28 10:02:50 +01001590
1591#undef DO_MLA_IDX
1592
Richard Hendersoncb1c77f2024-05-24 16:20:32 -07001593#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001594void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1595{ \
Peter Maydelld7ce81e2020-08-28 19:33:51 +01001596 intptr_t i, j, oprsz = simd_oprsz(desc); \
1597 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001598 intptr_t idx = simd_data(desc); \
1599 TYPE *d = vd, *n = vn, *m = vm; \
1600 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1601 TYPE mm = m[H(i + idx)]; \
1602 for (j = 0; j < segment; j++) { \
Richard Hendersoncb1c77f2024-05-24 16:20:32 -07001603 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001604 } \
1605 } \
Richard Henderson525d9b62020-05-13 09:32:43 -07001606 clear_tail(d, oprsz, simd_maxsz(desc)); \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001607}
1608
Richard Hendersoncb1c77f2024-05-24 16:20:32 -07001609#define nop(N, M, S) (M)
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001610
Richard Hendersoncb1c77f2024-05-24 16:20:32 -07001611DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1612DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1613DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1614
1615#ifdef TARGET_AARCH64
1616
1617DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1618DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1619DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1620
1621#endif
1622
1623#undef nop
Peter Maydellc50d8d12020-08-28 19:33:52 +01001624
1625/*
1626 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1627 * the fused ops below, these accumulate both from and into Vd.
1628 */
Richard Hendersoncb1c77f2024-05-24 16:20:32 -07001629DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1630DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1631DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1632DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
Peter Maydellc50d8d12020-08-28 19:33:52 +01001633
Richard Henderson2e5a2652020-08-28 10:02:50 +01001634#undef DO_FMUL_IDX
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001635
1636#define DO_FMLA_IDX(NAME, TYPE, H) \
1637void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1638 void *stat, uint32_t desc) \
1639{ \
Peter Maydelld7ce81e2020-08-28 19:33:51 +01001640 intptr_t i, j, oprsz = simd_oprsz(desc); \
1641 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001642 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1643 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1644 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1645 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1646 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1647 TYPE mm = m[H(i + idx)]; \
1648 for (j = 0; j < segment; j++) { \
1649 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1650 mm, a[i + j], 0, stat); \
1651 } \
1652 } \
Richard Henderson525d9b62020-05-13 09:32:43 -07001653 clear_tail(d, oprsz, simd_maxsz(desc)); \
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001654}
1655
1656DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1657DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
Peter Maydell6e802db2021-06-14 16:09:11 +01001658DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
Richard Hendersonca40a6e2018-06-29 15:11:08 +01001659
1660#undef DO_FMLA_IDX
Richard Henderson89e68b52019-02-15 09:56:41 +00001661
1662#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1663void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1664{ \
1665 intptr_t i, oprsz = simd_oprsz(desc); \
1666 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1667 bool q = false; \
1668 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1669 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1670 if (dd < MIN) { \
1671 dd = MIN; \
1672 q = true; \
1673 } else if (dd > MAX) { \
1674 dd = MAX; \
1675 q = true; \
1676 } \
1677 d[i] = dd; \
1678 } \
1679 if (q) { \
1680 uint32_t *qc = vq; \
1681 qc[0] = 1; \
1682 } \
1683 clear_tail(d, oprsz, simd_maxsz(desc)); \
1684}
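
/*
 * Worked example for gvec_uqadd_b: n[i] = 200 and m[i] = 100 widen to a
 * sum of 300, which exceeds UINT8_MAX, so the lane saturates to 255 and
 * the sticky QC flag in vq[0] is set.  Computing in the wider WTYPE
 * (int, or int64_t for 32-bit lanes) is what makes the overflow test a
 * simple range check.
 */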
1685
1686DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1687DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1688DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1689
1690DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1691DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1692DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1693
1694DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1695DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1696DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1697
1698DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1699DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1700DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1701
Richard Henderson8f6343a2024-05-28 13:30:15 -07001702DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1703DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1704DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1705
1706DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1707DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1708DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1709
Richard Henderson89e68b52019-02-15 09:56:41 +00001710#undef DO_SAT
1711
1712void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1713 void *vm, uint32_t desc)
1714{
1715 intptr_t i, oprsz = simd_oprsz(desc);
1716 uint64_t *d = vd, *n = vn, *m = vm;
1717 bool q = false;
1718
1719 for (i = 0; i < oprsz / 8; i++) {
1720 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1721 if (dd < nn) {
1722 dd = UINT64_MAX;
1723 q = true;
1724 }
1725 d[i] = dd;
1726 }
1727 if (q) {
1728 uint32_t *qc = vq;
1729 qc[0] = 1;
1730 }
1731 clear_tail(d, oprsz, simd_maxsz(desc));
1732}
1733
1734void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1735 void *vm, uint32_t desc)
1736{
1737 intptr_t i, oprsz = simd_oprsz(desc);
1738 uint64_t *d = vd, *n = vn, *m = vm;
1739 bool q = false;
1740
1741 for (i = 0; i < oprsz / 8; i++) {
1742 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1743 if (nn < mm) {
1744 dd = 0;
1745 q = true;
1746 }
1747 d[i] = dd;
1748 }
1749 if (q) {
1750 uint32_t *qc = vq;
1751 qc[0] = 1;
1752 }
1753 clear_tail(d, oprsz, simd_maxsz(desc));
1754}
1755
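/*
 * For the signed 64-bit cases below there is no wider type to compute
 * in, so overflow is detected from the sign bits: addition overflows
 * iff both operands have the same sign and the result's sign differs,
 * which is what ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests (subtraction
 * checks differing operand signs instead).  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX when nn is non-negative and
 * INT64_MIN when nn is negative.
 */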
1756void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1757 void *vm, uint32_t desc)
1758{
1759 intptr_t i, oprsz = simd_oprsz(desc);
1760 int64_t *d = vd, *n = vn, *m = vm;
1761 bool q = false;
1762
1763 for (i = 0; i < oprsz / 8; i++) {
1764 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1765 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1766 dd = (nn >> 63) ^ ~INT64_MIN;
1767 q = true;
1768 }
1769 d[i] = dd;
1770 }
1771 if (q) {
1772 uint32_t *qc = vq;
1773 qc[0] = 1;
1774 }
1775 clear_tail(d, oprsz, simd_maxsz(desc));
1776}
1777
1778void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1779 void *vm, uint32_t desc)
1780{
1781 intptr_t i, oprsz = simd_oprsz(desc);
1782 int64_t *d = vd, *n = vn, *m = vm;
1783 bool q = false;
1784
1785 for (i = 0; i < oprsz / 8; i++) {
1786 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1787 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1788 dd = (nn >> 63) ^ ~INT64_MIN;
1789 q = true;
1790 }
1791 d[i] = dd;
1792 }
1793 if (q) {
1794 uint32_t *qc = vq;
1795 qc[0] = 1;
1796 }
1797 clear_tail(d, oprsz, simd_maxsz(desc));
1798}
Richard Hendersona4e943a2019-02-28 10:55:16 +00001799
Richard Henderson8f6343a2024-05-28 13:30:15 -07001800void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1801 void *vm, uint32_t desc)
1802{
1803 intptr_t i, oprsz = simd_oprsz(desc);
1804 uint64_t *d = vd, *n = vn, *m = vm;
1805 bool q = false;
1806
1807 for (i = 0; i < oprsz / 8; i++) {
1808 uint64_t nn = n[i];
1809 int64_t mm = m[i];
1810 uint64_t dd = nn + mm;
1811
1812 if (mm < 0) {
1813 if (nn < (uint64_t)-mm) {
1814 dd = 0;
1815 q = true;
1816 }
1817 } else {
1818 if (dd < nn) {
1819 dd = UINT64_MAX;
1820 q = true;
1821 }
1822 }
1823 d[i] = dd;
1824 }
1825 if (q) {
1826 uint32_t *qc = vq;
1827 qc[0] = 1;
1828 }
1829 clear_tail(d, oprsz, simd_maxsz(desc));
1830}
1831
1832void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1833 void *vm, uint32_t desc)
1834{
1835 intptr_t i, oprsz = simd_oprsz(desc);
1836 uint64_t *d = vd, *n = vn, *m = vm;
1837 bool q = false;
1838
1839 for (i = 0; i < oprsz / 8; i++) {
1840 int64_t nn = n[i];
1841 uint64_t mm = m[i];
1842 int64_t dd = nn + mm;
1843
1844 if (mm > (uint64_t)(INT64_MAX - nn)) {
1845 dd = INT64_MAX;
1846 q = true;
1847 }
1848 d[i] = dd;
1849 }
1850 if (q) {
1851 uint32_t *qc = vq;
1852 qc[0] = 1;
1853 }
1854 clear_tail(d, oprsz, simd_maxsz(desc));
1855}
Richard Henderson631e5652020-05-13 09:32:30 -07001856
1857#define DO_SRA(NAME, TYPE) \
1858void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1859{ \
1860 intptr_t i, oprsz = simd_oprsz(desc); \
1861 int shift = simd_data(desc); \
1862 TYPE *d = vd, *n = vn; \
1863 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1864 d[i] += n[i] >> shift; \
1865 } \
1866 clear_tail(d, oprsz, simd_maxsz(desc)); \
1867}
1868
1869DO_SRA(gvec_ssra_b, int8_t)
1870DO_SRA(gvec_ssra_h, int16_t)
1871DO_SRA(gvec_ssra_s, int32_t)
1872DO_SRA(gvec_ssra_d, int64_t)
1873
1874DO_SRA(gvec_usra_b, uint8_t)
1875DO_SRA(gvec_usra_h, uint16_t)
1876DO_SRA(gvec_usra_s, uint32_t)
1877DO_SRA(gvec_usra_d, uint64_t)
1878
1879#undef DO_SRA
1880
Richard Henderson6ccd48d2020-05-13 09:32:31 -07001881#define DO_RSHR(NAME, TYPE) \
1882void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1883{ \
1884 intptr_t i, oprsz = simd_oprsz(desc); \
1885 int shift = simd_data(desc); \
1886 TYPE *d = vd, *n = vn; \
1887 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1888 TYPE tmp = n[i] >> (shift - 1); \
1889 d[i] = (tmp >> 1) + (tmp & 1); \
1890 } \
1891 clear_tail(d, oprsz, simd_maxsz(desc)); \
1892}
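
/*
 * Worked example: for an unsigned byte n = 7 with shift = 2, tmp is
 * 7 >> 1 = 3 and the result is (3 >> 1) + (3 & 1) = 2, i.e. 7/4 rounded
 * with the last bit shifted out added back in.  This is equivalent to
 * (n + (1 << (shift - 1))) >> shift, but splitting the shift in two
 * avoids overflowing the element type when adding the rounding bit.
 */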
1893
1894DO_RSHR(gvec_srshr_b, int8_t)
1895DO_RSHR(gvec_srshr_h, int16_t)
1896DO_RSHR(gvec_srshr_s, int32_t)
1897DO_RSHR(gvec_srshr_d, int64_t)
1898
1899DO_RSHR(gvec_urshr_b, uint8_t)
1900DO_RSHR(gvec_urshr_h, uint16_t)
1901DO_RSHR(gvec_urshr_s, uint32_t)
1902DO_RSHR(gvec_urshr_d, uint64_t)
1903
1904#undef DO_RSHR
1905
1906#define DO_RSRA(NAME, TYPE) \
1907void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1908{ \
1909 intptr_t i, oprsz = simd_oprsz(desc); \
1910 int shift = simd_data(desc); \
1911 TYPE *d = vd, *n = vn; \
1912 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1913 TYPE tmp = n[i] >> (shift - 1); \
1914 d[i] += (tmp >> 1) + (tmp & 1); \
1915 } \
1916 clear_tail(d, oprsz, simd_maxsz(desc)); \
1917}
1918
1919DO_RSRA(gvec_srsra_b, int8_t)
1920DO_RSRA(gvec_srsra_h, int16_t)
1921DO_RSRA(gvec_srsra_s, int32_t)
1922DO_RSRA(gvec_srsra_d, int64_t)
1923
1924DO_RSRA(gvec_ursra_b, uint8_t)
1925DO_RSRA(gvec_ursra_h, uint16_t)
1926DO_RSRA(gvec_ursra_s, uint32_t)
1927DO_RSRA(gvec_ursra_d, uint64_t)
1928
1929#undef DO_RSRA
1930
Richard Henderson893ab052020-05-13 09:32:32 -07001931#define DO_SRI(NAME, TYPE) \
1932void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1933{ \
1934 intptr_t i, oprsz = simd_oprsz(desc); \
1935 int shift = simd_data(desc); \
1936 TYPE *d = vd, *n = vn; \
1937 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1938 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1939 } \
1940 clear_tail(d, oprsz, simd_maxsz(desc)); \
1941}
1942
1943DO_SRI(gvec_sri_b, uint8_t)
1944DO_SRI(gvec_sri_h, uint16_t)
1945DO_SRI(gvec_sri_s, uint32_t)
1946DO_SRI(gvec_sri_d, uint64_t)
1947
1948#undef DO_SRI
1949
1950#define DO_SLI(NAME, TYPE) \
1951void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1952{ \
1953 intptr_t i, oprsz = simd_oprsz(desc); \
1954 int shift = simd_data(desc); \
1955 TYPE *d = vd, *n = vn; \
1956 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1957 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1958 } \
1959 clear_tail(d, oprsz, simd_maxsz(desc)); \
1960}
1961
1962DO_SLI(gvec_sli_b, uint8_t)
1963DO_SLI(gvec_sli_h, uint16_t)
1964DO_SLI(gvec_sli_s, uint32_t)
1965DO_SLI(gvec_sli_d, uint64_t)
1966
1967#undef DO_SLI
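
/*
 * Worked example with 8-bit lanes and shift = 4: SRI keeps the top four
 * bits of the destination and inserts the top four bits of the source
 * at the bottom, so d = 0xab, n = 0xcd gives 0xac; SLI keeps the bottom
 * four bits of the destination and inserts the bottom four bits of the
 * source at the top, giving 0xdb for the same inputs.
 */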
1968
Richard Hendersona4e943a2019-02-28 10:55:16 +00001969/*
1970 * Convert float16 to float32, raising no exceptions and
1971 * preserving exceptional values, including SNaN.
1972 * This is effectively an unpack+repack operation.
1973 */
1974static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1975{
1976 const int f16_bias = 15;
1977 const int f32_bias = 127;
1978 uint32_t sign = extract32(f16, 15, 1);
1979 uint32_t exp = extract32(f16, 10, 5);
1980 uint32_t frac = extract32(f16, 0, 10);
1981
1982 if (exp == 0x1f) {
1983 /* Inf or NaN */
1984 exp = 0xff;
1985 } else if (exp == 0) {
1986 /* Zero or denormal. */
1987 if (frac != 0) {
1988 if (fz16) {
1989 frac = 0;
1990 } else {
1991 /*
1992 * Denormal; these are all normal float32.
1993 * Shift the fraction so that the msb is at bit 11,
1994 * then remove bit 11 as the implicit bit of the
1995 * normalized float32. Note that we still go through
1996 * the shift for normal numbers below, to put the
1997 * float32 fraction at the right place.
1998 */
1999 int shift = clz32(frac) - 21;
2000 frac = (frac << shift) & 0x3ff;
2001 exp = f32_bias - f16_bias - shift + 1;
2002 }
2003 }
2004 } else {
2005 /* Normal number; adjust the bias. */
2006 exp += f32_bias - f16_bias;
2007 }
2008 sign <<= 31;
2009 exp <<= 23;
2010 frac <<= 23 - 10;
2011
2012 return sign | exp | frac;
2013}
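
/*
 * For example, float16 1.0 (0x3c00) has sign 0, exponent 15 and a zero
 * fraction; rebiasing gives exponent 127 and the result 0x3f800000,
 * i.e. float32 1.0.  Because the fraction bits are copied verbatim, a
 * signalling NaN stays signalling and no exception flags are raised.
 */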
2014
2015static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2016{
2017 /*
2018 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2019 * Load the 2nd qword iff is_q & is_2.
2020 * Shift to the 2nd dword iff !is_q & is_2.
2021 * For !is_q & !is_2, the upper bits of the result are garbage.
2022 */
2023 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2024}
2025
2026/*
2027 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2028 * as there is not yet SVE versions that might use blocking.
2029 */
2030
2031static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2032 uint32_t desc, bool fz16)
2033{
2034 intptr_t i, oprsz = simd_oprsz(desc);
2035 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2036 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2037 int is_q = oprsz == 16;
2038 uint64_t n_4, m_4;
2039
2040 /* Pre-load all of the f16 data, avoiding overlap issues. */
2041 n_4 = load4_f16(vn, is_q, is_2);
2042 m_4 = load4_f16(vm, is_q, is_2);
2043
2044 /* Negate all inputs for FMLSL at once. */
2045 if (is_s) {
2046 n_4 ^= 0x8000800080008000ull;
2047 }
2048
2049 for (i = 0; i < oprsz / 4; i++) {
2050 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2051 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2052 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2053 }
2054 clear_tail(d, oprsz, simd_maxsz(desc));
2055}
2056
2057void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2058 void *venv, uint32_t desc)
2059{
2060 CPUARMState *env = venv;
2061 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2062 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2063}
2064
2065void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2066 void *venv, uint32_t desc)
2067{
2068 CPUARMState *env = venv;
2069 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2070 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2071}
2072
Stephen Long50d102b2021-05-24 18:03:48 -07002073void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2074 void *venv, uint32_t desc)
2075{
2076 intptr_t i, oprsz = simd_oprsz(desc);
2077 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2078 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2079 CPUARMState *env = venv;
2080 float_status *status = &env->vfp.fp_status;
2081 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2082
2083 for (i = 0; i < oprsz; i += sizeof(float32)) {
2084 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2085 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2086 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2087 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2088 float32 aa = *(float32 *)(va + H1_4(i));
2089
2090 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2091 }
2092}
2093
Richard Hendersona4e943a2019-02-28 10:55:16 +00002094static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2095 uint32_t desc, bool fz16)
2096{
2097 intptr_t i, oprsz = simd_oprsz(desc);
2098 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2099 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2100 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2101 int is_q = oprsz == 16;
2102 uint64_t n_4;
2103 float32 m_1;
2104
2105 /* Pre-load all of the f16 data, avoiding overlap issues. */
2106 n_4 = load4_f16(vn, is_q, is_2);
2107
2108 /* Negate all inputs for FMLSL at once. */
2109 if (is_s) {
2110 n_4 ^= 0x8000800080008000ull;
2111 }
2112
2113 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2114
2115 for (i = 0; i < oprsz / 4; i++) {
2116 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2117 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2118 }
2119 clear_tail(d, oprsz, simd_maxsz(desc));
2120}
2121
2122void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2123 void *venv, uint32_t desc)
2124{
2125 CPUARMState *env = venv;
2126 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2127 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2128}
2129
2130void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2131 void *venv, uint32_t desc)
2132{
2133 CPUARMState *env = venv;
2134 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2135 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2136}
Richard Henderson87b74e82020-02-16 13:42:29 -08002137
Stephen Long50d102b2021-05-24 18:03:48 -07002138void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2139 void *venv, uint32_t desc)
2140{
2141 intptr_t i, j, oprsz = simd_oprsz(desc);
2142 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2143 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2144 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2145 CPUARMState *env = venv;
2146 float_status *status = &env->vfp.fp_status;
2147 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2148
2149 for (i = 0; i < oprsz; i += 16) {
2150 float16 mm_16 = *(float16 *)(vm + i + idx);
2151 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2152
2153 for (j = 0; j < 16; j += sizeof(float32)) {
2154 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2155 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2156 float32 aa = *(float32 *)(va + H1_4(i + j));
2157
2158 *(float32 *)(vd + H1_4(i + j)) =
2159 float32_muladd(nn, mm, aa, 0, status);
2160 }
2161 }
2162}
2163
Richard Henderson87b74e82020-02-16 13:42:29 -08002164void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2165{
2166 intptr_t i, opr_sz = simd_oprsz(desc);
2167 int8_t *d = vd, *n = vn, *m = vm;
2168
2169 for (i = 0; i < opr_sz; ++i) {
2170 int8_t mm = m[i];
2171 int8_t nn = n[i];
2172 int8_t res = 0;
2173 if (mm >= 0) {
2174 if (mm < 8) {
2175 res = nn << mm;
2176 }
2177 } else {
2178 res = nn >> (mm > -8 ? -mm : 7);
2179 }
2180 d[i] = res;
2181 }
2182 clear_tail(d, opr_sz, simd_maxsz(desc));
2183}
2184
2185void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2186{
2187 intptr_t i, opr_sz = simd_oprsz(desc);
2188 int16_t *d = vd, *n = vn, *m = vm;
2189
2190 for (i = 0; i < opr_sz / 2; ++i) {
2191 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2192 int16_t nn = n[i];
2193 int16_t res = 0;
2194 if (mm >= 0) {
2195 if (mm < 16) {
2196 res = nn << mm;
2197 }
2198 } else {
2199 res = nn >> (mm > -16 ? -mm : 15);
2200 }
2201 d[i] = res;
2202 }
2203 clear_tail(d, opr_sz, simd_maxsz(desc));
2204}
2205
2206void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2207{
2208 intptr_t i, opr_sz = simd_oprsz(desc);
2209 uint8_t *d = vd, *n = vn, *m = vm;
2210
2211 for (i = 0; i < opr_sz; ++i) {
2212 int8_t mm = m[i];
2213 uint8_t nn = n[i];
2214 uint8_t res = 0;
2215 if (mm >= 0) {
2216 if (mm < 8) {
2217 res = nn << mm;
2218 }
2219 } else {
2220 if (mm > -8) {
2221 res = nn >> -mm;
2222 }
2223 }
2224 d[i] = res;
2225 }
2226 clear_tail(d, opr_sz, simd_maxsz(desc));
2227}
2228
2229void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2230{
2231 intptr_t i, opr_sz = simd_oprsz(desc);
2232 uint16_t *d = vd, *n = vn, *m = vm;
2233
2234 for (i = 0; i < opr_sz / 2; ++i) {
2235 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2236 uint16_t nn = n[i];
2237 uint16_t res = 0;
2238 if (mm >= 0) {
2239 if (mm < 16) {
2240 res = nn << mm;
2241 }
2242 } else {
2243 if (mm > -16) {
2244 res = nn >> -mm;
2245 }
2246 }
2247 d[i] = res;
2248 }
2249 clear_tail(d, opr_sz, simd_maxsz(desc));
2250}
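
/*
 * The shift operand is interpreted as a signed byte, so a negative
 * value shifts right: sshl of the byte -32 by -3 is an arithmetic shift
 * right by 3, giving -4, and by -10 the count saturates at 7, giving
 * -1.  The unsigned variants instead produce 0 once the right shift
 * reaches the element width.
 */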
Richard Hendersona21bb782020-02-16 13:42:30 -08002251
2252/*
2253 * 8x8->8 polynomial multiply.
2254 *
2255 * Polynomial multiplication is like integer multiplication except the
2256 * partial products are XORed, not added.
2257 *
2258 * TODO: expose this as a generic vector operation, as it is a common
2259 * crypto building block.
2260 */
2261void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2262{
Richard Henderson8e3da4c2023-07-10 16:07:57 +01002263 intptr_t i, opr_sz = simd_oprsz(desc);
Richard Hendersona21bb782020-02-16 13:42:30 -08002264 uint64_t *d = vd, *n = vn, *m = vm;
2265
2266 for (i = 0; i < opr_sz / 8; ++i) {
Richard Henderson8e3da4c2023-07-10 16:07:57 +01002267 d[i] = clmul_8x8_low(n[i], m[i]);
Richard Hendersona21bb782020-02-16 13:42:30 -08002268 }
2269 clear_tail(d, opr_sz, simd_maxsz(desc));
2270}
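
/*
 * For example, the carry-less product of 0x03 and 0x05 is
 * 0x05 ^ (0x05 << 1) = 0x0f: each set bit of one operand contributes a
 * shifted copy of the other, combined with XOR rather than addition.
 * clmul_8x8_low performs eight such 8x8->8 multiplies in parallel, one
 * per byte of the 64-bit lane.
 */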
Richard Hendersonb9ed5102020-02-16 13:42:31 -08002271
2272/*
2273 * 64x64->128 polynomial multiply.
2274 * Because the lanes are not accessed in strict columns,
2275 * this probably cannot be turned into a generic helper.
2276 */
2277void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2278{
Richard Hendersona50cfdf2023-07-11 10:13:45 +01002279 intptr_t i, opr_sz = simd_oprsz(desc);
Richard Hendersonb9ed5102020-02-16 13:42:31 -08002280 intptr_t hi = simd_data(desc);
2281 uint64_t *d = vd, *n = vn, *m = vm;
2282
2283 for (i = 0; i < opr_sz / 8; i += 2) {
Richard Hendersona50cfdf2023-07-11 10:13:45 +01002284 Int128 r = clmul_64(n[i + hi], m[i + hi]);
2285 d[i] = int128_getlo(r);
2286 d[i + 1] = int128_gethi(r);
Richard Hendersonb9ed5102020-02-16 13:42:31 -08002287 }
2288 clear_tail(d, opr_sz, simd_maxsz(desc));
2289}
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002290
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002291void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2292{
2293 int hi = simd_data(desc);
2294 uint64_t *d = vd, *n = vn, *m = vm;
2295 uint64_t nn = n[hi], mm = m[hi];
2296
Richard Henderson8e3da4c2023-07-10 16:07:57 +01002297 d[0] = clmul_8x4_packed(nn, mm);
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002298 nn >>= 32;
2299 mm >>= 32;
Richard Henderson8e3da4c2023-07-10 16:07:57 +01002300 d[1] = clmul_8x4_packed(nn, mm);
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002301
2302 clear_tail(d, 16, simd_maxsz(desc));
2303}
2304
2305#ifdef TARGET_AARCH64
2306void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2307{
2308 int shift = simd_data(desc) * 8;
2309 intptr_t i, opr_sz = simd_oprsz(desc);
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311
2312 for (i = 0; i < opr_sz / 8; ++i) {
Richard Henderson8e3da4c2023-07-10 16:07:57 +01002313 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002314 }
2315}
Richard Hendersone3a56132021-05-24 18:02:40 -07002316
Richard Hendersone3a56132021-05-24 18:02:40 -07002317void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2318{
2319 intptr_t sel = H4(simd_data(desc));
2320 intptr_t i, opr_sz = simd_oprsz(desc);
2321 uint32_t *n = vn, *m = vm;
2322 uint64_t *d = vd;
2323
2324 for (i = 0; i < opr_sz / 8; ++i) {
Richard Hendersonbae25f62023-07-11 09:56:41 +01002325 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
Richard Hendersone3a56132021-05-24 18:02:40 -07002326 }
2327}
Richard Hendersone7e96fc2020-02-16 13:42:32 -08002328#endif
Richard Henderson6b375d32020-04-18 09:28:08 -07002329
2330#define DO_CMP0(NAME, TYPE, OP) \
2331void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2332{ \
2333 intptr_t i, opr_sz = simd_oprsz(desc); \
2334 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2335 TYPE nn = *(TYPE *)(vn + i); \
2336 *(TYPE *)(vd + i) = -(nn OP 0); \
2337 } \
2338 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2339}
2340
2341DO_CMP0(gvec_ceq0_b, int8_t, ==)
2342DO_CMP0(gvec_clt0_b, int8_t, <)
2343DO_CMP0(gvec_cle0_b, int8_t, <=)
2344DO_CMP0(gvec_cgt0_b, int8_t, >)
2345DO_CMP0(gvec_cge0_b, int8_t, >=)
2346
2347DO_CMP0(gvec_ceq0_h, int16_t, ==)
2348DO_CMP0(gvec_clt0_h, int16_t, <)
2349DO_CMP0(gvec_cle0_h, int16_t, <=)
2350DO_CMP0(gvec_cgt0_h, int16_t, >)
2351DO_CMP0(gvec_cge0_h, int16_t, >=)
2352
2353#undef DO_CMP0
Richard Henderson50c160d2020-05-13 09:32:44 -07002354
2355#define DO_ABD(NAME, TYPE) \
2356void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2357{ \
2358 intptr_t i, opr_sz = simd_oprsz(desc); \
2359 TYPE *d = vd, *n = vn, *m = vm; \
2360 \
2361 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2362 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2363 } \
2364 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2365}
2366
2367DO_ABD(gvec_sabd_b, int8_t)
2368DO_ABD(gvec_sabd_h, int16_t)
2369DO_ABD(gvec_sabd_s, int32_t)
2370DO_ABD(gvec_sabd_d, int64_t)
2371
2372DO_ABD(gvec_uabd_b, uint8_t)
2373DO_ABD(gvec_uabd_h, uint16_t)
2374DO_ABD(gvec_uabd_s, uint32_t)
2375DO_ABD(gvec_uabd_d, uint64_t)
2376
2377#undef DO_ABD
Richard Hendersoncfdb2c02020-05-13 09:32:45 -07002378
2379#define DO_ABA(NAME, TYPE) \
2380void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2381{ \
2382 intptr_t i, opr_sz = simd_oprsz(desc); \
2383 TYPE *d = vd, *n = vn, *m = vm; \
2384 \
2385 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2386 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2387 } \
2388 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2389}
2390
2391DO_ABA(gvec_saba_b, int8_t)
2392DO_ABA(gvec_saba_h, int16_t)
2393DO_ABA(gvec_saba_s, int32_t)
2394DO_ABA(gvec_saba_d, int64_t)
2395
2396DO_ABA(gvec_uaba_b, uint8_t)
2397DO_ABA(gvec_uaba_h, uint16_t)
2398DO_ABA(gvec_uaba_s, uint32_t)
2399DO_ABA(gvec_uaba_d, uint64_t)
2400
2401#undef DO_ABA
Peter Maydell1dc587e2020-08-28 19:33:44 +01002402
Richard Henderson57801ca2024-05-24 16:20:42 -07002403#define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2404void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2405{ \
2406 ARMVectorReg scratch; \
2407 intptr_t oprsz = simd_oprsz(desc); \
2408 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2409 TYPE *d = vd, *n = vn, *m = vm; \
2410 if (unlikely(d == m)) { \
2411 m = memcpy(&scratch, m, oprsz); \
2412 } \
2413 for (intptr_t i = 0; i < half; ++i) { \
2414 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \
2415 } \
2416 for (intptr_t i = 0; i < half; ++i) { \
2417 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \
2418 } \
2419 clear_tail(d, oprsz, simd_maxsz(desc)); \
2420}
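
/*
 * Pairwise layout, e.g. a 128-bit faddp with float32 lanes: the low
 * half of the result is { n[0]+n[1], n[2]+n[3] } and the high half is
 * { m[0]+m[1], m[2]+m[3] }.  The scratch copy guards against the second
 * pass reading elements of m that the first pass already overwrote when
 * d and m are the same register.
 */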
2421
2422DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2423DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2424DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2425
Richard Hendersona13f9fb2024-05-24 16:20:43 -07002426DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2427DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2428DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2429
2430DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2431DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2432DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2433
2434DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2435DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2436DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2437
2438DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2439DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2440DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2441
Richard Hendersona7e4eec2024-05-24 16:20:45 -07002442#undef DO_3OP_PAIR
2443
2444#define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2445void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2446{ \
2447 ARMVectorReg scratch; \
2448 intptr_t oprsz = simd_oprsz(desc); \
2449 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2450 TYPE *d = vd, *n = vn, *m = vm; \
2451 if (unlikely(d == m)) { \
2452 m = memcpy(&scratch, m, oprsz); \
2453 } \
2454 for (intptr_t i = 0; i < half; ++i) { \
2455 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \
2456 } \
2457 for (intptr_t i = 0; i < half; ++i) { \
2458 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \
2459 } \
2460 clear_tail(d, oprsz, simd_maxsz(desc)); \
2461}
2462
2463#define ADD(A, B) (A + B)
2464DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2465DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2466DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2467DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2468#undef ADD
2469
Richard Henderson28b54512024-05-24 16:20:47 -07002470DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2471DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2472DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2473
2474DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2475DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2476DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2477
2478DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2479DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2480DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2481
2482DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2483DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2484DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2485
Richard Hendersona7e4eec2024-05-24 16:20:45 -07002486#undef DO_3OP_PAIR
2487
Peter Maydell7b959c52020-08-28 19:33:46 +01002488#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2489 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2490 { \
2491 intptr_t i, oprsz = simd_oprsz(desc); \
2492 int shift = simd_data(desc); \
2493 TYPE *d = vd, *n = vn; \
2494 float_status *fpst = stat; \
2495 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2496 d[i] = FUNC(n[i], shift, fpst); \
2497 } \
2498 clear_tail(d, oprsz, simd_maxsz(desc)); \
2499 }
2500
2501DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2502DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2503DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2504DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
Peter Maydell24018cf2020-08-28 19:33:47 +01002505DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2506DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2507DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2508DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
Peter Maydell7b959c52020-08-28 19:33:46 +01002509
2510#undef DO_VCVT_FIXED
Peter Maydellca88a6e2020-08-28 19:33:48 +01002511
2512#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2513 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2514 { \
2515 float_status *fpst = stat; \
2516 intptr_t i, oprsz = simd_oprsz(desc); \
2517 uint32_t rmode = simd_data(desc); \
2518 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2519 TYPE *d = vd, *n = vn; \
2520 set_float_rounding_mode(rmode, fpst); \
2521 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2522 d[i] = FUNC(n[i], 0, fpst); \
2523 } \
2524 set_float_rounding_mode(prev_rmode, fpst); \
2525 clear_tail(d, oprsz, simd_maxsz(desc)); \
2526 }
2527
2528DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2529DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2530DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2531DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2532
2533#undef DO_VCVT_RMODE
Peter Maydell18725912020-08-28 19:33:49 +01002534
2535#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2536 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2537 { \
2538 float_status *fpst = stat; \
2539 intptr_t i, oprsz = simd_oprsz(desc); \
2540 uint32_t rmode = simd_data(desc); \
2541 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2542 TYPE *d = vd, *n = vn; \
2543 set_float_rounding_mode(rmode, fpst); \
2544 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2545 d[i] = FUNC(n[i], fpst); \
2546 } \
2547 set_float_rounding_mode(prev_rmode, fpst); \
2548 clear_tail(d, oprsz, simd_maxsz(desc)); \
2549 }
2550
2551DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2552DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2553
2554#undef DO_VRINT_RMODE
Richard Henderson519183d2021-02-24 15:05:32 -08002555
2556#ifdef TARGET_AARCH64
2557void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2558{
2559 const uint8_t *indices = vm;
2560 CPUARMState *env = venv;
2561 size_t oprsz = simd_oprsz(desc);
2562 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2563 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2564 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2565 union {
2566 uint8_t b[16];
2567 uint64_t d[2];
2568 } result;
2569
2570 /*
2571 * We must construct the final result in a temp, lest the output
2572 * overlap the input table. For TBL, begin with zero; for TBX,
2573 * begin with the original register contents. Note that we always
2574 * copy 16 bytes here to avoid an extra branch; clearing the high
2575 * bits of the register for oprsz == 8 is handled below.
2576 */
2577 if (is_tbx) {
2578 memcpy(&result, vd, 16);
2579 } else {
2580 memset(&result, 0, 16);
2581 }
2582
2583 for (size_t i = 0; i < oprsz; ++i) {
2584 uint32_t index = indices[H1(i)];
2585
2586 if (index < table_len) {
2587 /*
2588 * Convert index (a byte offset into the virtual table
2589 * which is a series of 128-bit vectors concatenated)
2590 * into the correct register element, bearing in mind
2591 * that the table can wrap around from V31 to V0.
2592 */
2593 const uint8_t *table = (const uint8_t *)
2594 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2595 result.b[H1(i)] = table[H1(index % 16)];
2596 }
2597 }
2598
2599 memcpy(vd, &result, 16);
2600 clear_tail(vd, oprsz, simd_maxsz(desc));
2601}
2602#endif
Richard Henderson5dad1ba2021-05-24 18:02:28 -07002603
2604/*
2605 * NxN -> N highpart multiply
2606 *
2607 * TODO: expose this as a generic vector operation.
2608 */
2609
2610void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2611{
2612 intptr_t i, opr_sz = simd_oprsz(desc);
2613 int8_t *d = vd, *n = vn, *m = vm;
2614
2615 for (i = 0; i < opr_sz; ++i) {
2616 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2617 }
2618 clear_tail(d, opr_sz, simd_maxsz(desc));
2619}
2620
2621void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2622{
2623 intptr_t i, opr_sz = simd_oprsz(desc);
2624 int16_t *d = vd, *n = vn, *m = vm;
2625
2626 for (i = 0; i < opr_sz / 2; ++i) {
2627 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2628 }
2629 clear_tail(d, opr_sz, simd_maxsz(desc));
2630}
2631
2632void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2633{
2634 intptr_t i, opr_sz = simd_oprsz(desc);
2635 int32_t *d = vd, *n = vn, *m = vm;
2636
2637 for (i = 0; i < opr_sz / 4; ++i) {
2638 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2639 }
2640 clear_tail(d, opr_sz, simd_maxsz(desc));
2641}
2642
2643void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2644{
2645 intptr_t i, opr_sz = simd_oprsz(desc);
2646 uint64_t *d = vd, *n = vn, *m = vm;
2647 uint64_t discard;
2648
2649 for (i = 0; i < opr_sz / 8; ++i) {
2650 muls64(&discard, &d[i], n[i], m[i]);
2651 }
2652 clear_tail(d, opr_sz, simd_maxsz(desc));
2653}
2654
2655void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2656{
2657 intptr_t i, opr_sz = simd_oprsz(desc);
2658 uint8_t *d = vd, *n = vn, *m = vm;
2659
2660 for (i = 0; i < opr_sz; ++i) {
2661 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2662 }
2663 clear_tail(d, opr_sz, simd_maxsz(desc));
2664}
2665
2666void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2667{
2668 intptr_t i, opr_sz = simd_oprsz(desc);
2669 uint16_t *d = vd, *n = vn, *m = vm;
2670
2671 for (i = 0; i < opr_sz / 2; ++i) {
2672 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2673 }
2674 clear_tail(d, opr_sz, simd_maxsz(desc));
2675}
2676
2677void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2678{
2679 intptr_t i, opr_sz = simd_oprsz(desc);
2680 uint32_t *d = vd, *n = vn, *m = vm;
2681
2682 for (i = 0; i < opr_sz / 4; ++i) {
2683 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2684 }
2685 clear_tail(d, opr_sz, simd_maxsz(desc));
2686}
2687
2688void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2689{
2690 intptr_t i, opr_sz = simd_oprsz(desc);
2691 uint64_t *d = vd, *n = vn, *m = vm;
2692 uint64_t discard;
2693
2694 for (i = 0; i < opr_sz / 8; ++i) {
2695 mulu64(&discard, &d[i], n[i], m[i]);
2696 }
2697 clear_tail(d, opr_sz, simd_maxsz(desc));
2698}
Richard Hendersone6eba6e2021-05-24 18:03:09 -07002699
2700void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2701{
2702 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703 int shr = simd_data(desc);
2704 uint64_t *d = vd, *n = vn, *m = vm;
2705
2706 for (i = 0; i < opr_sz; ++i) {
2707 d[i] = ror64(n[i] ^ m[i], shr);
2708 }
2709 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2710}
Richard Henderson2323c5f2021-05-24 18:03:56 -07002711
2712/*
2713 * Integer matrix-multiply accumulate
2714 */
2715
2716static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2717{
2718 int8_t *n = vn, *m = vm;
2719
2720 for (intptr_t k = 0; k < 8; ++k) {
2721 sum += n[H1(k)] * m[H1(k)];
2722 }
2723 return sum;
2724}
2725
2726static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2727{
2728 uint8_t *n = vn, *m = vm;
2729
2730 for (intptr_t k = 0; k < 8; ++k) {
2731 sum += n[H1(k)] * m[H1(k)];
2732 }
2733 return sum;
2734}
2735
2736static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2737{
2738 uint8_t *n = vn;
2739 int8_t *m = vm;
2740
2741 for (intptr_t k = 0; k < 8; ++k) {
2742 sum += n[H1(k)] * m[H1(k)];
2743 }
2744 return sum;
2745}
2746
2747static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2748 uint32_t (*inner_loop)(uint32_t, void *, void *))
2749{
2750 intptr_t seg, opr_sz = simd_oprsz(desc);
2751
2752 for (seg = 0; seg < opr_sz; seg += 16) {
2753 uint32_t *d = vd + seg;
2754 uint32_t *a = va + seg;
2755 uint32_t sum0, sum1, sum2, sum3;
2756
2757 /*
2758 * Process the entire segment at once, writing back the
2759 * results only after we've consumed all of the inputs.
2760 *
Richard Henderson81266a12021-05-25 15:58:13 -07002761 * Key to indices by column:
Richard Henderson2323c5f2021-05-24 18:03:56 -07002762 * i j i j
2763 */
2764 sum0 = a[H4(0 + 0)];
2765 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2766 sum1 = a[H4(0 + 1)];
2767 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2768 sum2 = a[H4(2 + 0)];
2769 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2770 sum3 = a[H4(2 + 1)];
2771 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2772
2773 d[H4(0)] = sum0;
2774 d[H4(1)] = sum1;
2775 d[H4(2)] = sum2;
2776 d[H4(3)] = sum3;
2777 }
2778 clear_tail(vd, opr_sz, simd_maxsz(desc));
2779}
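
/*
 * In other words, each 16-byte segment holds a pair of 2x8 byte
 * matrices: row i of N is the eight bytes at vn + seg + 8 * i, row j of
 * M likewise in vm, and the 2x2 result is
 * d[i][j] = a[i][j] + dot(N row i, M row j).
 */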
2780
2781#define DO_MMLA_B(NAME, INNER) \
2782 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2783 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2784
2785DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2786DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2787DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002788
2789/*
2790 * BFloat16 Dot Product
2791 */
2792
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002793bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002794{
Peter Maydell0e185012024-09-03 17:22:16 +01002795 /*
2796 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2797 * For EBF = 0, we ignore the FPCR bits which determine rounding
2798 * mode and denormal-flushing, and we do unfused multiplies and
2799 * additions with intermediate rounding of all products and sums.
2800 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2801 * and we perform a fused two-way sum-of-products without intermediate
2802 * rounding of the products.
2803 * In either case, we don't set fp exception flags.
2804 *
2805 * EBF is AArch64 only, so even if it's set in the FPCR it has
2806 * no effect on AArch32 instructions.
2807 */
2808 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002809 *statusp = (float_status){
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002810 .tininess_before_rounding = float_tininess_before_rounding,
2811 .float_rounding_mode = float_round_to_odd_inf,
2812 .flush_to_zero = true,
2813 .flush_inputs_to_zero = true,
2814 .default_nan_mode = true,
2815 };
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002816
Peter Maydell0e185012024-09-03 17:22:16 +01002817 if (ebf) {
2818 float_status *fpst = &env->vfp.fp_status;
2819 set_flush_to_zero(get_flush_to_zero(fpst), statusp);
2820 set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp);
2821 set_float_rounding_mode(get_float_rounding_mode(fpst), statusp);
2822
2823 /* EBF=1 needs to do a step with round-to-odd semantics */
2824 *oddstatusp = *statusp;
2825 set_float_rounding_mode(float_round_to_odd, oddstatusp);
2826 }
2827
2828 return ebf;
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002829}
2830
2831float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2832{
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002833 float32 t1, t2;
2834
2835 /*
2836 * Extract each BFloat16 from the element pair, and shift
2837 * them such that they become float32.
2838 */
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002839 t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2840 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2841 t1 = float32_add(t1, t2, fpst);
2842 t1 = float32_add(sum, t1, fpst);
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002843
2844 return t1;
2845}
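
/*
 * A bfloat16 value is the high half of the float32 with the same value,
 * so shifting a lane left by 16 reinterprets it exactly: bfloat16
 * 0x3f80 (1.0) becomes 0x3f800000.  In this EBF=0 form the two products
 * and the two additions are each rounded separately, using the fixed
 * flush-to-zero, default-NaN, round-to-odd-inf status that is_ebf()
 * sets up above.
 */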
2846
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002847float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2848 float_status *fpst, float_status *fpst_odd)
2849{
Peter Maydell0e185012024-09-03 17:22:16 +01002850 /*
2851 * Compare f16_dotadd() in sme_helper.c, but here we have
2852 * bfloat16 inputs. In particular that means that we do not
2853 * want the FPCR.FZ16 flush semantics, so we use the normal
2854 * float_status for the input handling here.
2855 */
2856 float64 e1r = float32_to_float64(e1 << 16, fpst);
2857 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2858 float64 e2r = float32_to_float64(e2 << 16, fpst);
2859 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2860 float64 t64;
2861 float32 t32;
2862
2863 /*
2864 * The ARM pseudocode function FPDot performs both multiplies
2865 * and the add with a single rounding operation. Emulate this
2866 * by performing the first multiply in round-to-odd, then doing
2867 * the second multiply as fused multiply-add, and rounding to
2868 * float32 all in one step.
2869 */
2870 t64 = float64_mul(e1r, e2r, fpst_odd);
2871 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2872
2873 /* This conversion is exact, because we've already rounded. */
2874 t32 = float64_to_float32(t64, fpst);
2875
2876 /* The final accumulation step is not fused. */
2877 return float32_add(sum, t32, fpst);
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002878}
2879
Peter Maydell75a67842024-09-03 17:22:14 +01002880void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2881 CPUARMState *env, uint32_t desc)
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002882{
2883 intptr_t i, opr_sz = simd_oprsz(desc);
2884 float32 *d = vd, *a = va;
2885 uint32_t *n = vn, *m = vm;
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002886 float_status fpst, fpst_odd;
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002887
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002888 if (is_ebf(env, &fpst, &fpst_odd)) {
2889 for (i = 0; i < opr_sz / 4; ++i) {
2890 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2891 }
2892 } else {
2893 for (i = 0; i < opr_sz / 4; ++i) {
2894 d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2895 }
Richard Hendersoncb8657f2021-05-25 15:58:11 -07002896 }
2897 clear_tail(d, opr_sz, simd_maxsz(desc));
2898}
Richard Henderson83914472021-05-25 15:58:12 -07002899
2900void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
Peter Maydellc8d644b2024-09-03 17:22:15 +01002901 void *va, CPUARMState *env, uint32_t desc)
Richard Henderson83914472021-05-25 15:58:12 -07002902{
2903 intptr_t i, j, opr_sz = simd_oprsz(desc);
2904 intptr_t index = simd_data(desc);
2905 intptr_t elements = opr_sz / 4;
2906 intptr_t eltspersegment = MIN(16 / 4, elements);
2907 float32 *d = vd, *a = va;
2908 uint32_t *n = vn, *m = vm;
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002909 float_status fpst, fpst_odd;
Richard Henderson83914472021-05-25 15:58:12 -07002910
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002911 if (is_ebf(env, &fpst, &fpst_odd)) {
2912 for (i = 0; i < elements; i += eltspersegment) {
2913 uint32_t m_idx = m[i + H4(index)];
Richard Henderson83914472021-05-25 15:58:12 -07002914
Peter Maydell09b0d9e2024-09-03 17:22:15 +01002915 for (j = i; j < i + eltspersegment; j++) {
2916 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2917 }
2918 }
2919 } else {
2920 for (i = 0; i < elements; i += eltspersegment) {
2921 uint32_t m_idx = m[i + H4(index)];
2922
2923 for (j = i; j < i + eltspersegment; j++) {
2924 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2925 }
Richard Henderson83914472021-05-25 15:58:12 -07002926 }
2927 }
2928 clear_tail(d, opr_sz, simd_maxsz(desc));
2929}
Richard Henderson81266a12021-05-25 15:58:13 -07002930
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j           i   k           j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j           i   k           j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

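/*
 * BFMLALB/BFMLALT: widening bfloat16 multiply-add.  'sel' picks the even
 * (bottom) or odd (top) bfloat16 elements; widening to float32 is just a
 * left shift by 16, since bfloat16 is the high half of a float32.
 */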
void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

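/*
 * BFMLALB/BFMLALT (by element): as for gvec_bfmlal above, but the
 * multiplier is the single bfloat16 element chosen by 'index' within
 * each 128-bit segment.
 */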
void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

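/*
 * SCLAMP/UCLAMP: clamp each element of 'a' to the range [n, m],
 * i.e. d = MIN(MAX(a, n), m), with signedness given by the element type.
 */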
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)   \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)