Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-migration-20210726a...
[qemu/armbru.git] / target / arm / vec_helper.c
blob034f6b84f78ad12b077fb52b7e891d3986fe2600
1 /*
2 * ARM AdvSIMD / SVE Vector Operations
4 * Copyright (c) 2018 Linaro
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "vec_internal.h"
29 * Data for expanding active predicate bits to bytes, for byte elements.
31 * for (i = 0; i < 256; ++i) {
32 * unsigned long m = 0;
33 * for (j = 0; j < 8; j++) {
34 * if ((i >> j) & 1) {
35 * m |= 0xfful << (j << 3);
36 * }
37 * }
38 * printf("0x%016lx,\n", m);
39 * }
/*
 * Table mapping an 8-bit predicate value to a 64-bit mask in which each
 * byte lane is 0xff iff the corresponding predicate bit is set.  Generated
 * with macros rather than spelled out literally; the values are identical
 * to the hand-written table (see the generator loop in the comment above).
 */
#define EXP_PRED_BIT(I, J) \
    ((((I) >> (J)) & 1) ? 0xffull << ((J) * 8) : 0)
#define EXP_PRED_B(I) \
    (EXP_PRED_BIT(I, 0) | EXP_PRED_BIT(I, 1) | \
     EXP_PRED_BIT(I, 2) | EXP_PRED_BIT(I, 3) | \
     EXP_PRED_BIT(I, 4) | EXP_PRED_BIT(I, 5) | \
     EXP_PRED_BIT(I, 6) | EXP_PRED_BIT(I, 7))
#define EXP_PRED_B4(I) \
    EXP_PRED_B(I), EXP_PRED_B((I) + 1), EXP_PRED_B((I) + 2), EXP_PRED_B((I) + 3)
#define EXP_PRED_B16(I) \
    EXP_PRED_B4(I), EXP_PRED_B4((I) + 4), \
    EXP_PRED_B4((I) + 8), EXP_PRED_B4((I) + 12)
#define EXP_PRED_B64(I) \
    EXP_PRED_B16(I), EXP_PRED_B16((I) + 16), \
    EXP_PRED_B16((I) + 32), EXP_PRED_B16((I) + 48)

const uint64_t expand_pred_b_data[256] = {
    EXP_PRED_B64(0), EXP_PRED_B64(64), EXP_PRED_B64(128), EXP_PRED_B64(192)
};

#undef EXP_PRED_BIT
#undef EXP_PRED_B
#undef EXP_PRED_B4
#undef EXP_PRED_B16
#undef EXP_PRED_B64
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t product = (int32_t)src1 * src2;

    if (neg) {
        product = -product;
    }
    product += ((int32_t)src3 << 7) + (round << 6);
    product >>= 7;

    /* Saturate to the signed 8-bit range. */
    if (product != (int8_t)product) {
        product = product < 0 ? INT8_MIN : INT8_MAX;
    }
    return product;
}
152 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
153 void *va, uint32_t desc)
155 intptr_t i, opr_sz = simd_oprsz(desc);
156 int8_t *d = vd, *n = vn, *m = vm, *a = va;
158 for (i = 0; i < opr_sz; ++i) {
159 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
163 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
164 void *va, uint32_t desc)
166 intptr_t i, opr_sz = simd_oprsz(desc);
167 int8_t *d = vd, *n = vn, *m = vm, *a = va;
169 for (i = 0; i < opr_sz; ++i) {
170 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
174 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
176 intptr_t i, opr_sz = simd_oprsz(desc);
177 int8_t *d = vd, *n = vn, *m = vm;
179 for (i = 0; i < opr_sz; ++i) {
180 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
184 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
186 intptr_t i, opr_sz = simd_oprsz(desc);
187 int8_t *d = vd, *n = vn, *m = vm;
189 for (i = 0; i < opr_sz; ++i) {
190 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t acc = (int32_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += ((int32_t)src3 << 15) + (round << 14);
    acc >>= 15;

    /* On overflow, record saturation and clamp to the 16-bit range. */
    if (acc != (int16_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT16_MIN : INT16_MAX;
    }
    return acc;
}
213 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
214 uint32_t src2, uint32_t src3)
216 uint32_t *sat = &env->vfp.qc[0];
217 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
218 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
219 false, true, sat);
220 return deposit32(e1, 16, 16, e2);
223 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
224 void *vq, uint32_t desc)
226 uintptr_t opr_sz = simd_oprsz(desc);
227 int16_t *d = vd;
228 int16_t *n = vn;
229 int16_t *m = vm;
230 uintptr_t i;
232 for (i = 0; i < opr_sz / 2; ++i) {
233 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
235 clear_tail(d, opr_sz, simd_maxsz(desc));
238 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
239 uint32_t src2, uint32_t src3)
241 uint32_t *sat = &env->vfp.qc[0];
242 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
243 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
244 true, true, sat);
245 return deposit32(e1, 16, 16, e2);
248 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
249 void *vq, uint32_t desc)
251 uintptr_t opr_sz = simd_oprsz(desc);
252 int16_t *d = vd;
253 int16_t *n = vn;
254 int16_t *m = vm;
255 uintptr_t i;
257 for (i = 0; i < opr_sz / 2; ++i) {
258 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
260 clear_tail(d, opr_sz, simd_maxsz(desc));
263 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
264 void *vq, uint32_t desc)
266 intptr_t i, opr_sz = simd_oprsz(desc);
267 int16_t *d = vd, *n = vn, *m = vm;
269 for (i = 0; i < opr_sz / 2; ++i) {
270 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
272 clear_tail(d, opr_sz, simd_maxsz(desc));
275 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
278 intptr_t i, opr_sz = simd_oprsz(desc);
279 int16_t *d = vd, *n = vn, *m = vm;
281 for (i = 0; i < opr_sz / 2; ++i) {
282 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
284 clear_tail(d, opr_sz, simd_maxsz(desc));
287 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
288 void *va, uint32_t desc)
290 intptr_t i, opr_sz = simd_oprsz(desc);
291 int16_t *d = vd, *n = vn, *m = vm, *a = va;
292 uint32_t discard;
294 for (i = 0; i < opr_sz / 2; ++i) {
295 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
299 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
300 void *va, uint32_t desc)
302 intptr_t i, opr_sz = simd_oprsz(desc);
303 int16_t *d = vd, *n = vn, *m = vm, *a = va;
304 uint32_t discard;
306 for (i = 0; i < opr_sz / 2; ++i) {
307 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
311 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
313 intptr_t i, opr_sz = simd_oprsz(desc);
314 int16_t *d = vd, *n = vn, *m = vm;
315 uint32_t discard;
317 for (i = 0; i < opr_sz / 2; ++i) {
318 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
322 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
324 intptr_t i, opr_sz = simd_oprsz(desc);
325 int16_t *d = vd, *n = vn, *m = vm;
326 uint32_t discard;
328 for (i = 0; i < opr_sz / 2; ++i) {
329 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
333 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
335 intptr_t i, j, opr_sz = simd_oprsz(desc);
336 int idx = simd_data(desc);
337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338 uint32_t discard;
340 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
341 int16_t mm = m[i];
342 for (j = 0; j < 16 / 2; ++j) {
343 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
348 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
350 intptr_t i, j, opr_sz = simd_oprsz(desc);
351 int idx = simd_data(desc);
352 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
353 uint32_t discard;
355 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
356 int16_t mm = m[i];
357 for (j = 0; j < 16 / 2; ++j) {
358 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t acc = (int64_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += ((int64_t)src3 << 31) + (round << 30);
    acc >>= 31;

    /* On overflow, record saturation and clamp to the 32-bit range. */
    if (acc != (int32_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT32_MIN : INT32_MAX;
    }
    return acc;
}
382 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
383 int32_t src2, int32_t src3)
385 uint32_t *sat = &env->vfp.qc[0];
386 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
389 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
390 void *vq, uint32_t desc)
392 uintptr_t opr_sz = simd_oprsz(desc);
393 int32_t *d = vd;
394 int32_t *n = vn;
395 int32_t *m = vm;
396 uintptr_t i;
398 for (i = 0; i < opr_sz / 4; ++i) {
399 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
401 clear_tail(d, opr_sz, simd_maxsz(desc));
404 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
405 int32_t src2, int32_t src3)
407 uint32_t *sat = &env->vfp.qc[0];
408 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
411 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
412 void *vq, uint32_t desc)
414 uintptr_t opr_sz = simd_oprsz(desc);
415 int32_t *d = vd;
416 int32_t *n = vn;
417 int32_t *m = vm;
418 uintptr_t i;
420 for (i = 0; i < opr_sz / 4; ++i) {
421 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
423 clear_tail(d, opr_sz, simd_maxsz(desc));
426 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
427 void *vq, uint32_t desc)
429 intptr_t i, opr_sz = simd_oprsz(desc);
430 int32_t *d = vd, *n = vn, *m = vm;
432 for (i = 0; i < opr_sz / 4; ++i) {
433 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
435 clear_tail(d, opr_sz, simd_maxsz(desc));
438 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
439 void *vq, uint32_t desc)
441 intptr_t i, opr_sz = simd_oprsz(desc);
442 int32_t *d = vd, *n = vn, *m = vm;
444 for (i = 0; i < opr_sz / 4; ++i) {
445 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
447 clear_tail(d, opr_sz, simd_maxsz(desc));
450 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
451 void *va, uint32_t desc)
453 intptr_t i, opr_sz = simd_oprsz(desc);
454 int32_t *d = vd, *n = vn, *m = vm, *a = va;
455 uint32_t discard;
457 for (i = 0; i < opr_sz / 4; ++i) {
458 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
462 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
463 void *va, uint32_t desc)
465 intptr_t i, opr_sz = simd_oprsz(desc);
466 int32_t *d = vd, *n = vn, *m = vm, *a = va;
467 uint32_t discard;
469 for (i = 0; i < opr_sz / 4; ++i) {
470 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
474 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
476 intptr_t i, opr_sz = simd_oprsz(desc);
477 int32_t *d = vd, *n = vn, *m = vm;
478 uint32_t discard;
480 for (i = 0; i < opr_sz / 4; ++i) {
481 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
485 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
487 intptr_t i, opr_sz = simd_oprsz(desc);
488 int32_t *d = vd, *n = vn, *m = vm;
489 uint32_t discard;
491 for (i = 0; i < opr_sz / 4; ++i) {
492 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
496 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
498 intptr_t i, j, opr_sz = simd_oprsz(desc);
499 int idx = simd_data(desc);
500 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
501 uint32_t discard;
503 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
504 int32_t mm = m[i];
505 for (j = 0; j < 16 / 4; ++j) {
506 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
511 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
513 intptr_t i, j, opr_sz = simd_oprsz(desc);
514 int idx = simd_data(desc);
515 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
516 uint32_t discard;
518 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
519 int32_t mm = m[i];
520 for (j = 0; j < 16 / 4; ++j) {
521 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
526 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
527 static int64_t do_sat128_d(Int128 r)
529 int64_t ls = int128_getlo(r);
530 int64_t hs = int128_gethi(r);
532 if (unlikely(hs != (ls >> 63))) {
533 return hs < 0 ? INT64_MIN : INT64_MAX;
535 return ls;
538 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
540 uint64_t l, h;
541 Int128 r, t;
543 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
544 muls64(&l, &h, m, n);
545 r = int128_make128(l, h);
546 if (neg) {
547 r = int128_neg(r);
549 if (a) {
550 t = int128_exts64(a);
551 t = int128_lshift(t, 63);
552 r = int128_add(r, t);
554 if (round) {
555 t = int128_exts64(1ll << 62);
556 r = int128_add(r, t);
558 r = int128_rshift(r, 63);
560 return do_sat128_d(r);
563 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
564 void *va, uint32_t desc)
566 intptr_t i, opr_sz = simd_oprsz(desc);
567 int64_t *d = vd, *n = vn, *m = vm, *a = va;
569 for (i = 0; i < opr_sz / 8; ++i) {
570 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
574 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
575 void *va, uint32_t desc)
577 intptr_t i, opr_sz = simd_oprsz(desc);
578 int64_t *d = vd, *n = vn, *m = vm, *a = va;
580 for (i = 0; i < opr_sz / 8; ++i) {
581 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
585 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
587 intptr_t i, opr_sz = simd_oprsz(desc);
588 int64_t *d = vd, *n = vn, *m = vm;
590 for (i = 0; i < opr_sz / 8; ++i) {
591 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
595 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
597 intptr_t i, opr_sz = simd_oprsz(desc);
598 int64_t *d = vd, *n = vn, *m = vm;
600 for (i = 0; i < opr_sz / 8; ++i) {
601 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
605 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
607 intptr_t i, j, opr_sz = simd_oprsz(desc);
608 int idx = simd_data(desc);
609 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
611 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
612 int64_t mm = m[i];
613 for (j = 0; j < 16 / 8; ++j) {
614 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
619 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
621 intptr_t i, j, opr_sz = simd_oprsz(desc);
622 int idx = simd_data(desc);
623 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
625 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
626 int64_t mm = m[i];
627 for (j = 0; j < 16 / 8; ++j) {
628 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
633 /* Integer 8 and 16-bit dot-product.
635 * Note that for the loops herein, host endianness does not matter
636 * with respect to the ordering of data within the quad-width lanes.
637 * All elements are treated equally, no matter where they are.
640 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
641 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
643 intptr_t i, opr_sz = simd_oprsz(desc); \
644 TYPED *d = vd, *a = va; \
645 TYPEN *n = vn; \
646 TYPEM *m = vm; \
647 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
648 d[i] = (a[i] + \
649 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
650 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
651 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
652 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
654 clear_tail(d, opr_sz, simd_maxsz(desc)); \
657 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
658 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
659 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
660 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
661 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
663 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
666 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
667 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
668 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
669 intptr_t index = simd_data(desc); \
670 TYPED *d = vd, *a = va; \
671 TYPEN *n = vn; \
672 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
673 do { \
674 TYPED m0 = m_indexed[i * 4 + 0]; \
675 TYPED m1 = m_indexed[i * 4 + 1]; \
676 TYPED m2 = m_indexed[i * 4 + 2]; \
677 TYPED m3 = m_indexed[i * 4 + 3]; \
678 do { \
679 d[i] = (a[i] + \
680 n[i * 4 + 0] * m0 + \
681 n[i * 4 + 1] * m1 + \
682 n[i * 4 + 2] * m2 + \
683 n[i * 4 + 3] * m3); \
684 } while (++i < segend); \
685 segend = i + 4; \
686 } while (i < opr_sz_n); \
687 clear_tail(d, opr_sz, simd_maxsz(desc)); \
690 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
691 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
692 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
693 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
694 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
695 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
697 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
698 void *vfpst, uint32_t desc)
700 uintptr_t opr_sz = simd_oprsz(desc);
701 float16 *d = vd;
702 float16 *n = vn;
703 float16 *m = vm;
704 float_status *fpst = vfpst;
705 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
706 uint32_t neg_imag = neg_real ^ 1;
707 uintptr_t i;
709 /* Shift boolean to the sign bit so we can xor to negate. */
710 neg_real <<= 15;
711 neg_imag <<= 15;
713 for (i = 0; i < opr_sz / 2; i += 2) {
714 float16 e0 = n[H2(i)];
715 float16 e1 = m[H2(i + 1)] ^ neg_imag;
716 float16 e2 = n[H2(i + 1)];
717 float16 e3 = m[H2(i)] ^ neg_real;
719 d[H2(i)] = float16_add(e0, e1, fpst);
720 d[H2(i + 1)] = float16_add(e2, e3, fpst);
722 clear_tail(d, opr_sz, simd_maxsz(desc));
725 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
726 void *vfpst, uint32_t desc)
728 uintptr_t opr_sz = simd_oprsz(desc);
729 float32 *d = vd;
730 float32 *n = vn;
731 float32 *m = vm;
732 float_status *fpst = vfpst;
733 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
734 uint32_t neg_imag = neg_real ^ 1;
735 uintptr_t i;
737 /* Shift boolean to the sign bit so we can xor to negate. */
738 neg_real <<= 31;
739 neg_imag <<= 31;
741 for (i = 0; i < opr_sz / 4; i += 2) {
742 float32 e0 = n[H4(i)];
743 float32 e1 = m[H4(i + 1)] ^ neg_imag;
744 float32 e2 = n[H4(i + 1)];
745 float32 e3 = m[H4(i)] ^ neg_real;
747 d[H4(i)] = float32_add(e0, e1, fpst);
748 d[H4(i + 1)] = float32_add(e2, e3, fpst);
750 clear_tail(d, opr_sz, simd_maxsz(desc));
753 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
754 void *vfpst, uint32_t desc)
756 uintptr_t opr_sz = simd_oprsz(desc);
757 float64 *d = vd;
758 float64 *n = vn;
759 float64 *m = vm;
760 float_status *fpst = vfpst;
761 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
762 uint64_t neg_imag = neg_real ^ 1;
763 uintptr_t i;
765 /* Shift boolean to the sign bit so we can xor to negate. */
766 neg_real <<= 63;
767 neg_imag <<= 63;
769 for (i = 0; i < opr_sz / 8; i += 2) {
770 float64 e0 = n[i];
771 float64 e1 = m[i + 1] ^ neg_imag;
772 float64 e2 = n[i + 1];
773 float64 e3 = m[i] ^ neg_real;
775 d[i] = float64_add(e0, e1, fpst);
776 d[i + 1] = float64_add(e2, e3, fpst);
778 clear_tail(d, opr_sz, simd_maxsz(desc));
781 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
782 void *vfpst, uint32_t desc)
784 uintptr_t opr_sz = simd_oprsz(desc);
785 float16 *d = vd, *n = vn, *m = vm, *a = va;
786 float_status *fpst = vfpst;
787 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
788 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
789 uint32_t neg_real = flip ^ neg_imag;
790 uintptr_t i;
792 /* Shift boolean to the sign bit so we can xor to negate. */
793 neg_real <<= 15;
794 neg_imag <<= 15;
796 for (i = 0; i < opr_sz / 2; i += 2) {
797 float16 e2 = n[H2(i + flip)];
798 float16 e1 = m[H2(i + flip)] ^ neg_real;
799 float16 e4 = e2;
800 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
802 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
803 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
805 clear_tail(d, opr_sz, simd_maxsz(desc));
808 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
809 void *vfpst, uint32_t desc)
811 uintptr_t opr_sz = simd_oprsz(desc);
812 float16 *d = vd, *n = vn, *m = vm, *a = va;
813 float_status *fpst = vfpst;
814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
817 uint32_t neg_real = flip ^ neg_imag;
818 intptr_t elements = opr_sz / sizeof(float16);
819 intptr_t eltspersegment = 16 / sizeof(float16);
820 intptr_t i, j;
822 /* Shift boolean to the sign bit so we can xor to negate. */
823 neg_real <<= 15;
824 neg_imag <<= 15;
826 for (i = 0; i < elements; i += eltspersegment) {
827 float16 mr = m[H2(i + 2 * index + 0)];
828 float16 mi = m[H2(i + 2 * index + 1)];
829 float16 e1 = neg_real ^ (flip ? mi : mr);
830 float16 e3 = neg_imag ^ (flip ? mr : mi);
832 for (j = i; j < i + eltspersegment; j += 2) {
833 float16 e2 = n[H2(j + flip)];
834 float16 e4 = e2;
836 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
837 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
840 clear_tail(d, opr_sz, simd_maxsz(desc));
843 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
844 void *vfpst, uint32_t desc)
846 uintptr_t opr_sz = simd_oprsz(desc);
847 float32 *d = vd, *n = vn, *m = vm, *a = va;
848 float_status *fpst = vfpst;
849 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
850 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
851 uint32_t neg_real = flip ^ neg_imag;
852 uintptr_t i;
854 /* Shift boolean to the sign bit so we can xor to negate. */
855 neg_real <<= 31;
856 neg_imag <<= 31;
858 for (i = 0; i < opr_sz / 4; i += 2) {
859 float32 e2 = n[H4(i + flip)];
860 float32 e1 = m[H4(i + flip)] ^ neg_real;
861 float32 e4 = e2;
862 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
864 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
865 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
867 clear_tail(d, opr_sz, simd_maxsz(desc));
870 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
871 void *vfpst, uint32_t desc)
873 uintptr_t opr_sz = simd_oprsz(desc);
874 float32 *d = vd, *n = vn, *m = vm, *a = va;
875 float_status *fpst = vfpst;
876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
879 uint32_t neg_real = flip ^ neg_imag;
880 intptr_t elements = opr_sz / sizeof(float32);
881 intptr_t eltspersegment = 16 / sizeof(float32);
882 intptr_t i, j;
884 /* Shift boolean to the sign bit so we can xor to negate. */
885 neg_real <<= 31;
886 neg_imag <<= 31;
888 for (i = 0; i < elements; i += eltspersegment) {
889 float32 mr = m[H4(i + 2 * index + 0)];
890 float32 mi = m[H4(i + 2 * index + 1)];
891 float32 e1 = neg_real ^ (flip ? mi : mr);
892 float32 e3 = neg_imag ^ (flip ? mr : mi);
894 for (j = i; j < i + eltspersegment; j += 2) {
895 float32 e2 = n[H4(j + flip)];
896 float32 e4 = e2;
898 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
899 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
902 clear_tail(d, opr_sz, simd_maxsz(desc));
905 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
906 void *vfpst, uint32_t desc)
908 uintptr_t opr_sz = simd_oprsz(desc);
909 float64 *d = vd, *n = vn, *m = vm, *a = va;
910 float_status *fpst = vfpst;
911 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
912 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
913 uint64_t neg_real = flip ^ neg_imag;
914 uintptr_t i;
916 /* Shift boolean to the sign bit so we can xor to negate. */
917 neg_real <<= 63;
918 neg_imag <<= 63;
920 for (i = 0; i < opr_sz / 8; i += 2) {
921 float64 e2 = n[i + flip];
922 float64 e1 = m[i + flip] ^ neg_real;
923 float64 e4 = e2;
924 float64 e3 = m[i + 1 - flip] ^ neg_imag;
926 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
927 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
929 clear_tail(d, opr_sz, simd_maxsz(desc));
933 * Floating point comparisons producing an integer result (all 1s or all 0s).
934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
937 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
939 return -float16_eq_quiet(op1, op2, stat);
942 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
944 return -float32_eq_quiet(op1, op2, stat);
947 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
949 return -float16_le(op2, op1, stat);
952 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
954 return -float32_le(op2, op1, stat);
957 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
959 return -float16_lt(op2, op1, stat);
962 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
964 return -float32_lt(op2, op1, stat);
967 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
969 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
972 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
974 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
977 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
979 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
982 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
984 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
987 static int16_t vfp_tosszh(float16 x, void *fpstp)
989 float_status *fpst = fpstp;
990 if (float16_is_any_nan(x)) {
991 float_raise(float_flag_invalid, fpst);
992 return 0;
994 return float16_to_int16_round_to_zero(x, fpst);
997 static uint16_t vfp_touszh(float16 x, void *fpstp)
999 float_status *fpst = fpstp;
1000 if (float16_is_any_nan(x)) {
1001 float_raise(float_flag_invalid, fpst);
1002 return 0;
1004 return float16_to_uint16_round_to_zero(x, fpst);
/*
 * Expand a unary vector helper: apply FUNC element-wise from vn to vd.
 * oprsz is the active vector length in bytes; bytes beyond it up to
 * simd_maxsz() are zeroed by clear_tail().
 */
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)
/* Wrap a binary compare as a compare-against-zero, operand on the left. */
#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
{                                                          \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat);          \
}

/* As above, but with zero on the left (for < and <= via swapped > / >=). */
#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
{                                                          \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat);          \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)              \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)              \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)  /* op < 0  ==  0 > op */
DO_2OP_CMP0(cle, cge, REV)  /* op <= 0 ==  0 >= op */

#undef DO_2OP
#undef DO_2OP_CMP0
/*
 * Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul:
 * square the input; bit 0 of op2 then supplies the sign of the
 * (non-NaN) result.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}
/* Absolute difference: |op1 - op2|, with normal subtraction rounding. */
static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}
/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    /* inf * 0 would produce a default NaN; the architecture wants 2.0. */
    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}
/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    /* inf * 0 would produce a default NaN; the architecture wants 1.5. */
    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    /* (3 - op1 * op2) / 2 */
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
/*
 * Expand a binary vector helper: d[i] = FUNC(n[i], m[i]) element-wise,
 * with the tail beyond oprsz zeroed.
 */
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
/* AArch64 uses the fused (FPRecipStepFused/FPRSqrtStepFused) variants. */

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP
/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

/* Non-fused multiply-subtract: dest - op1 * op2 */
static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

/* VFMS negates op1 before the fused multiply-add. */
static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}
/*
 * Expand a 3-input multiply-accumulate helper; the destination is also
 * the accumulator: d[i] = FUNC(d[i], n[i], m[i]).
 */
#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)      \
{                                                                   \
    intptr_t i, j, oprsz = simd_oprsz(desc);                        \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);               \
    intptr_t idx = simd_data(desc);                                 \
    TYPE *d = vd, *n = vn, *m = vm;                                 \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {           \
        TYPE mm = m[H(i + idx)];  /* one multiplicand per segment */\
        for (j = 0; j < segment; j++) {                             \
            d[i + j] = n[i + j] * mm;                               \
        }                                                           \
    }                                                               \
    clear_tail(d, oprsz, simd_maxsz(desc));                         \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX
/*
 * Indexed integer multiply-accumulate: d = a OP (n * m[idx]),
 * with the index applied per 128-bit segment as for DO_MUL_IDX.
 */
#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                        \
    intptr_t i, j, oprsz = simd_oprsz(desc);                             \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                    \
    intptr_t idx = simd_data(desc);                                      \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                             \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                \
        TYPE mm = m[H(i + idx)];                                         \
        for (j = 0; j < segment; j++) {                                  \
            d[i + j] = a[i + j] OP n[i + j] * mm;                        \
        }                                                                \
    }                                                                    \
    clear_tail(d, oprsz, simd_maxsz(desc));                              \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX
/*
 * Indexed FP multiply, optionally accumulating into d:
 * d = ADD(d, n * m[idx]).  ADD == nop discards the first argument,
 * giving a plain indexed multiply.
 */
#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_##ADD(d[i + j],                              \
                                    TYPE##_mul(n[i + j], mm, stat), stat); \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define float16_nop(N, M, S) (M)
#define float32_nop(N, M, S) (M)
#define float64_nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)

#undef float16_nop
#undef float32_nop
#undef float64_nop
#undef DO_FMUL_IDX
/*
 * Indexed fused multiply-add: d = fma(n ^ op1_neg, m[idx], a).
 * Bit 0 of simd_data selects negation of op1 (FMLS); the remaining
 * bits are the element index.
 */
#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,           \
                  void *stat, uint32_t desc)                        \
{                                                                   \
    intptr_t i, j, oprsz = simd_oprsz(desc);                        \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);               \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);             \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                   \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                        \
    op1_neg <<= (8 * sizeof(TYPE) - 1); /* move to the sign bit */  \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {           \
        TYPE mm = m[H(i + idx)];                                    \
        for (j = 0; j < segment; j++) {                             \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,            \
                                     mm, a[i + j], 0, stat);        \
        }                                                           \
    }                                                               \
    clear_tail(d, oprsz, simd_maxsz(desc));                         \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX
/*
 * Saturating add/sub for b/h/s element sizes: do the arithmetic in
 * the wider WTYPE, clamp to [MIN, MAX], and set the QC flag (low word
 * of vq) if any lane saturated.
 */
#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{                                                                        \
    intptr_t i, oprsz = simd_oprsz(desc);                                \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                               \
    bool q = false;                                                      \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                        \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                  \
        if (dd < MIN) {                                                  \
            dd = MIN;                                                    \
            q = true;                                                    \
        } else if (dd > MAX) {                                           \
            dd = MAX;                                                    \
            q = true;                                                    \
        }                                                                \
        d[i] = dd;                                                       \
    }                                                                    \
    if (q) {                                                             \
        uint32_t *qc = vq;                                               \
        qc[0] = 1;                                                       \
    }                                                                    \
    clear_tail(d, oprsz, simd_maxsz(desc));                              \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
1468 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1469 void *vm, uint32_t desc)
1471 intptr_t i, oprsz = simd_oprsz(desc);
1472 uint64_t *d = vd, *n = vn, *m = vm;
1473 bool q = false;
1475 for (i = 0; i < oprsz / 8; i++) {
1476 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1477 if (dd < nn) {
1478 dd = UINT64_MAX;
1479 q = true;
1481 d[i] = dd;
1483 if (q) {
1484 uint32_t *qc = vq;
1485 qc[0] = 1;
1487 clear_tail(d, oprsz, simd_maxsz(desc));
1490 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1491 void *vm, uint32_t desc)
1493 intptr_t i, oprsz = simd_oprsz(desc);
1494 uint64_t *d = vd, *n = vn, *m = vm;
1495 bool q = false;
1497 for (i = 0; i < oprsz / 8; i++) {
1498 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1499 if (nn < mm) {
1500 dd = 0;
1501 q = true;
1503 d[i] = dd;
1505 if (q) {
1506 uint32_t *qc = vq;
1507 qc[0] = 1;
1509 clear_tail(d, oprsz, simd_maxsz(desc));
1512 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1513 void *vm, uint32_t desc)
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516 int64_t *d = vd, *n = vn, *m = vm;
1517 bool q = false;
1519 for (i = 0; i < oprsz / 8; i++) {
1520 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1521 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1522 dd = (nn >> 63) ^ ~INT64_MIN;
1523 q = true;
1525 d[i] = dd;
1527 if (q) {
1528 uint32_t *qc = vq;
1529 qc[0] = 1;
1531 clear_tail(d, oprsz, simd_maxsz(desc));
1534 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1535 void *vm, uint32_t desc)
1537 intptr_t i, oprsz = simd_oprsz(desc);
1538 int64_t *d = vd, *n = vn, *m = vm;
1539 bool q = false;
1541 for (i = 0; i < oprsz / 8; i++) {
1542 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1543 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1544 dd = (nn >> 63) ^ ~INT64_MIN;
1545 q = true;
1547 d[i] = dd;
1549 if (q) {
1550 uint32_t *qc = vq;
1551 qc[0] = 1;
1553 clear_tail(d, oprsz, simd_maxsz(desc));
/*
 * Shift-right-accumulate: d[i] += n[i] >> shift, arithmetic or
 * logical according to the signedness of TYPE.
 */
#define DO_SRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
/*
 * Rounding shift right: d[i] = (n[i] >> shift) + rounding bit, where
 * the rounding bit is bit (shift - 1) of n[i].  Shifting by (shift - 1)
 * first keeps that bit available as bit 0 without a wider intermediate.
 */
#define DO_RSHR(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
/*
 * Rounding shift-right-accumulate: as DO_RSHR but the rounded
 * shift result is added into d[i] rather than replacing it.
 */
#define DO_RSRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
/*
 * Shift-right-insert (SRI): replace the low (width - shift) bits of
 * d[i] with n[i] >> shift, preserving d[i]'s top 'shift' bits.
 */
#define DO_SRI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI
/*
 * Shift-left-insert (SLI): replace the high (width - shift) bits of
 * d[i] with n[i], preserving d[i]'s low 'shift' bits.
 */
#define DO_SLI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 * If fz16 is set, denormal inputs are flushed to (signed) zero.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    /* Repack the fields at their float32 positions.  */
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
/*
 * Branchless load of u32[0], u64[0], u32[1], or u64[1]:
 * the 2nd qword iff is_q & is_2, the 2nd dword of qword 0 iff
 * !is_q & is_2.  For !is_q & !is_2 the upper 32 bits of the
 * result are unused garbage.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int qword = is_q & is_2;
    int shift = (is_2 & ~is_q) << 5;

    return ptr[qword] >> shift;
}
/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);     /* FMLSL? */
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); /* upper half? */
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
/* AArch32 FMLAL/FMLSL: uses the Neon standard FP status. */
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/* AArch64 FMLAL/FMLSL: uses the normal FP status. */
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
/*
 * SVE2 FMLAL/FMLSL (vectors): widen selected f16 elements of Zn/Zm
 * and fuse into the f32 accumulator Za.
 */
void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    /* Bit 0 of simd_data: FMLSL, applied by flipping the f16 sign bit. */
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    /* Bit 1: select the odd (top) or even (bottom) f16 elements. */
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}
/* As do_fmlal, but with a single indexed multiplicand from vm. */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);     /* FMLSL? */
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); /* upper half? */
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    /* Widen the single indexed multiplicand once, up front. */
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
/* AArch32 FMLAL/FMLSL (by element): uses the Neon standard FP status. */
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/* AArch64 FMLAL/FMLSL (by element): uses the normal FP status. */
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
/*
 * SVE2 FMLAL/FMLSL (indexed): as sve2_fmlal_zzzw_s, but the
 * multiplicand is a single indexed f16 element within each
 * 128-bit segment of Zm.
 */
void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    /* Outer loop walks 128-bit segments; the index applies per segment. */
    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}
/*
 * Signed shift by a per-element signed shift count (SSHL).
 * Positive counts shift left (0 if count >= element width);
 * negative counts shift right arithmetically, clamped so that
 * large counts yield the sign fill.
 */
void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            /* Clamp the right-shift count at 7 to get the sign fill. */
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * Unsigned shift by a per-element signed shift count (USHL).
 * Positive counts shift left, negative counts shift right logically;
 * counts whose magnitude is >= the element width give 0.
 */
void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
1953 * 8x8->8 polynomial multiply.
1955 * Polynomial multiplication is like integer multiplication except the
1956 * partial products are XORed, not added.
1958 * TODO: expose this as a generic vector operation, as it is a common
1959 * crypto building block.
1961 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1963 intptr_t i, j, opr_sz = simd_oprsz(desc);
1964 uint64_t *d = vd, *n = vn, *m = vm;
1966 for (i = 0; i < opr_sz / 8; ++i) {
1967 uint64_t nn = n[i];
1968 uint64_t mm = m[i];
1969 uint64_t rr = 0;
1971 for (j = 0; j < 8; ++j) {
1972 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1973 rr ^= mm & mask;
1974 mm = (mm << 1) & 0xfefefefefefefefeull;
1975 nn >>= 1;
1977 d[i] = rr;
1979 clear_tail(d, opr_sz, simd_maxsz(desc));
1983 * 64x64->128 polynomial multiply.
1984 * Because of the lanes are not accessed in strict columns,
1985 * this probably cannot be turned into a generic helper.
1987 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
1989 intptr_t i, j, opr_sz = simd_oprsz(desc);
1990 intptr_t hi = simd_data(desc);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1993 for (i = 0; i < opr_sz / 8; i += 2) {
1994 uint64_t nn = n[i + hi];
1995 uint64_t mm = m[i + hi];
1996 uint64_t rhi = 0;
1997 uint64_t rlo = 0;
1999 /* Bit 0 can only influence the low 64-bit result. */
2000 if (nn & 1) {
2001 rlo = mm;
2004 for (j = 1; j < 64; ++j) {
2005 uint64_t mask = -((nn >> j) & 1);
2006 rlo ^= (mm << j) & mask;
2007 rhi ^= (mm >> (64 - j)) & mask;
2009 d[i] = rlo;
2010 d[i + 1] = rhi;
2012 clear_tail(d, opr_sz, simd_maxsz(desc));
/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

/* Spread the low 4 bytes of x into the 4 half-word lanes of the result. */
static uint64_t expand_byte_to_half(uint64_t x)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 4; ++i) {
        r |= ((x >> (i * 8)) & 0xff) << (i * 16);
    }
    return r;
}
/*
 * Carry-less multiply of the four 8-bit values held in the 16-bit
 * lanes of op1/op2, producing four 16-bit products in parallel.
 */
static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 8; ++bit) {
        /* Broadcast bit 0 of each 16-bit lane of op1 across its lane. */
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        acc ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return acc;
}
/*
 * Neon 8x8->16 PMULL: the eight source bytes come from qword 'hi'
 * of Vn/Vm; the low four bytes produce d[0], the high four d[1].
 */
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}
2059 #ifdef TARGET_AARCH64
/*
 * SVE2 8x8->16 PMULL: the source bytes are the even (shift 0) or odd
 * (shift 8) bytes of each 16-bit lane, selected by simd_data.
 */
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
/* Carry-less multiply of two 32-bit values into one 64-bit product. */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 32; ++bit) {
        if ((op1 >> bit) & 1) {
            acc ^= op2 << bit;
        }
    }
    return acc;
}
/*
 * SVE2 32x32->64 PMULL: multiply the even (sel 0) or odd (sel 1)
 * 32-bit element of each 64-bit lane.
 */
void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
    }
}
2097 #endif
/*
 * Integer compare-against-zero: each element becomes all-ones when
 * (element OP 0) holds, else all-zeros.
 */
#define DO_CMP0(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
/* Absolute difference: d[i] = |n[i] - m[i]|, signed or unsigned per TYPE. */
#define DO_ABD(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD
2148 #define DO_ABA(NAME, TYPE) \
2149 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2151 intptr_t i, opr_sz = simd_oprsz(desc); \
2152 TYPE *d = vd, *n = vn, *m = vm; \
2154 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2155 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2157 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2160 DO_ABA(gvec_saba_b, int8_t)
2161 DO_ABA(gvec_saba_h, int16_t)
2162 DO_ABA(gvec_saba_s, int32_t)
2163 DO_ABA(gvec_saba_d, int64_t)
2165 DO_ABA(gvec_uaba_b, uint8_t)
2166 DO_ABA(gvec_uaba_h, uint16_t)
2167 DO_ABA(gvec_uaba_s, uint32_t)
2168 DO_ABA(gvec_uaba_d, uint64_t)
2170 #undef DO_ABA
2172 #define DO_NEON_PAIRWISE(NAME, OP) \
2173 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
2174 void *stat, uint32_t oprsz) \
2176 float_status *fpst = stat; \
2177 float32 *d = vd; \
2178 float32 *n = vn; \
2179 float32 *m = vm; \
2180 float32 r0, r1; \
2182 /* Read all inputs before writing outputs in case vm == vd */ \
2183 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
2184 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
2186 d[H4(0)] = r0; \
2187 d[H4(1)] = r1; \
2190 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
2191 void *stat, uint32_t oprsz) \
2193 float_status *fpst = stat; \
2194 float16 *d = vd; \
2195 float16 *n = vn; \
2196 float16 *m = vm; \
2197 float16 r0, r1, r2, r3; \
2199 /* Read all inputs before writing outputs in case vm == vd */ \
2200 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
2201 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
2202 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
2203 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
2205 d[H2(0)] = r0; \
2206 d[H2(1)] = r1; \
2207 d[H2(2)] = r2; \
2208 d[H2(3)] = r3; \
2211 DO_NEON_PAIRWISE(neon_padd, add)
2212 DO_NEON_PAIRWISE(neon_pmax, max)
2213 DO_NEON_PAIRWISE(neon_pmin, min)
2215 #undef DO_NEON_PAIRWISE
2217 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2218 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2220 intptr_t i, oprsz = simd_oprsz(desc); \
2221 int shift = simd_data(desc); \
2222 TYPE *d = vd, *n = vn; \
2223 float_status *fpst = stat; \
2224 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2225 d[i] = FUNC(n[i], shift, fpst); \
2227 clear_tail(d, oprsz, simd_maxsz(desc)); \
2230 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2231 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2232 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2233 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2234 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2235 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2236 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2237 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2239 #undef DO_VCVT_FIXED
2241 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2242 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2244 float_status *fpst = stat; \
2245 intptr_t i, oprsz = simd_oprsz(desc); \
2246 uint32_t rmode = simd_data(desc); \
2247 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2248 TYPE *d = vd, *n = vn; \
2249 set_float_rounding_mode(rmode, fpst); \
2250 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2251 d[i] = FUNC(n[i], 0, fpst); \
2253 set_float_rounding_mode(prev_rmode, fpst); \
2254 clear_tail(d, oprsz, simd_maxsz(desc)); \
2257 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2258 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2259 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2260 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2262 #undef DO_VCVT_RMODE
2264 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2265 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2267 float_status *fpst = stat; \
2268 intptr_t i, oprsz = simd_oprsz(desc); \
2269 uint32_t rmode = simd_data(desc); \
2270 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2271 TYPE *d = vd, *n = vn; \
2272 set_float_rounding_mode(rmode, fpst); \
2273 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2274 d[i] = FUNC(n[i], fpst); \
2276 set_float_rounding_mode(prev_rmode, fpst); \
2277 clear_tail(d, oprsz, simd_maxsz(desc)); \
2280 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2281 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2283 #undef DO_VRINT_RMODE
#ifdef TARGET_AARCH64
/*
 * AArch64 TBL/TBX: look up each index byte of VM in a virtual table of
 * TABLE_LEN bytes formed by consecutive vector registers starting at
 * RN.  Out-of-range indices yield zero for TBL, or leave the
 * destination byte unchanged for TBX.
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * Build the result in a temporary, lest the output overlap the
     * input table.  TBL starts from zero; TBX starts from the old
     * destination contents.  We always process 16 bytes to avoid a
     * branch; clearing the high half for oprsz == 8 is handled by
     * clear_tail below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert the byte offset into the concatenated table
             * into a register number plus element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif
2334 * NxN -> N highpart multiply
2336 * TODO: expose this as a generic vector operation.
2339 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2341 intptr_t i, opr_sz = simd_oprsz(desc);
2342 int8_t *d = vd, *n = vn, *m = vm;
2344 for (i = 0; i < opr_sz; ++i) {
2345 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2347 clear_tail(d, opr_sz, simd_maxsz(desc));
2350 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2352 intptr_t i, opr_sz = simd_oprsz(desc);
2353 int16_t *d = vd, *n = vn, *m = vm;
2355 for (i = 0; i < opr_sz / 2; ++i) {
2356 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2358 clear_tail(d, opr_sz, simd_maxsz(desc));
2361 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2363 intptr_t i, opr_sz = simd_oprsz(desc);
2364 int32_t *d = vd, *n = vn, *m = vm;
2366 for (i = 0; i < opr_sz / 4; ++i) {
2367 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2369 clear_tail(d, opr_sz, simd_maxsz(desc));
2372 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2374 intptr_t i, opr_sz = simd_oprsz(desc);
2375 uint64_t *d = vd, *n = vn, *m = vm;
2376 uint64_t discard;
2378 for (i = 0; i < opr_sz / 8; ++i) {
2379 muls64(&discard, &d[i], n[i], m[i]);
2381 clear_tail(d, opr_sz, simd_maxsz(desc));
2384 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2386 intptr_t i, opr_sz = simd_oprsz(desc);
2387 uint8_t *d = vd, *n = vn, *m = vm;
2389 for (i = 0; i < opr_sz; ++i) {
2390 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2392 clear_tail(d, opr_sz, simd_maxsz(desc));
2395 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2397 intptr_t i, opr_sz = simd_oprsz(desc);
2398 uint16_t *d = vd, *n = vn, *m = vm;
2400 for (i = 0; i < opr_sz / 2; ++i) {
2401 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2403 clear_tail(d, opr_sz, simd_maxsz(desc));
2406 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2408 intptr_t i, opr_sz = simd_oprsz(desc);
2409 uint32_t *d = vd, *n = vn, *m = vm;
2411 for (i = 0; i < opr_sz / 4; ++i) {
2412 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2414 clear_tail(d, opr_sz, simd_maxsz(desc));
2417 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2419 intptr_t i, opr_sz = simd_oprsz(desc);
2420 uint64_t *d = vd, *n = vn, *m = vm;
2421 uint64_t discard;
2423 for (i = 0; i < opr_sz / 8; ++i) {
2424 mulu64(&discard, &d[i], n[i], m[i]);
2426 clear_tail(d, opr_sz, simd_maxsz(desc));
2429 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2431 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2432 int shr = simd_data(desc);
2433 uint64_t *d = vd, *n = vn, *m = vm;
2435 for (i = 0; i < opr_sz; ++i) {
2436 d[i] = ror64(n[i] ^ m[i], shr);
2438 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
/*
 * Integer matrix-multiply accumulate
 */

/* Dot product of 8 signed bytes, accumulated into SUM. */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/* Dot product of 8 unsigned bytes, accumulated into SUM. */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/* Dot product of 8 unsigned (N) by signed (M) bytes, accumulated into SUM. */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/*
 * 2x2 matrix-multiply-accumulate of byte elements, one 16-byte segment
 * at a time, with the 8-element row/column dot product supplied by
 * INNER_LOOP.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2510 #define DO_MMLA_B(NAME, INNER) \
2511 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2512 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2514 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2515 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2516 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2519 * BFloat16 Dot Product
2522 static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2524 /* FPCR is ignored for BFDOT and BFMMLA. */
2525 float_status bf_status = {
2526 .tininess_before_rounding = float_tininess_before_rounding,
2527 .float_rounding_mode = float_round_to_odd_inf,
2528 .flush_to_zero = true,
2529 .flush_inputs_to_zero = true,
2530 .default_nan_mode = true,
2532 float32 t1, t2;
2535 * Extract each BFloat16 from the element pair, and shift
2536 * them such that they become float32.
2538 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2539 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2540 t1 = float32_add(t1, t2, &bf_status);
2541 t1 = float32_add(sum, t1, &bf_status);
2543 return t1;
2546 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2548 intptr_t i, opr_sz = simd_oprsz(desc);
2549 float32 *d = vd, *a = va;
2550 uint32_t *n = vn, *m = vm;
2552 for (i = 0; i < opr_sz / 4; ++i) {
2553 d[i] = bfdotadd(a[i], n[i], m[i]);
2555 clear_tail(d, opr_sz, simd_maxsz(desc));
2558 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2559 void *va, uint32_t desc)
2561 intptr_t i, j, opr_sz = simd_oprsz(desc);
2562 intptr_t index = simd_data(desc);
2563 intptr_t elements = opr_sz / 4;
2564 intptr_t eltspersegment = MIN(16 / 4, elements);
2565 float32 *d = vd, *a = va;
2566 uint32_t *n = vn, *m = vm;
2568 for (i = 0; i < elements; i += eltspersegment) {
2569 uint32_t m_idx = m[i + H4(index)];
2571 for (j = i; j < i + eltspersegment; j++) {
2572 d[j] = bfdotadd(a[j], n[j], m_idx);
2575 clear_tail(d, opr_sz, simd_maxsz(desc));
2578 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2580 intptr_t s, opr_sz = simd_oprsz(desc);
2581 float32 *d = vd, *a = va;
2582 uint32_t *n = vn, *m = vm;
2584 for (s = 0; s < opr_sz / 4; s += 4) {
2585 float32 sum00, sum01, sum10, sum11;
2588 * Process the entire segment at once, writing back the
2589 * results only after we've consumed all of the inputs.
2591 * Key to indicies by column:
2592 * i j i k j k
2594 sum00 = a[s + H4(0 + 0)];
2595 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2596 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2598 sum01 = a[s + H4(0 + 1)];
2599 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2600 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2602 sum10 = a[s + H4(2 + 0)];
2603 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2604 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2606 sum11 = a[s + H4(2 + 1)];
2607 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2608 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2610 d[s + H4(0 + 0)] = sum00;
2611 d[s + H4(0 + 1)] = sum01;
2612 d[s + H4(2 + 0)] = sum10;
2613 d[s + H4(2 + 1)] = sum11;
2615 clear_tail(d, opr_sz, simd_maxsz(desc));
2618 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2619 void *stat, uint32_t desc)
2621 intptr_t i, opr_sz = simd_oprsz(desc);
2622 intptr_t sel = simd_data(desc);
2623 float32 *d = vd, *a = va;
2624 bfloat16 *n = vn, *m = vm;
2626 for (i = 0; i < opr_sz / 4; ++i) {
2627 float32 nn = n[H2(i * 2 + sel)] << 16;
2628 float32 mm = m[H2(i * 2 + sel)] << 16;
2629 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2631 clear_tail(d, opr_sz, simd_maxsz(desc));
2634 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2635 void *va, void *stat, uint32_t desc)
2637 intptr_t i, j, opr_sz = simd_oprsz(desc);
2638 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2639 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2640 intptr_t elements = opr_sz / 4;
2641 intptr_t eltspersegment = MIN(16 / 4, elements);
2642 float32 *d = vd, *a = va;
2643 bfloat16 *n = vn, *m = vm;
2645 for (i = 0; i < elements; i += eltspersegment) {
2646 float32 m_idx = m[H2(2 * i + index)] << 16;
2648 for (j = i; j < i + eltspersegment; j++) {
2649 float32 n_j = n[H2(2 * j + sel)] << 16;
2650 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2653 clear_tail(d, opr_sz, simd_maxsz(desc));