/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1
#define NEON_TYPE1(name, type) \
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
#define NEON_TYPE4(name, type) \
#else
#define NEON_TYPE2(name, type) \
#define NEON_TYPE4(name, type) \
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
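/*
 * Illustrative sketch (added, not part of the original file): with the
 * union plumbing above, NEON_VOP(sub_u8, neon_u8, 4) behaves like the
 * function below on a little-endian host, treating a uint32_t as four
 * independent u8 lanes. The name example_sub_u8 is hypothetical.
 */
static inline uint32_t example_sub_u8(uint32_t arg1, uint32_t arg2)
{
    uint32_t res = 0;
    int i;

    for (i = 0; i < 4; i++) {
        uint8_t a = arg1 >> (i * 8);                   /* extract lane i */
        uint8_t b = arg2 >> (i * 8);
        res |= (uint32_t)(uint8_t)(a - b) << (i * 8);  /* repack lane i */
    }
    return res;
}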
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    CPUARMState *env = venv; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
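/*
 * Pairwise layout sketch (added, hypothetical name, little-endian lane
 * order assumed): per NEON_PDO4, result lanes 0-1 reduce adjacent lanes
 * of arg1 and result lanes 2-3 reduce adjacent lanes of arg2.
 */
static inline uint32_t example_pmin_u8(uint32_t arg1, uint32_t arg2)
{
    uint32_t res = 0;
    int i;

    for (i = 0; i < 2; i++) {
        uint8_t a = arg1 >> (i * 16);        /* even lane of pair */
        uint8_t b = arg1 >> (i * 16 + 8);    /* odd lane of pair */
        res |= (uint32_t)(a < b ? a : b) << (i * 8);
        a = arg2 >> (i * 16);
        b = arg2 >> (i * 16 + 8);
        res |= (uint32_t)(a < b ? a : b) << (i * 8 + 16);
    }
    return res;
}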
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
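/*
 * Rounding-shift sketch (added; assumes the semantics of the
 * do_uqrshl_bhs helper from vec_internal.h): the shift count is the
 * signed low byte of the second operand, positive for left shifts and
 * negative for right shifts, and "rounding" adds half of the discarded
 * range before a right shift.
 */
static inline uint16_t example_urshl16(uint16_t val, int8_t shift)
{
    if (shift >= 0) {
        return shift >= 16 ? 0 : val << shift;
    } else {
        int n = -shift;
        if (n > 16) {
            return 0;
        }
        /* add the rounding bit, then shift right */
        return (uint16_t)(((uint32_t)val + (1u << (n - 1))) >> n);
    }
}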
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
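/*
 * Saturation sketch for one u8 lane (added; assumed behaviour of
 * do_uqrshl_bhs with round=false and a non-NULL saturation pointer):
 * a left shift that would lose set bits clamps to the lane maximum and
 * raises the sticky QC flag, mirroring env->vfp.qc above.
 */
static inline uint8_t example_uqshl8(uint8_t val, int8_t shift, uint32_t *qc)
{
    if (shift >= 8) {
        if (val) {
            *qc = 1;                 /* every set bit would be lost */
            return 0xff;
        }
        return 0;
    } else if (shift >= 0) {
        uint32_t res = (uint32_t)val << shift;
        if (res > 0xff) {
            *qc = 1;                 /* overflow: saturate */
            return 0xff;
        }
        return res;
    }
    /* negative count: plain right shift, cannot saturate */
    return shift <= -8 ? 0 : val >> -shift;
}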
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}
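/*
 * Worked example of the carry-suppression trick above (added comment,
 * not in the original): per byte lane, clearing bit 7 before the add
 * keeps any carry from crossing into the next lane, and XORing the
 * saved (a ^ b) top bits back in completes the half-add for bit 7.
 * E.g. 0xff + 0x01 in one lane: mask = 0x80, 0x7f + 0x01 = 0x80, and
 * 0x80 ^ 0x80 = 0x00, i.e. the lane wraps without touching its
 * neighbour.
 */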
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
        if ((int32_t)tmp < old) { \

#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
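/*
 * Scalar sketch of the saturating doubling multiply (added,
 * hypothetical name, following the NEON_QDMULH16 fragments above):
 * only INT16_MIN * INT16_MIN overflows the doubled 32-bit product,
 * which is exactly what the "(tmp ^ (tmp << 1)) & SIGNBIT" test
 * detects (the top two bits of the product differ).
 */
static inline int16_t example_sqdmulh16(int16_t a, int16_t b, bool round,
                                        uint32_t *qc)
{
    int32_t p;

    if (a == INT16_MIN && b == INT16_MIN) {
        *qc = 1;                      /* 2 * 0x40000000 would overflow */
        return INT16_MAX;
    }
    p = 2 * (int32_t)a * b;           /* doubled product, exact */
    if (round) {
        p += 1 << 15;                 /* cannot overflow once the case
                                         above is excluded */
    }
    return p >> 16;                   /* high half */
}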
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            tmp = SIGNBIT64 - 1; \

#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
        res |= (uint32_t)d << (n / 2); \

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
    res |= (uint32_t)d << (n / 2);

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
    res |= (uint32_t)d << (n / 2);
/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        SET_QC();
        low = (low >> 31) ^ 0x7fff;
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        SET_QC();
        high = (high >> 31) ^ 0x7fff;
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant.  */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}
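/*
 * Note on the pattern above (added comment, not in the original): for
 * an out-of-range value, "(x >> 63) ^ 0x7fffffff" selects the saturated
 * result by sign: 0 ^ 0x7fffffff = INT32_MAX for positive overflow and
 * 0xffffffff ^ 0x7fffffff = 0x80000000 (INT32_MIN) for negative
 * overflow.
 */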
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
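/*
 * Borrow-suppression note (added comment, not in the original): the
 * subtract variants force each lane's top bit to 1 in a and to 0 in b,
 * so a - b can never borrow across a lane boundary. The saved
 * (a ^ ~b) top bits then repair each lane's top bit: the computed bit
 * is !borrow, the mask bit is !(a ^ b), and their XOR is
 * a ^ b ^ borrow, the true difference bit.
 */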
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
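/*
 * Worked example of why the wider arithtype matters (added): with
 * int32_t inputs INT32_MAX and INT32_MIN the true absolute difference
 * is 2^32 - 1, which only fits in the 64-bit arithtype. The name
 * example_abd_s32 is hypothetical.
 */
static inline uint64_t example_abd_s32(int32_t x, int32_t y)
{
    int64_t tmp_x = x;    /* widen before subtracting, as DO_ABD does */
    int64_t tmp_y = y;

    return tmp_x > tmp_y ? (uint64_t)(tmp_x - tmp_y)
                         : (uint64_t)(tmp_y - tmp_x);
}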
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
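/*
 * Added note: for the signed variants, type2 is an unsigned type on
 * purpose; each per-lane product is computed exactly in the wider type,
 * and the unsigned cast keeps "result |= tmp << 16" from smearing a
 * negative product's sign bits across the neighbouring lanes.
 */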
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}
/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
    } else if (x < 0) { \
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
    NEON_UNPACK(neon_s8, vec, x);
    NEON_PACK(neon_s8, x, vec);

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
    NEON_UNPACK(neon_s8, vec, x);
    NEON_PACK(neon_s8, x, vec);

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
    } else if (x < 0) { \
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
    NEON_UNPACK(neon_s16, vec, x);
    NEON_PACK(neon_s16, x, vec);

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
    NEON_UNPACK(neon_s16, vec, x);
    NEON_PACK(neon_s16, x, vec);

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}
/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
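/*
 * Added note: the unary minus below converts softfloat's 0/1 into the
 * all-zeroes/all-ones lane mask NEON expects, since -(uint32_t)1 is
 * 0xffffffffu.
 */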
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
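/*
 * Worked example (added): ELEM(0x0123456789abcdefull, 2, 16) shifts
 * right by 32 and masks with 0xffff, selecting bits [47:32] = 0x4567.
 */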
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}