/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"

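/*
 * Expand a 3-operand operation whose helper also updates the saturation
 * flag, passing a pointer to env->vfp.qc as the extra helper argument.
 */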
static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

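/* Compare each element against zero, producing all-ones (true) or zero. */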
#define GEN_CMP0(NAME, COND)                                            \
    void NAME(unsigned vece, uint32_t d, uint32_t m,                    \
              uint32_t opr_sz, uint32_t max_sz)                         \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

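/* Vector shift right by immediate, signed and unsigned forms. */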
void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Signed shift out of range results in all-sign-bits */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
}

void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Unsigned shift out of range results in all-zero-bits */
    if (shift >= (8 << vece)) {
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
    }
}

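/* SSRA: signed shift right and accumulate, d += a >> shift (arithmetic). */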
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

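/* USRA: unsigned shift right and accumulate, d += a >> shift (logical). */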
static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

374 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
376 TCGv_i64 t = tcg_temp_new_i64();
378 gen_srshr8_i64(t, a, sh);
379 tcg_gen_vec_add8_i64(d, d, t);
382 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
384 TCGv_i64 t = tcg_temp_new_i64();
386 gen_srshr16_i64(t, a, sh);
387 tcg_gen_vec_add16_i64(d, d, t);
390 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
392 TCGv_i32 t = tcg_temp_new_i32();
394 gen_srshr32_i32(t, a, sh);
395 tcg_gen_add_i32(d, d, t);
398 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
400 TCGv_i64 t = tcg_temp_new_i64();
402 gen_srshr64_i64(t, a, sh);
403 tcg_gen_add_i64(d, d, t);
406 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
408 TCGv_vec t = tcg_temp_new_vec_matching(d);
410 gen_srshr_vec(vece, t, a, sh);
411 tcg_gen_add_vec(vece, d, d, t);
414 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
415 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
417 static const TCGOpcode vecop_list[] = {
418 INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
420 static const GVecGen2i ops[4] = {
421 { .fni8 = gen_srsra8_i64,
422 .fniv = gen_srsra_vec,
423 .fno = gen_helper_gvec_srsra_b,
424 .opt_opc = vecop_list,
425 .load_dest = true,
426 .vece = MO_8 },
427 { .fni8 = gen_srsra16_i64,
428 .fniv = gen_srsra_vec,
429 .fno = gen_helper_gvec_srsra_h,
430 .opt_opc = vecop_list,
431 .load_dest = true,
432 .vece = MO_16 },
433 { .fni4 = gen_srsra32_i32,
434 .fniv = gen_srsra_vec,
435 .fno = gen_helper_gvec_srsra_s,
436 .opt_opc = vecop_list,
437 .load_dest = true,
438 .vece = MO_32 },
439 { .fni8 = gen_srsra64_i64,
440 .fniv = gen_srsra_vec,
441 .fno = gen_helper_gvec_srsra_d,
442 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
443 .opt_opc = vecop_list,
444 .load_dest = true,
445 .vece = MO_64 },
448 /* tszimm encoding produces immediates in the range [1..esize] */
449 tcg_debug_assert(shift > 0);
450 tcg_debug_assert(shift <= (8 << vece));
453 * Shifts larger than the element size are architecturally valid.
454 * Signed results in all sign bits. With rounding, this produces
455 * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
456 * I.e. always zero. With accumulation, this leaves D unchanged.
458 if (shift == (8 << vece)) {
459 /* Nop, but we do need to clear the tail. */
460 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
461 } else {
462 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
466 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
468 TCGv_i64 t = tcg_temp_new_i64();
470 tcg_gen_shri_i64(t, a, sh - 1);
471 tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
472 tcg_gen_vec_shr8i_i64(d, a, sh);
473 tcg_gen_vec_add8_i64(d, d, t);
476 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
478 TCGv_i64 t = tcg_temp_new_i64();
480 tcg_gen_shri_i64(t, a, sh - 1);
481 tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
482 tcg_gen_vec_shr16i_i64(d, a, sh);
483 tcg_gen_vec_add16_i64(d, d, t);
486 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
488 TCGv_i32 t;
490 /* Handle shift by the input size for the benefit of trans_URSHR_ri */
491 if (sh == 32) {
492 tcg_gen_extract_i32(d, a, sh - 1, 1);
493 return;
495 t = tcg_temp_new_i32();
496 tcg_gen_extract_i32(t, a, sh - 1, 1);
497 tcg_gen_shri_i32(d, a, sh);
498 tcg_gen_add_i32(d, d, t);
501 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
503 TCGv_i64 t = tcg_temp_new_i64();
505 tcg_gen_extract_i64(t, a, sh - 1, 1);
506 tcg_gen_shri_i64(d, a, sh);
507 tcg_gen_add_i64(d, d, t);
510 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
512 TCGv_vec t = tcg_temp_new_vec_matching(d);
513 TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
515 tcg_gen_shri_vec(vece, t, a, shift - 1);
516 tcg_gen_and_vec(vece, t, t, ones);
517 tcg_gen_shri_vec(vece, d, a, shift);
518 tcg_gen_add_vec(vece, d, d, t);
521 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
522 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
524 static const TCGOpcode vecop_list[] = {
525 INDEX_op_shri_vec, INDEX_op_add_vec, 0
527 static const GVecGen2i ops[4] = {
528 { .fni8 = gen_urshr8_i64,
529 .fniv = gen_urshr_vec,
530 .fno = gen_helper_gvec_urshr_b,
531 .opt_opc = vecop_list,
532 .vece = MO_8 },
533 { .fni8 = gen_urshr16_i64,
534 .fniv = gen_urshr_vec,
535 .fno = gen_helper_gvec_urshr_h,
536 .opt_opc = vecop_list,
537 .vece = MO_16 },
538 { .fni4 = gen_urshr32_i32,
539 .fniv = gen_urshr_vec,
540 .fno = gen_helper_gvec_urshr_s,
541 .opt_opc = vecop_list,
542 .vece = MO_32 },
543 { .fni8 = gen_urshr64_i64,
544 .fniv = gen_urshr_vec,
545 .fno = gen_helper_gvec_urshr_d,
546 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
547 .opt_opc = vecop_list,
548 .vece = MO_64 },
551 /* tszimm encoding produces immediates in the range [1..esize] */
552 tcg_debug_assert(shift > 0);
553 tcg_debug_assert(shift <= (8 << vece));
555 if (shift == (8 << vece)) {
557 * Shifts larger than the element size are architecturally valid.
558 * Unsigned results in zero. With rounding, this produces a
559 * copy of the most significant bit.
561 tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
562 } else {
563 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
567 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
569 TCGv_i64 t = tcg_temp_new_i64();
571 if (sh == 8) {
572 tcg_gen_vec_shr8i_i64(t, a, 7);
573 } else {
574 gen_urshr8_i64(t, a, sh);
576 tcg_gen_vec_add8_i64(d, d, t);
579 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
581 TCGv_i64 t = tcg_temp_new_i64();
583 if (sh == 16) {
584 tcg_gen_vec_shr16i_i64(t, a, 15);
585 } else {
586 gen_urshr16_i64(t, a, sh);
588 tcg_gen_vec_add16_i64(d, d, t);
591 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
593 TCGv_i32 t = tcg_temp_new_i32();
595 if (sh == 32) {
596 tcg_gen_shri_i32(t, a, 31);
597 } else {
598 gen_urshr32_i32(t, a, sh);
600 tcg_gen_add_i32(d, d, t);
603 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
605 TCGv_i64 t = tcg_temp_new_i64();
607 if (sh == 64) {
608 tcg_gen_shri_i64(t, a, 63);
609 } else {
610 gen_urshr64_i64(t, a, sh);
612 tcg_gen_add_i64(d, d, t);
615 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
617 TCGv_vec t = tcg_temp_new_vec_matching(d);
619 if (sh == (8 << vece)) {
620 tcg_gen_shri_vec(vece, t, a, sh - 1);
621 } else {
622 gen_urshr_vec(vece, t, a, sh);
624 tcg_gen_add_vec(vece, d, d, t);
627 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
628 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
630 static const TCGOpcode vecop_list[] = {
631 INDEX_op_shri_vec, INDEX_op_add_vec, 0
633 static const GVecGen2i ops[4] = {
634 { .fni8 = gen_ursra8_i64,
635 .fniv = gen_ursra_vec,
636 .fno = gen_helper_gvec_ursra_b,
637 .opt_opc = vecop_list,
638 .load_dest = true,
639 .vece = MO_8 },
640 { .fni8 = gen_ursra16_i64,
641 .fniv = gen_ursra_vec,
642 .fno = gen_helper_gvec_ursra_h,
643 .opt_opc = vecop_list,
644 .load_dest = true,
645 .vece = MO_16 },
646 { .fni4 = gen_ursra32_i32,
647 .fniv = gen_ursra_vec,
648 .fno = gen_helper_gvec_ursra_s,
649 .opt_opc = vecop_list,
650 .load_dest = true,
651 .vece = MO_32 },
652 { .fni8 = gen_ursra64_i64,
653 .fniv = gen_ursra_vec,
654 .fno = gen_helper_gvec_ursra_d,
655 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
656 .opt_opc = vecop_list,
657 .load_dest = true,
658 .vece = MO_64 },
661 /* tszimm encoding produces immediates in the range [1..esize] */
662 tcg_debug_assert(shift > 0);
663 tcg_debug_assert(shift <= (8 << vece));
665 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
668 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
670 uint64_t mask = dup_const(MO_8, 0xff >> shift);
671 TCGv_i64 t = tcg_temp_new_i64();
673 tcg_gen_shri_i64(t, a, shift);
674 tcg_gen_andi_i64(t, t, mask);
675 tcg_gen_andi_i64(d, d, ~mask);
676 tcg_gen_or_i64(d, d, t);
679 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
681 uint64_t mask = dup_const(MO_16, 0xffff >> shift);
682 TCGv_i64 t = tcg_temp_new_i64();
684 tcg_gen_shri_i64(t, a, shift);
685 tcg_gen_andi_i64(t, t, mask);
686 tcg_gen_andi_i64(d, d, ~mask);
687 tcg_gen_or_i64(d, d, t);
690 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
692 tcg_gen_shri_i32(a, a, shift);
693 tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
696 static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
698 tcg_gen_shri_i64(a, a, shift);
699 tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
702 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
704 TCGv_vec t = tcg_temp_new_vec_matching(d);
705 int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
706 TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
708 tcg_gen_shri_vec(vece, t, a, sh);
709 tcg_gen_and_vec(vece, d, d, m);
710 tcg_gen_or_vec(vece, d, d, t);
713 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
714 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
716 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
717 const GVecGen2i ops[4] = {
718 { .fni8 = gen_shr8_ins_i64,
719 .fniv = gen_shr_ins_vec,
720 .fno = gen_helper_gvec_sri_b,
721 .load_dest = true,
722 .opt_opc = vecop_list,
723 .vece = MO_8 },
724 { .fni8 = gen_shr16_ins_i64,
725 .fniv = gen_shr_ins_vec,
726 .fno = gen_helper_gvec_sri_h,
727 .load_dest = true,
728 .opt_opc = vecop_list,
729 .vece = MO_16 },
730 { .fni4 = gen_shr32_ins_i32,
731 .fniv = gen_shr_ins_vec,
732 .fno = gen_helper_gvec_sri_s,
733 .load_dest = true,
734 .opt_opc = vecop_list,
735 .vece = MO_32 },
736 { .fni8 = gen_shr64_ins_i64,
737 .fniv = gen_shr_ins_vec,
738 .fno = gen_helper_gvec_sri_d,
739 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
740 .load_dest = true,
741 .opt_opc = vecop_list,
742 .vece = MO_64 },
745 /* tszimm encoding produces immediates in the range [1..esize]. */
746 tcg_debug_assert(shift > 0);
747 tcg_debug_assert(shift <= (8 << vece));
749 /* Shift of esize leaves destination unchanged. */
750 if (shift < (8 << vece)) {
751 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
752 } else {
753 /* Nop, but we do need to clear the tail. */
754 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
758 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
760 uint64_t mask = dup_const(MO_8, 0xff << shift);
761 TCGv_i64 t = tcg_temp_new_i64();
763 tcg_gen_shli_i64(t, a, shift);
764 tcg_gen_andi_i64(t, t, mask);
765 tcg_gen_andi_i64(d, d, ~mask);
766 tcg_gen_or_i64(d, d, t);
769 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
771 uint64_t mask = dup_const(MO_16, 0xffff << shift);
772 TCGv_i64 t = tcg_temp_new_i64();
774 tcg_gen_shli_i64(t, a, shift);
775 tcg_gen_andi_i64(t, t, mask);
776 tcg_gen_andi_i64(d, d, ~mask);
777 tcg_gen_or_i64(d, d, t);
780 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
782 tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
785 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
787 tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
790 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
792 TCGv_vec t = tcg_temp_new_vec_matching(d);
793 TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
795 tcg_gen_shli_vec(vece, t, a, sh);
796 tcg_gen_and_vec(vece, d, d, m);
797 tcg_gen_or_vec(vece, d, d, t);
800 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
801 int64_t shift, uint32_t opr_sz, uint32_t max_sz)
803 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
804 const GVecGen2i ops[4] = {
805 { .fni8 = gen_shl8_ins_i64,
806 .fniv = gen_shl_ins_vec,
807 .fno = gen_helper_gvec_sli_b,
808 .load_dest = true,
809 .opt_opc = vecop_list,
810 .vece = MO_8 },
811 { .fni8 = gen_shl16_ins_i64,
812 .fniv = gen_shl_ins_vec,
813 .fno = gen_helper_gvec_sli_h,
814 .load_dest = true,
815 .opt_opc = vecop_list,
816 .vece = MO_16 },
817 { .fni4 = gen_shl32_ins_i32,
818 .fniv = gen_shl_ins_vec,
819 .fno = gen_helper_gvec_sli_s,
820 .load_dest = true,
821 .opt_opc = vecop_list,
822 .vece = MO_32 },
823 { .fni8 = gen_shl64_ins_i64,
824 .fniv = gen_shl_ins_vec,
825 .fno = gen_helper_gvec_sli_d,
826 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
827 .load_dest = true,
828 .opt_opc = vecop_list,
829 .vece = MO_64 },
832 /* tszimm encoding produces immediates in the range [0..esize-1]. */
833 tcg_debug_assert(shift >= 0);
834 tcg_debug_assert(shift < (8 << vece));
836 if (shift == 0) {
837 tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
838 } else {
839 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
843 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
845 gen_helper_neon_mul_u8(a, a, b);
846 gen_helper_neon_add_u8(d, d, a);
849 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
851 gen_helper_neon_mul_u8(a, a, b);
852 gen_helper_neon_sub_u8(d, d, a);
855 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
857 gen_helper_neon_mul_u16(a, a, b);
858 gen_helper_neon_add_u16(d, d, a);
861 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
863 gen_helper_neon_mul_u16(a, a, b);
864 gen_helper_neon_sub_u16(d, d, a);
867 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
869 tcg_gen_mul_i32(a, a, b);
870 tcg_gen_add_i32(d, d, a);
873 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
875 tcg_gen_mul_i32(a, a, b);
876 tcg_gen_sub_i32(d, d, a);
879 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
881 tcg_gen_mul_i64(a, a, b);
882 tcg_gen_add_i64(d, d, a);
885 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
887 tcg_gen_mul_i64(a, a, b);
888 tcg_gen_sub_i64(d, d, a);
891 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
893 tcg_gen_mul_vec(vece, a, a, b);
894 tcg_gen_add_vec(vece, d, d, a);
897 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
899 tcg_gen_mul_vec(vece, a, a, b);
900 tcg_gen_sub_vec(vece, d, d, a);
903 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
904 * these tables are shared with AArch64 which does support them.
906 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
907 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
909 static const TCGOpcode vecop_list[] = {
910 INDEX_op_mul_vec, INDEX_op_add_vec, 0
912 static const GVecGen3 ops[4] = {
913 { .fni4 = gen_mla8_i32,
914 .fniv = gen_mla_vec,
915 .load_dest = true,
916 .opt_opc = vecop_list,
917 .vece = MO_8 },
918 { .fni4 = gen_mla16_i32,
919 .fniv = gen_mla_vec,
920 .load_dest = true,
921 .opt_opc = vecop_list,
922 .vece = MO_16 },
923 { .fni4 = gen_mla32_i32,
924 .fniv = gen_mla_vec,
925 .load_dest = true,
926 .opt_opc = vecop_list,
927 .vece = MO_32 },
928 { .fni8 = gen_mla64_i64,
929 .fniv = gen_mla_vec,
930 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
931 .load_dest = true,
932 .opt_opc = vecop_list,
933 .vece = MO_64 },
935 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
938 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
939 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
941 static const TCGOpcode vecop_list[] = {
942 INDEX_op_mul_vec, INDEX_op_sub_vec, 0
944 static const GVecGen3 ops[4] = {
945 { .fni4 = gen_mls8_i32,
946 .fniv = gen_mls_vec,
947 .load_dest = true,
948 .opt_opc = vecop_list,
949 .vece = MO_8 },
950 { .fni4 = gen_mls16_i32,
951 .fniv = gen_mls_vec,
952 .load_dest = true,
953 .opt_opc = vecop_list,
954 .vece = MO_16 },
955 { .fni4 = gen_mls32_i32,
956 .fniv = gen_mls_vec,
957 .load_dest = true,
958 .opt_opc = vecop_list,
959 .vece = MO_32 },
960 { .fni8 = gen_mls64_i64,
961 .fniv = gen_mls_vec,
962 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
963 .load_dest = true,
964 .opt_opc = vecop_list,
965 .vece = MO_64 },
967 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
970 /* CMTST : test is "if (X & Y != 0)". */
971 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
973 tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
976 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
978 tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
981 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
983 tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
986 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
987 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
989 static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
990 static const GVecGen3 ops[4] = {
991 { .fni4 = gen_helper_neon_tst_u8,
992 .fniv = gen_cmtst_vec,
993 .opt_opc = vecop_list,
994 .vece = MO_8 },
995 { .fni4 = gen_helper_neon_tst_u16,
996 .fniv = gen_cmtst_vec,
997 .opt_opc = vecop_list,
998 .vece = MO_16 },
999 { .fni4 = gen_cmtst_i32,
1000 .fniv = gen_cmtst_vec,
1001 .opt_opc = vecop_list,
1002 .vece = MO_32 },
1003 { .fni8 = gen_cmtst_i64,
1004 .fniv = gen_cmtst_vec,
1005 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1006 .opt_opc = vecop_list,
1007 .vece = MO_64 },
1009 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1012 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1014 TCGv_i32 lval = tcg_temp_new_i32();
1015 TCGv_i32 rval = tcg_temp_new_i32();
1016 TCGv_i32 lsh = tcg_temp_new_i32();
1017 TCGv_i32 rsh = tcg_temp_new_i32();
1018 TCGv_i32 zero = tcg_constant_i32(0);
1019 TCGv_i32 max = tcg_constant_i32(32);
1022 * Rely on the TCG guarantee that out of range shifts produce
1023 * unspecified results, not undefined behaviour (i.e. no trap).
1024 * Discard out-of-range results after the fact.
1026 tcg_gen_ext8s_i32(lsh, shift);
1027 tcg_gen_neg_i32(rsh, lsh);
1028 tcg_gen_shl_i32(lval, src, lsh);
1029 tcg_gen_shr_i32(rval, src, rsh);
1030 tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
1031 tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
1034 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1036 TCGv_i64 lval = tcg_temp_new_i64();
1037 TCGv_i64 rval = tcg_temp_new_i64();
1038 TCGv_i64 lsh = tcg_temp_new_i64();
1039 TCGv_i64 rsh = tcg_temp_new_i64();
1040 TCGv_i64 zero = tcg_constant_i64(0);
1041 TCGv_i64 max = tcg_constant_i64(64);
1044 * Rely on the TCG guarantee that out of range shifts produce
1045 * unspecified results, not undefined behaviour (i.e. no trap).
1046 * Discard out-of-range results after the fact.
1048 tcg_gen_ext8s_i64(lsh, shift);
1049 tcg_gen_neg_i64(rsh, lsh);
1050 tcg_gen_shl_i64(lval, src, lsh);
1051 tcg_gen_shr_i64(rval, src, rsh);
1052 tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
1053 tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
1056 static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
1057 TCGv_vec src, TCGv_vec shift)
1059 TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1060 TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1061 TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1062 TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1063 TCGv_vec max, zero;
1065 tcg_gen_neg_vec(vece, rsh, shift);
1066 if (vece == MO_8) {
1067 tcg_gen_mov_vec(lsh, shift);
1068 } else {
1069 TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1070 tcg_gen_and_vec(vece, lsh, shift, msk);
1071 tcg_gen_and_vec(vece, rsh, rsh, msk);
1075 * Rely on the TCG guarantee that out of range shifts produce
1076 * unspecified results, not undefined behaviour (i.e. no trap).
1077 * Discard out-of-range results after the fact.
1079 tcg_gen_shlv_vec(vece, lval, src, lsh);
1080 tcg_gen_shrv_vec(vece, rval, src, rsh);
1083 * The choice of GE (signed) and GEU (unsigned) are biased toward
1084 * the instructions of the x86_64 host. For MO_8, the whole byte
1085 * is significant so we must use an unsigned compare; otherwise we
1086 * have already masked to a byte and so a signed compare works.
1087 * Other tcg hosts have a full set of comparisons and do not care.
1089 zero = tcg_constant_vec_matching(dst, vece, 0);
1090 max = tcg_constant_vec_matching(dst, vece, 8 << vece);
1091 if (vece == MO_8) {
1092 tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
1093 tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
1094 } else {
1095 tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
1096 tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
1098 tcg_gen_or_vec(vece, dst, lval, rval);
1101 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1102 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1104 static const TCGOpcode vecop_list[] = {
1105 INDEX_op_neg_vec, INDEX_op_shlv_vec,
1106 INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
1108 static const GVecGen3 ops[4] = {
1109 { .fniv = gen_ushl_vec,
1110 .fno = gen_helper_gvec_ushl_b,
1111 .opt_opc = vecop_list,
1112 .vece = MO_8 },
1113 { .fniv = gen_ushl_vec,
1114 .fno = gen_helper_gvec_ushl_h,
1115 .opt_opc = vecop_list,
1116 .vece = MO_16 },
1117 { .fni4 = gen_ushl_i32,
1118 .fniv = gen_ushl_vec,
1119 .opt_opc = vecop_list,
1120 .vece = MO_32 },
1121 { .fni8 = gen_ushl_i64,
1122 .fniv = gen_ushl_vec,
1123 .opt_opc = vecop_list,
1124 .vece = MO_64 },
1126 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1129 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
1131 TCGv_i32 lval = tcg_temp_new_i32();
1132 TCGv_i32 rval = tcg_temp_new_i32();
1133 TCGv_i32 lsh = tcg_temp_new_i32();
1134 TCGv_i32 rsh = tcg_temp_new_i32();
1135 TCGv_i32 zero = tcg_constant_i32(0);
1136 TCGv_i32 max = tcg_constant_i32(31);
1139 * Rely on the TCG guarantee that out of range shifts produce
1140 * unspecified results, not undefined behaviour (i.e. no trap).
1141 * Discard out-of-range results after the fact.
1143 tcg_gen_ext8s_i32(lsh, shift);
1144 tcg_gen_neg_i32(rsh, lsh);
1145 tcg_gen_shl_i32(lval, src, lsh);
1146 tcg_gen_umin_i32(rsh, rsh, max);
1147 tcg_gen_sar_i32(rval, src, rsh);
1148 tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
1149 tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
1152 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
1154 TCGv_i64 lval = tcg_temp_new_i64();
1155 TCGv_i64 rval = tcg_temp_new_i64();
1156 TCGv_i64 lsh = tcg_temp_new_i64();
1157 TCGv_i64 rsh = tcg_temp_new_i64();
1158 TCGv_i64 zero = tcg_constant_i64(0);
1159 TCGv_i64 max = tcg_constant_i64(63);
1162 * Rely on the TCG guarantee that out of range shifts produce
1163 * unspecified results, not undefined behaviour (i.e. no trap).
1164 * Discard out-of-range results after the fact.
1166 tcg_gen_ext8s_i64(lsh, shift);
1167 tcg_gen_neg_i64(rsh, lsh);
1168 tcg_gen_shl_i64(lval, src, lsh);
1169 tcg_gen_umin_i64(rsh, rsh, max);
1170 tcg_gen_sar_i64(rval, src, rsh);
1171 tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
1172 tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
1175 static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
1176 TCGv_vec src, TCGv_vec shift)
1178 TCGv_vec lval = tcg_temp_new_vec_matching(dst);
1179 TCGv_vec rval = tcg_temp_new_vec_matching(dst);
1180 TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
1181 TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
1182 TCGv_vec max, zero;
1185 * Rely on the TCG guarantee that out of range shifts produce
1186 * unspecified results, not undefined behaviour (i.e. no trap).
1187 * Discard out-of-range results after the fact.
1189 tcg_gen_neg_vec(vece, rsh, shift);
1190 if (vece == MO_8) {
1191 tcg_gen_mov_vec(lsh, shift);
1192 } else {
1193 TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
1194 tcg_gen_and_vec(vece, lsh, shift, msk);
1195 tcg_gen_and_vec(vece, rsh, rsh, msk);
1198 /* Bound rsh so out of bound right shift gets -1. */
1199 max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
1200 tcg_gen_umin_vec(vece, rsh, rsh, max);
1202 tcg_gen_shlv_vec(vece, lval, src, lsh);
1203 tcg_gen_sarv_vec(vece, rval, src, rsh);
1205 /* Select in-bound left shift. */
1206 zero = tcg_constant_vec_matching(dst, vece, 0);
1207 tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);
1209 /* Select between left and right shift. */
1210 if (vece == MO_8) {
1211 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
1212 } else {
1213 TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
1214 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
1218 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1219 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1221 static const TCGOpcode vecop_list[] = {
1222 INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
1223 INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
1225 static const GVecGen3 ops[4] = {
1226 { .fniv = gen_sshl_vec,
1227 .fno = gen_helper_gvec_sshl_b,
1228 .opt_opc = vecop_list,
1229 .vece = MO_8 },
1230 { .fniv = gen_sshl_vec,
1231 .fno = gen_helper_gvec_sshl_h,
1232 .opt_opc = vecop_list,
1233 .vece = MO_16 },
1234 { .fni4 = gen_sshl_i32,
1235 .fniv = gen_sshl_vec,
1236 .opt_opc = vecop_list,
1237 .vece = MO_32 },
1238 { .fni8 = gen_sshl_i64,
1239 .fniv = gen_sshl_vec,
1240 .opt_opc = vecop_list,
1241 .vece = MO_64 },
1243 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1246 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1247 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1249 static gen_helper_gvec_3 * const fns[] = {
1250 gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
1251 gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
1253 tcg_debug_assert(vece <= MO_64);
1254 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1257 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1258 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1260 static gen_helper_gvec_3 * const fns[] = {
1261 gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
1262 gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
1264 tcg_debug_assert(vece <= MO_64);
1265 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
1268 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1269 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1271 static gen_helper_gvec_3_ptr * const fns[] = {
1272 gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
1273 gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
1275 tcg_debug_assert(vece <= MO_64);
1276 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1277 opr_sz, max_sz, 0, fns[vece]);
1280 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1281 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1283 static gen_helper_gvec_3_ptr * const fns[] = {
1284 gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
1285 gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
1287 tcg_debug_assert(vece <= MO_64);
1288 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1289 opr_sz, max_sz, 0, fns[vece]);
1292 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1293 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1295 static gen_helper_gvec_3_ptr * const fns[] = {
1296 gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
1297 gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
1299 tcg_debug_assert(vece <= MO_64);
1300 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1301 opr_sz, max_sz, 0, fns[vece]);
1304 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1305 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1307 static gen_helper_gvec_3_ptr * const fns[] = {
1308 gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
1309 gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
1311 tcg_debug_assert(vece <= MO_64);
1312 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
1313 opr_sz, max_sz, 0, fns[vece]);
1316 void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1317 int64_t c, uint32_t opr_sz, uint32_t max_sz)
1319 static gen_helper_gvec_2_ptr * const fns[] = {
1320 gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
1321 gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
1323 tcg_debug_assert(vece <= MO_64);
1324 tcg_debug_assert(c >= 0 && c <= (8 << vece));
1325 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1328 void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1329 int64_t c, uint32_t opr_sz, uint32_t max_sz)
1331 static gen_helper_gvec_2_ptr * const fns[] = {
1332 gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
1333 gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
1335 tcg_debug_assert(vece <= MO_64);
1336 tcg_debug_assert(c >= 0 && c <= (8 << vece));
1337 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1340 void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1341 int64_t c, uint32_t opr_sz, uint32_t max_sz)
1343 static gen_helper_gvec_2_ptr * const fns[] = {
1344 gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
1345 gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
1347 tcg_debug_assert(vece <= MO_64);
1348 tcg_debug_assert(c >= 0 && c <= (8 << vece));
1349 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
1352 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1354 uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
1355 TCGv_i64 tmp = tcg_temp_new_i64();
1357 tcg_gen_add_i64(tmp, a, b);
1358 tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
1359 tcg_gen_xor_i64(tmp, tmp, res);
1360 tcg_gen_or_i64(qc, qc, tmp);
1363 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1365 TCGv_i64 t = tcg_temp_new_i64();
1367 tcg_gen_add_i64(t, a, b);
1368 tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
1369 tcg_constant_i64(UINT64_MAX), t);
1370 tcg_gen_xor_i64(t, t, res);
1371 tcg_gen_or_i64(qc, qc, t);
1374 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1375 TCGv_vec a, TCGv_vec b)
1377 TCGv_vec x = tcg_temp_new_vec_matching(t);
1378 tcg_gen_add_vec(vece, x, a, b);
1379 tcg_gen_usadd_vec(vece, t, a, b);
1380 tcg_gen_xor_vec(vece, x, x, t);
1381 tcg_gen_or_vec(vece, qc, qc, x);
1384 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1385 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1387 static const TCGOpcode vecop_list[] = {
1388 INDEX_op_usadd_vec, INDEX_op_add_vec, 0
1390 static const GVecGen4 ops[4] = {
1391 { .fniv = gen_uqadd_vec,
1392 .fno = gen_helper_gvec_uqadd_b,
1393 .write_aofs = true,
1394 .opt_opc = vecop_list,
1395 .vece = MO_8 },
1396 { .fniv = gen_uqadd_vec,
1397 .fno = gen_helper_gvec_uqadd_h,
1398 .write_aofs = true,
1399 .opt_opc = vecop_list,
1400 .vece = MO_16 },
1401 { .fniv = gen_uqadd_vec,
1402 .fno = gen_helper_gvec_uqadd_s,
1403 .write_aofs = true,
1404 .opt_opc = vecop_list,
1405 .vece = MO_32 },
1406 { .fniv = gen_uqadd_vec,
1407 .fni8 = gen_uqadd_d,
1408 .fno = gen_helper_gvec_uqadd_d,
1409 .write_aofs = true,
1410 .opt_opc = vecop_list,
1411 .vece = MO_64 },
1414 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1415 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1416 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1419 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1421 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1422 int64_t min = -1ll - max;
1423 TCGv_i64 tmp = tcg_temp_new_i64();
1425 tcg_gen_add_i64(tmp, a, b);
1426 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1427 tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1428 tcg_gen_xor_i64(tmp, tmp, res);
1429 tcg_gen_or_i64(qc, qc, tmp);
1432 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1434 TCGv_i64 t0 = tcg_temp_new_i64();
1435 TCGv_i64 t1 = tcg_temp_new_i64();
1436 TCGv_i64 t2 = tcg_temp_new_i64();
1438 tcg_gen_add_i64(t0, a, b);
1440 /* Compute signed overflow indication into T1 */
1441 tcg_gen_xor_i64(t1, a, b);
1442 tcg_gen_xor_i64(t2, t0, a);
1443 tcg_gen_andc_i64(t1, t2, t1);
1445 /* Compute saturated value into T2 */
1446 tcg_gen_sari_i64(t2, a, 63);
1447 tcg_gen_xori_i64(t2, t2, INT64_MAX);
1449 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1450 tcg_gen_xor_i64(t0, t0, res);
1451 tcg_gen_or_i64(qc, qc, t0);
1454 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1455 TCGv_vec a, TCGv_vec b)
1457 TCGv_vec x = tcg_temp_new_vec_matching(t);
1458 tcg_gen_add_vec(vece, x, a, b);
1459 tcg_gen_ssadd_vec(vece, t, a, b);
1460 tcg_gen_xor_vec(vece, x, x, t);
1461 tcg_gen_or_vec(vece, qc, qc, x);
1464 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1465 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1467 static const TCGOpcode vecop_list[] = {
1468 INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
1470 static const GVecGen4 ops[4] = {
1471 { .fniv = gen_sqadd_vec,
1472 .fno = gen_helper_gvec_sqadd_b,
1473 .opt_opc = vecop_list,
1474 .write_aofs = true,
1475 .vece = MO_8 },
1476 { .fniv = gen_sqadd_vec,
1477 .fno = gen_helper_gvec_sqadd_h,
1478 .opt_opc = vecop_list,
1479 .write_aofs = true,
1480 .vece = MO_16 },
1481 { .fniv = gen_sqadd_vec,
1482 .fno = gen_helper_gvec_sqadd_s,
1483 .opt_opc = vecop_list,
1484 .write_aofs = true,
1485 .vece = MO_32 },
1486 { .fniv = gen_sqadd_vec,
1487 .fni8 = gen_sqadd_d,
1488 .fno = gen_helper_gvec_sqadd_d,
1489 .opt_opc = vecop_list,
1490 .write_aofs = true,
1491 .vece = MO_64 },
1494 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1495 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1496 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1499 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1501 TCGv_i64 tmp = tcg_temp_new_i64();
1503 tcg_gen_sub_i64(tmp, a, b);
1504 tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
1505 tcg_gen_xor_i64(tmp, tmp, res);
1506 tcg_gen_or_i64(qc, qc, tmp);
1509 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1511 TCGv_i64 t = tcg_temp_new_i64();
1513 tcg_gen_sub_i64(t, a, b);
1514 tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
1515 tcg_gen_xor_i64(t, t, res);
1516 tcg_gen_or_i64(qc, qc, t);
1519 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1520 TCGv_vec a, TCGv_vec b)
1522 TCGv_vec x = tcg_temp_new_vec_matching(t);
1523 tcg_gen_sub_vec(vece, x, a, b);
1524 tcg_gen_ussub_vec(vece, t, a, b);
1525 tcg_gen_xor_vec(vece, x, x, t);
1526 tcg_gen_or_vec(vece, qc, qc, x);
1529 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1530 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1532 static const TCGOpcode vecop_list[] = {
1533 INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
1535 static const GVecGen4 ops[4] = {
1536 { .fniv = gen_uqsub_vec,
1537 .fno = gen_helper_gvec_uqsub_b,
1538 .opt_opc = vecop_list,
1539 .write_aofs = true,
1540 .vece = MO_8 },
1541 { .fniv = gen_uqsub_vec,
1542 .fno = gen_helper_gvec_uqsub_h,
1543 .opt_opc = vecop_list,
1544 .write_aofs = true,
1545 .vece = MO_16 },
1546 { .fniv = gen_uqsub_vec,
1547 .fno = gen_helper_gvec_uqsub_s,
1548 .opt_opc = vecop_list,
1549 .write_aofs = true,
1550 .vece = MO_32 },
1551 { .fniv = gen_uqsub_vec,
1552 .fni8 = gen_uqsub_d,
1553 .fno = gen_helper_gvec_uqsub_d,
1554 .opt_opc = vecop_list,
1555 .write_aofs = true,
1556 .vece = MO_64 },
1559 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1560 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1561 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1564 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
1566 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
1567 int64_t min = -1ll - max;
1568 TCGv_i64 tmp = tcg_temp_new_i64();
1570 tcg_gen_sub_i64(tmp, a, b);
1571 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
1572 tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
1573 tcg_gen_xor_i64(tmp, tmp, res);
1574 tcg_gen_or_i64(qc, qc, tmp);
1577 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
1579 TCGv_i64 t0 = tcg_temp_new_i64();
1580 TCGv_i64 t1 = tcg_temp_new_i64();
1581 TCGv_i64 t2 = tcg_temp_new_i64();
1583 tcg_gen_sub_i64(t0, a, b);
1585 /* Compute signed overflow indication into T1 */
1586 tcg_gen_xor_i64(t1, a, b);
1587 tcg_gen_xor_i64(t2, t0, a);
1588 tcg_gen_and_i64(t1, t1, t2);
1590 /* Compute saturated value into T2 */
1591 tcg_gen_sari_i64(t2, a, 63);
1592 tcg_gen_xori_i64(t2, t2, INT64_MAX);
1594 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
1595 tcg_gen_xor_i64(t0, t0, res);
1596 tcg_gen_or_i64(qc, qc, t0);
1599 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
1600 TCGv_vec a, TCGv_vec b)
1602 TCGv_vec x = tcg_temp_new_vec_matching(t);
1603 tcg_gen_sub_vec(vece, x, a, b);
1604 tcg_gen_sssub_vec(vece, t, a, b);
1605 tcg_gen_xor_vec(vece, x, x, t);
1606 tcg_gen_or_vec(vece, qc, qc, x);
1609 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1610 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1612 static const TCGOpcode vecop_list[] = {
1613 INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
1615 static const GVecGen4 ops[4] = {
1616 { .fniv = gen_sqsub_vec,
1617 .fno = gen_helper_gvec_sqsub_b,
1618 .opt_opc = vecop_list,
1619 .write_aofs = true,
1620 .vece = MO_8 },
1621 { .fniv = gen_sqsub_vec,
1622 .fno = gen_helper_gvec_sqsub_h,
1623 .opt_opc = vecop_list,
1624 .write_aofs = true,
1625 .vece = MO_16 },
1626 { .fniv = gen_sqsub_vec,
1627 .fno = gen_helper_gvec_sqsub_s,
1628 .opt_opc = vecop_list,
1629 .write_aofs = true,
1630 .vece = MO_32 },
1631 { .fniv = gen_sqsub_vec,
1632 .fni8 = gen_sqsub_d,
1633 .fno = gen_helper_gvec_sqsub_d,
1634 .opt_opc = vecop_list,
1635 .write_aofs = true,
1636 .vece = MO_64 },
1639 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
1640 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
1641 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1644 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1646 TCGv_i32 t = tcg_temp_new_i32();
1648 tcg_gen_sub_i32(t, a, b);
1649 tcg_gen_sub_i32(d, b, a);
1650 tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
1653 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1655 TCGv_i64 t = tcg_temp_new_i64();
1657 tcg_gen_sub_i64(t, a, b);
1658 tcg_gen_sub_i64(d, b, a);
1659 tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
1662 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1664 TCGv_vec t = tcg_temp_new_vec_matching(d);
1666 tcg_gen_smin_vec(vece, t, a, b);
1667 tcg_gen_smax_vec(vece, d, a, b);
1668 tcg_gen_sub_vec(vece, d, d, t);
1671 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1672 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1674 static const TCGOpcode vecop_list[] = {
1675 INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1677 static const GVecGen3 ops[4] = {
1678 { .fniv = gen_sabd_vec,
1679 .fno = gen_helper_gvec_sabd_b,
1680 .opt_opc = vecop_list,
1681 .vece = MO_8 },
1682 { .fniv = gen_sabd_vec,
1683 .fno = gen_helper_gvec_sabd_h,
1684 .opt_opc = vecop_list,
1685 .vece = MO_16 },
1686 { .fni4 = gen_sabd_i32,
1687 .fniv = gen_sabd_vec,
1688 .fno = gen_helper_gvec_sabd_s,
1689 .opt_opc = vecop_list,
1690 .vece = MO_32 },
1691 { .fni8 = gen_sabd_i64,
1692 .fniv = gen_sabd_vec,
1693 .fno = gen_helper_gvec_sabd_d,
1694 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1695 .opt_opc = vecop_list,
1696 .vece = MO_64 },
1698 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1701 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1703 TCGv_i32 t = tcg_temp_new_i32();
1705 tcg_gen_sub_i32(t, a, b);
1706 tcg_gen_sub_i32(d, b, a);
1707 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
1710 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1712 TCGv_i64 t = tcg_temp_new_i64();
1714 tcg_gen_sub_i64(t, a, b);
1715 tcg_gen_sub_i64(d, b, a);
1716 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
1719 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1721 TCGv_vec t = tcg_temp_new_vec_matching(d);
1723 tcg_gen_umin_vec(vece, t, a, b);
1724 tcg_gen_umax_vec(vece, d, a, b);
1725 tcg_gen_sub_vec(vece, d, d, t);
1728 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1729 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1731 static const TCGOpcode vecop_list[] = {
1732 INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1734 static const GVecGen3 ops[4] = {
1735 { .fniv = gen_uabd_vec,
1736 .fno = gen_helper_gvec_uabd_b,
1737 .opt_opc = vecop_list,
1738 .vece = MO_8 },
1739 { .fniv = gen_uabd_vec,
1740 .fno = gen_helper_gvec_uabd_h,
1741 .opt_opc = vecop_list,
1742 .vece = MO_16 },
1743 { .fni4 = gen_uabd_i32,
1744 .fniv = gen_uabd_vec,
1745 .fno = gen_helper_gvec_uabd_s,
1746 .opt_opc = vecop_list,
1747 .vece = MO_32 },
1748 { .fni8 = gen_uabd_i64,
1749 .fniv = gen_uabd_vec,
1750 .fno = gen_helper_gvec_uabd_d,
1751 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1752 .opt_opc = vecop_list,
1753 .vece = MO_64 },
1755 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1758 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1760 TCGv_i32 t = tcg_temp_new_i32();
1761 gen_sabd_i32(t, a, b);
1762 tcg_gen_add_i32(d, d, t);
1765 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1767 TCGv_i64 t = tcg_temp_new_i64();
1768 gen_sabd_i64(t, a, b);
1769 tcg_gen_add_i64(d, d, t);
1772 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1774 TCGv_vec t = tcg_temp_new_vec_matching(d);
1775 gen_sabd_vec(vece, t, a, b);
1776 tcg_gen_add_vec(vece, d, d, t);
1779 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1780 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1782 static const TCGOpcode vecop_list[] = {
1783 INDEX_op_sub_vec, INDEX_op_add_vec,
1784 INDEX_op_smin_vec, INDEX_op_smax_vec, 0
1786 static const GVecGen3 ops[4] = {
1787 { .fniv = gen_saba_vec,
1788 .fno = gen_helper_gvec_saba_b,
1789 .opt_opc = vecop_list,
1790 .load_dest = true,
1791 .vece = MO_8 },
1792 { .fniv = gen_saba_vec,
1793 .fno = gen_helper_gvec_saba_h,
1794 .opt_opc = vecop_list,
1795 .load_dest = true,
1796 .vece = MO_16 },
1797 { .fni4 = gen_saba_i32,
1798 .fniv = gen_saba_vec,
1799 .fno = gen_helper_gvec_saba_s,
1800 .opt_opc = vecop_list,
1801 .load_dest = true,
1802 .vece = MO_32 },
1803 { .fni8 = gen_saba_i64,
1804 .fniv = gen_saba_vec,
1805 .fno = gen_helper_gvec_saba_d,
1806 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1807 .opt_opc = vecop_list,
1808 .load_dest = true,
1809 .vece = MO_64 },
1811 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
1814 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1816 TCGv_i32 t = tcg_temp_new_i32();
1817 gen_uabd_i32(t, a, b);
1818 tcg_gen_add_i32(d, d, t);
1821 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1823 TCGv_i64 t = tcg_temp_new_i64();
1824 gen_uabd_i64(t, a, b);
1825 tcg_gen_add_i64(d, d, t);
1828 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
1830 TCGv_vec t = tcg_temp_new_vec_matching(d);
1831 gen_uabd_vec(vece, t, a, b);
1832 tcg_gen_add_vec(vece, d, d, t);
1835 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
1836 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
1838 static const TCGOpcode vecop_list[] = {
1839 INDEX_op_sub_vec, INDEX_op_add_vec,
1840 INDEX_op_umin_vec, INDEX_op_umax_vec, 0
1842 static const GVecGen3 ops[4] = {
1843 { .fniv = gen_uaba_vec,
1844 .fno = gen_helper_gvec_uaba_b,
1845 .opt_opc = vecop_list,
1846 .load_dest = true,
1847 .vece = MO_8 },
1848 { .fniv = gen_uaba_vec,
1849 .fno = gen_helper_gvec_uaba_h,
1850 .opt_opc = vecop_list,
1851 .load_dest = true,
1852 .vece = MO_16 },
1853 { .fni4 = gen_uaba_i32,
1854 .fniv = gen_uaba_vec,
1855 .fno = gen_helper_gvec_uaba_s,
1856 .opt_opc = vecop_list,
1857 .load_dest = true,
1858 .vece = MO_32 },
1859 { .fni8 = gen_uaba_i64,
1860 .fniv = gen_uaba_vec,
1861 .fno = gen_helper_gvec_uaba_d,
1862 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1863 .opt_opc = vecop_list,
1864 .load_dest = true,
1865 .vece = MO_64 },
1867 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
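
/*
 * Signed halving add: d = (a + b) >> 1 per element, computed without
 * widening as (a >> 1) + (b >> 1) + (a & b & 1), using arithmetic shifts.
 */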
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
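
/*
 * Unsigned halving add: d = (a + b) >> 1 per element, computed without
 * widening as (a >> 1) + (b >> 1) + (a & b & 1), using logical shifts.
 */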
static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
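
/*
 * Signed halving subtract: d = (a - b) >> 1 per element, computed without
 * widening as (a >> 1) - (b >> 1) - (~a & b & 1), using arithmetic shifts.
 */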
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
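
/*
 * Unsigned halving subtract: d = (a - b) >> 1 per element, computed without
 * widening as (a >> 1) - (b >> 1) - (~a & b & 1), using logical shifts.
 */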
static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
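
/*
 * Signed rounding halving add: d = (a + b + 1) >> 1 per element, computed
 * without widening as (a >> 1) + (b >> 1) + ((a | b) & 1), using
 * arithmetic shifts.
 */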
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
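
/*
 * Unsigned rounding halving add: d = (a + b + 1) >> 1 per element, computed
 * without widening as (a >> 1) + (b >> 1) + ((a | b) & 1), using
 * logical shifts.
 */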
static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}