target/arm/translate-neon.inc.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 /*
  24  * This file is intended to be included from translate.c; it uses
  25  * some macros and definitions provided by that file.
  26  * It might be possible to convert it to a standalone .c file eventually.
  27  */
  28
  29 static inline int plus1(DisasContext *s, int x)
  30 {
  31     return x + 1;
  32 }
  33
  34 static inline int rsub_64(DisasContext *s, int x)
  35 {
  36     return 64 - x;
  37 }
  38
  39 static inline int rsub_32(DisasContext *s, int x)
  40 {
  41     return 32 - x;
  42 }
  43 static inline int rsub_16(DisasContext *s, int x)
  44 {
  45     return 16 - x;
  46 }
  47 static inline int rsub_8(DisasContext *s, int x)
  48 {
  49     return 8 - x;
  50 }
  51
  52 /* Include the generated Neon decoder */
  53 #include "decode-neon-dp.inc.c"
  54 #include "decode-neon-ls.inc.c"
  55 #include "decode-neon-shared.inc.c"
  56
  57 /* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
  58  * where 0 is the least significant end of the register.
  59  */
  60 static inline long
  61 neon_element_offset(int reg, int element, MemOp size)
  62 {
  63     int element_size = 1 << size;
  64     int ofs = element * element_size;
  65 #ifdef HOST_WORDS_BIGENDIAN
  66     /* Calculate the offset assuming fully little-endian,
  67      * then XOR to account for the order of the 8-byte units.
  68      */
  69     if (element_size < 8) {
  70         ofs ^= 8 - element_size;
  71     }
  72 #endif
  73     return neon_reg_offset(reg, 0) + ofs;
  74 }
  75
  76 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  77 {
  78     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  79
  80     switch (mop) {
  81     case MO_UB:
  82         tcg_gen_ld8u_i32(var, cpu_env, offset);
  83         break;
  84     case MO_UW:
  85         tcg_gen_ld16u_i32(var, cpu_env, offset);
  86         break;
  87     case MO_UL:
  88         tcg_gen_ld_i32(var, cpu_env, offset);
  89         break;
  90     default:
  91         g_assert_not_reached();
  92     }
  93 }
  94
  95 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  96 {
  97     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  98
  99     switch (mop) {
 100     case MO_UB:
 101         tcg_gen_ld8u_i64(var, cpu_env, offset);
 102         break;
 103     case MO_UW:
 104         tcg_gen_ld16u_i64(var, cpu_env, offset);
 105         break;
 106     case MO_UL:
 107         tcg_gen_ld32u_i64(var, cpu_env, offset);
 108         break;
 109     case MO_Q:
 110         tcg_gen_ld_i64(var, cpu_env, offset);
 111         break;
 112     default:
 113         g_assert_not_reached();
 114     }
 115 }
 116
 117 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
 118 {
 119     long offset = neon_element_offset(reg, ele, size);
 120
 121     switch (size) {
 122     case MO_8:
 123         tcg_gen_st8_i32(var, cpu_env, offset);
 124         break;
 125     case MO_16:
 126         tcg_gen_st16_i32(var, cpu_env, offset);
 127         break;
 128     case MO_32:
 129         tcg_gen_st_i32(var, cpu_env, offset);
 130         break;
 131     default:
 132         g_assert_not_reached();
 133     }
 134 }
 135
 136 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 137 {
 138     long offset = neon_element_offset(reg, ele, size);
 139
 140     switch (size) {
 141     case MO_8:
 142         tcg_gen_st8_i64(var, cpu_env, offset);
 143         break;
 144     case MO_16:
 145         tcg_gen_st16_i64(var, cpu_env, offset);
 146         break;
 147     case MO_32:
 148         tcg_gen_st32_i64(var, cpu_env, offset);
 149         break;
 150     case MO_64:
 151         tcg_gen_st_i64(var, cpu_env, offset);
 152         break;
 153     default:
 154         g_assert_not_reached();
 155     }
 156 }
 157
 158 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 159 {
 160     int opr_sz;
 161     TCGv_ptr fpst;
 162     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 163
 164     if (!dc_isar_feature(aa32_vcma, s)
 165         || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
 166         return false;
 167     }
 168
 169     /* UNDEF accesses to D16-D31 if they don't exist. */
 170     if (!dc_isar_feature(aa32_simd_r32, s) &&
 171         ((a->vd | a->vn | a->vm) & 0x10)) {
 172         return false;
 173     }
 174
 175     if ((a->vn | a->vm | a->vd) & a->q) {
 176         return false;
 177     }
 178
 179     if (!vfp_access_check(s)) {
 180         return true;
 181     }
 182
 183     opr_sz = (1 + a->q) * 8;
 184     fpst = get_fpstatus_ptr(1);
 185     fn_gvec_ptr = a->size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
 186     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 187                        vfp_reg_offset(1, a->vn),
 188                        vfp_reg_offset(1, a->vm),
 189                        fpst, opr_sz, opr_sz, a->rot,
 190                        fn_gvec_ptr);
 191     tcg_temp_free_ptr(fpst);
 192     return true;
 193 }
 194
 195 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 196 {
 197     int opr_sz;
 198     TCGv_ptr fpst;
 199     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 200
 201     if (!dc_isar_feature(aa32_vcma, s)
 202         || (!a->size && !dc_isar_feature(aa32_fp16_arith, s))) {
 203         return false;
 204     }
 205
 206     /* UNDEF accesses to D16-D31 if they don't exist. */
 207     if (!dc_isar_feature(aa32_simd_r32, s) &&
 208         ((a->vd | a->vn | a->vm) & 0x10)) {
 209         return false;
 210     }
 211
 212     if ((a->vn | a->vm | a->vd) & a->q) {
 213         return false;
 214     }
 215
 216     if (!vfp_access_check(s)) {
 217         return true;
 218     }
 219
 220     opr_sz = (1 + a->q) * 8;
 221     fpst = get_fpstatus_ptr(1);
 222     fn_gvec_ptr = a->size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
 223     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 224                        vfp_reg_offset(1, a->vn),
 225                        vfp_reg_offset(1, a->vm),
 226                        fpst, opr_sz, opr_sz, a->rot,
 227                        fn_gvec_ptr);
 228     tcg_temp_free_ptr(fpst);
 229     return true;
 230 }
 231
 232 static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
 233 {
 234     int opr_sz;
 235     gen_helper_gvec_3 *fn_gvec;
 236
 237     if (!dc_isar_feature(aa32_dp, s)) {
 238         return false;
 239     }
 240
 241     /* UNDEF accesses to D16-D31 if they don't exist. */
 242     if (!dc_isar_feature(aa32_simd_r32, s) &&
 243         ((a->vd | a->vn | a->vm) & 0x10)) {
 244         return false;
 245     }
 246
 247     if ((a->vn | a->vm | a->vd) & a->q) {
 248         return false;
 249     }
 250
 251     if (!vfp_access_check(s)) {
 252         return true;
 253     }
 254
 255     opr_sz = (1 + a->q) * 8;
 256     fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
 257     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
 258                        vfp_reg_offset(1, a->vn),
 259                        vfp_reg_offset(1, a->vm),
 260                        opr_sz, opr_sz, 0, fn_gvec);
 261     return true;
 262 }
 263
 264 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 265 {
 266     int opr_sz;
 267
 268     if (!dc_isar_feature(aa32_fhm, s)) {
 269         return false;
 270     }
 271
 272     /* UNDEF accesses to D16-D31 if they don't exist. */
 273     if (!dc_isar_feature(aa32_simd_r32, s) &&
 274         (a->vd & 0x10)) {
 275         return false;
 276     }
 277
 278     if (a->vd & a->q) {
 279         return false;
 280     }
 281
 282     if (!vfp_access_check(s)) {
 283         return true;
 284     }
 285
 286     opr_sz = (1 + a->q) * 8;
 287     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 288                        vfp_reg_offset(a->q, a->vn),
 289                        vfp_reg_offset(a->q, a->vm),
 290                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 291                        gen_helper_gvec_fmlal_a32);
 292     return true;
 293 }
 294
 295 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 296 {
 297     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 298     int opr_sz;
 299     TCGv_ptr fpst;
 300
 301     if (!dc_isar_feature(aa32_vcma, s)) {
 302         return false;
 303     }
 304     if (a->size == 0 && !dc_isar_feature(aa32_fp16_arith, s)) {
 305         return false;
 306     }
 307
 308     /* UNDEF accesses to D16-D31 if they don't exist. */
 309     if (!dc_isar_feature(aa32_simd_r32, s) &&
 310         ((a->vd | a->vn | a->vm) & 0x10)) {
 311         return false;
 312     }
 313
 314     if ((a->vd | a->vn) & a->q) {
 315         return false;
 316     }
 317
 318     if (!vfp_access_check(s)) {
 319         return true;
 320     }
 321
 322     fn_gvec_ptr = (a->size ? gen_helper_gvec_fcmlas_idx
 323                    : gen_helper_gvec_fcmlah_idx);
 324     opr_sz = (1 + a->q) * 8;
 325     fpst = get_fpstatus_ptr(1);
 326     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 327                        vfp_reg_offset(1, a->vn),
 328                        vfp_reg_offset(1, a->vm),
 329                        fpst, opr_sz, opr_sz,
 330                        (a->index << 2) | a->rot, fn_gvec_ptr);
 331     tcg_temp_free_ptr(fpst);
 332     return true;
 333 }
 334
 335 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
 336 {
 337     gen_helper_gvec_3 *fn_gvec;
 338     int opr_sz;
 339     TCGv_ptr fpst;
 340
 341     if (!dc_isar_feature(aa32_dp, s)) {
 342         return false;
 343     }
 344
 345     /* UNDEF accesses to D16-D31 if they don't exist. */
 346     if (!dc_isar_feature(aa32_simd_r32, s) &&
 347         ((a->vd | a->vn) & 0x10)) {
 348         return false;
 349     }
 350
 351     if ((a->vd | a->vn) & a->q) {
 352         return false;
 353     }
 354
 355     if (!vfp_access_check(s)) {
 356         return true;
 357     }
 358
 359     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
 360     opr_sz = (1 + a->q) * 8;
 361     fpst = get_fpstatus_ptr(1);
 362     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
 363                        vfp_reg_offset(1, a->vn),
 364                        vfp_reg_offset(1, a->rm),
 365                        opr_sz, opr_sz, a->index, fn_gvec);
 366     tcg_temp_free_ptr(fpst);
 367     return true;
 368 }
 369
 370 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 371 {
 372     int opr_sz;
 373
 374     if (!dc_isar_feature(aa32_fhm, s)) {
 375         return false;
 376     }
 377
 378     /* UNDEF accesses to D16-D31 if they don't exist. */
 379     if (!dc_isar_feature(aa32_simd_r32, s) &&
 380         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 381         return false;
 382     }
 383
 384     if (a->vd & a->q) {
 385         return false;
 386     }
 387
 388     if (!vfp_access_check(s)) {
 389         return true;
 390     }
 391
 392     opr_sz = (1 + a->q) * 8;
 393     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 394                        vfp_reg_offset(a->q, a->vn),
 395                        vfp_reg_offset(a->q, a->rm),
 396                        cpu_env, opr_sz, opr_sz,
 397                        (a->index << 2) | a->s, /* is_2 == 0 */
 398                        gen_helper_gvec_fmlal_idx_a32);
 399     return true;
 400 }
 401
 402 static struct {
 403     int nregs;
 404     int interleave;
 405     int spacing;
 406 } const neon_ls_element_type[11] = {
 407     {1, 4, 1},
 408     {1, 4, 2},
 409     {4, 1, 1},
 410     {2, 2, 2},
 411     {1, 3, 1},
 412     {1, 3, 2},
 413     {3, 1, 1},
 414     {1, 1, 1},
 415     {1, 2, 1},
 416     {1, 2, 2},
 417     {2, 1, 1}
 418 };
 419
 420 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 421                                       int stride)
 422 {
 423     if (rm != 15) {
 424         TCGv_i32 base;
 425
 426         base = load_reg(s, rn);
 427         if (rm == 13) {
 428             tcg_gen_addi_i32(base, base, stride);
 429         } else {
 430             TCGv_i32 index;
 431             index = load_reg(s, rm);
 432             tcg_gen_add_i32(base, base, index);
 433             tcg_temp_free_i32(index);
 434         }
 435         store_reg(s, rn, base);
 436     }
 437 }
 438
 439 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 440 {
 441     /* Neon load/store multiple structures */
 442     int nregs, interleave, spacing, reg, n;
 443     MemOp endian = s->be_data;
 444     int mmu_idx = get_mem_index(s);
 445     int size = a->size;
 446     TCGv_i64 tmp64;
 447     TCGv_i32 addr, tmp;
 448
 449     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 450         return false;
 451     }
 452
 453     /* UNDEF accesses to D16-D31 if they don't exist */
 454     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 455         return false;
 456     }
 457     if (a->itype > 10) {
 458         return false;
 459     }
 460     /* Catch UNDEF cases for bad values of align field */
 461     switch (a->itype & 0xc) {
 462     case 4:
 463         if (a->align >= 2) {
 464             return false;
 465         }
 466         break;
 467     case 8:
 468         if (a->align == 3) {
 469             return false;
 470         }
 471         break;
 472     default:
 473         break;
 474     }
 475     nregs = neon_ls_element_type[a->itype].nregs;
 476     interleave = neon_ls_element_type[a->itype].interleave;
 477     spacing = neon_ls_element_type[a->itype].spacing;
 478     if (size == 3 && (interleave | spacing) != 1) {
 479         return false;
 480     }
 481
 482     if (!vfp_access_check(s)) {
 483         return true;
 484     }
 485
 486     /* For our purposes, bytes are always little-endian.  */
 487     if (size == 0) {
 488         endian = MO_LE;
 489     }
 490     /*
 491      * Consecutive little-endian elements from a single register
 492      * can be promoted to a larger little-endian operation.
 493      */
 494     if (interleave == 1 && endian == MO_LE) {
 495         size = 3;
 496     }
 497     tmp64 = tcg_temp_new_i64();
 498     addr = tcg_temp_new_i32();
 499     tmp = tcg_const_i32(1 << size);
 500     load_reg_var(s, addr, a->rn);
 501     for (reg = 0; reg < nregs; reg++) {
 502         for (n = 0; n < 8 >> size; n++) {
 503             int xs;
 504             for (xs = 0; xs < interleave; xs++) {
 505                 int tt = a->vd + reg + spacing * xs;
 506
 507                 if (a->l) {
 508                     gen_aa32_ld_i64(s, tmp64, addr, mmu_idx, endian | size);
 509                     neon_store_element64(tt, n, size, tmp64);
 510                 } else {
 511                     neon_load_element64(tmp64, tt, n, size);
 512                     gen_aa32_st_i64(s, tmp64, addr, mmu_idx, endian | size);
 513                 }
 514                 tcg_gen_add_i32(addr, addr, tmp);
 515             }
 516         }
 517     }
 518     tcg_temp_free_i32(addr);
 519     tcg_temp_free_i32(tmp);
 520     tcg_temp_free_i64(tmp64);
 521
 522     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 523     return true;
 524 }
 525
 526 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 527 {
 528     /* Neon load single structure to all lanes */
 529     int reg, stride, vec_size;
 530     int vd = a->vd;
 531     int size = a->size;
 532     int nregs = a->n + 1;
 533     TCGv_i32 addr, tmp;
 534
 535     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 536         return false;
 537     }
 538
 539     /* UNDEF accesses to D16-D31 if they don't exist */
 540     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 541         return false;
 542     }
 543
 544     if (size == 3) {
 545         if (nregs != 4 || a->a == 0) {
 546             return false;
 547         }
 548         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 549         size = 2;
 550     }
 551     if (nregs == 1 && a->a == 1 && size == 0) {
 552         return false;
 553     }
 554     if (nregs == 3 && a->a == 1) {
 555         return false;
 556     }
 557
 558     if (!vfp_access_check(s)) {
 559         return true;
 560     }
 561
 562     /*
 563      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 564      * VLD2/3/4 to all lanes: T bit indicates register stride.
 565      */
 566     stride = a->t ? 2 : 1;
 567     vec_size = nregs == 1 ? stride * 8 : 8;
 568
 569     tmp = tcg_temp_new_i32();
 570     addr = tcg_temp_new_i32();
 571     load_reg_var(s, addr, a->rn);
 572     for (reg = 0; reg < nregs; reg++) {
 573         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
 574                         s->be_data | size);
 575         if ((vd & 1) && vec_size == 16) {
 576             /*
 577              * We cannot write 16 bytes at once because the
 578              * destination is unaligned.
 579              */
 580             tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
 581                                  8, 8, tmp);
 582             tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
 583                              neon_reg_offset(vd, 0), 8, 8);
 584         } else {
 585             tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
 586                                  vec_size, vec_size, tmp);
 587         }
 588         tcg_gen_addi_i32(addr, addr, 1 << size);
 589         vd += stride;
 590     }
 591     tcg_temp_free_i32(tmp);
 592     tcg_temp_free_i32(addr);
 593
 594     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 595
 596     return true;
 597 }
 598
 599 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 600 {
 601     /* Neon load/store single structure to one lane */
 602     int reg;
 603     int nregs = a->n + 1;
 604     int vd = a->vd;
 605     TCGv_i32 addr, tmp;
 606
 607     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 608         return false;
 609     }
 610
 611     /* UNDEF accesses to D16-D31 if they don't exist */
 612     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 613         return false;
 614     }
 615
 616     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 617     switch (nregs) {
 618     case 1:
 619         if (((a->align & (1 << a->size)) != 0) ||
 620             (a->size == 2 && ((a->align & 3) == 1 || (a->align & 3) == 2))) {
 621             return false;
 622         }
 623         break;
 624     case 3:
 625         if ((a->align & 1) != 0) {
 626             return false;
 627         }
 628         /* fall through */
 629     case 2:
 630         if (a->size == 2 && (a->align & 2) != 0) {
 631             return false;
 632         }
 633         break;
 634     case 4:
 635         if ((a->size == 2) && ((a->align & 3) == 3)) {
 636             return false;
 637         }
 638         break;
 639     default:
 640         abort();
 641     }
 642     if ((vd + a->stride * (nregs - 1)) > 31) {
 643         /*
 644          * Attempts to write off the end of the register file are
 645          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 646          * access off the end of the array that holds the register data.
 647          */
 648         return false;
 649     }
 650
 651     if (!vfp_access_check(s)) {
 652         return true;
 653     }
 654
 655     tmp = tcg_temp_new_i32();
 656     addr = tcg_temp_new_i32();
 657     load_reg_var(s, addr, a->rn);
 658     /*
 659      * TODO: if we implemented alignment exceptions, we should check
 660      * addr against the alignment encoded in a->align here.
 661      */
 662     for (reg = 0; reg < nregs; reg++) {
 663         if (a->l) {
 664             gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
 665                             s->be_data | a->size);
 666             neon_store_element(vd, a->reg_idx, a->size, tmp);
 667         } else { /* Store */
 668             neon_load_element(tmp, vd, a->reg_idx, a->size);
 669             gen_aa32_st_i32(s, tmp, addr, get_mem_index(s),
 670                             s->be_data | a->size);
 671         }
 672         vd += a->stride;
 673         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 674     }
 675     tcg_temp_free_i32(addr);
 676     tcg_temp_free_i32(tmp);
 677
 678     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 679
 680     return true;
 681 }
 682
 683 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 684 {
 685     int vec_size = a->q ? 16 : 8;
 686     int rd_ofs = neon_reg_offset(a->vd, 0);
 687     int rn_ofs = neon_reg_offset(a->vn, 0);
 688     int rm_ofs = neon_reg_offset(a->vm, 0);
 689
 690     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 691         return false;
 692     }
 693
 694     /* UNDEF accesses to D16-D31 if they don't exist. */
 695     if (!dc_isar_feature(aa32_simd_r32, s) &&
 696         ((a->vd | a->vn | a->vm) & 0x10)) {
 697         return false;
 698     }
 699
 700     if ((a->vn | a->vm | a->vd) & a->q) {
 701         return false;
 702     }
 703
 704     if (!vfp_access_check(s)) {
 705         return true;
 706     }
 707
 708     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 709     return true;
 710 }
 711
 712 #define DO_3SAME(INSN, FUNC)                                            \
 713     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 714     {                                                                   \
 715         return do_3same(s, a, FUNC);                                    \
 716     }
 717
 718 DO_3SAME(VADD, tcg_gen_gvec_add)
 719 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 720 DO_3SAME(VAND, tcg_gen_gvec_and)
 721 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 722 DO_3SAME(VORR, tcg_gen_gvec_or)
 723 DO_3SAME(VORN, tcg_gen_gvec_orc)
 724 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 725 DO_3SAME(VSHL_S, gen_gvec_sshl)
 726 DO_3SAME(VSHL_U, gen_gvec_ushl)
 727 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 728 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 729 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 730 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 731
 732 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 733 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 734     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 735                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 736                                 uint32_t oprsz, uint32_t maxsz)         \
 737     {                                                                   \
 738         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 739     }                                                                   \
 740     DO_3SAME(INSN, gen_##INSN##_3s)
 741
 742 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 743 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 744 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 745
 746 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 747     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 748     {                                                                   \
 749         if (a->size == 3) {                                             \
 750             return false;                                               \
 751         }                                                               \
 752         return do_3same(s, a, FUNC);                                    \
 753     }
 754
 755 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 756 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 757 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 758 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 759 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 760 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 761 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 762 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 763 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 764 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 765 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 766 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 767
 768 #define DO_3SAME_CMP(INSN, COND)                                        \
 769     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 770                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 771                                 uint32_t oprsz, uint32_t maxsz)         \
 772     {                                                                   \
 773         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 774     }                                                                   \
 775     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 776
 777 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 778 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 779 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 780 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 781 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 782
 783 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 784     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 785                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 786     {                                                                      \
 787         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 788     }
 789
 790 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 791
 792 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 793 {
 794     if (a->size != 0) {
 795         return false;
 796     }
 797     return do_3same(s, a, gen_VMUL_p_3s);
 798 }
 799
 800 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 801     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 802     {                                                                   \
 803         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 804             return false;                                               \
 805         }                                                               \
 806         if (a->size != 1 && a->size != 2) {                             \
 807             return false;                                               \
 808         }                                                               \
 809         return do_3same(s, a, FUNC);                                    \
 810     }
 811
 812 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 813 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 814
 815 #define DO_SHA1(NAME, FUNC)                                             \
 816     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 817     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 818     {                                                                   \
 819         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 820             return false;                                               \
 821         }                                                               \
 822         return do_3same(s, a, gen_##NAME##_3s);                         \
 823     }
 824
 825 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 826 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 827 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 828 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 829
 830 #define DO_SHA2(NAME, FUNC)                                             \
 831     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 832     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 833     {                                                                   \
 834         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 835             return false;                                               \
 836         }                                                               \
 837         return do_3same(s, a, gen_##NAME##_3s);                         \
 838     }
 839
 840 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 841 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 842 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 843
 844 #define DO_3SAME_64(INSN, FUNC)                                         \
 845     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 846                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 847                                 uint32_t oprsz, uint32_t maxsz)         \
 848     {                                                                   \
 849         static const GVecGen3 op = { .fni8 = FUNC };                    \
 850         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 851     }                                                                   \
 852     DO_3SAME(INSN, gen_##INSN##_3s)
 853
 854 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 855     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 856     {                                                                   \
 857         FUNC(d, cpu_env, n, m);                                         \
 858     }                                                                   \
 859     DO_3SAME_64(INSN, gen_##INSN##_elt)
 860
 861 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
 862 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
 863 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
 864 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
 865 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
 866 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 867
 868 #define DO_3SAME_32(INSN, FUNC)                                         \
 869     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 870                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 871                                 uint32_t oprsz, uint32_t maxsz)         \
 872     {                                                                   \
 873         static const GVecGen3 ops[4] = {                                \
 874             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 875             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 876             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 877             { 0 },                                                      \
 878         };                                                              \
 879         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 880     }                                                                   \
 881     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 882     {                                                                   \
 883         if (a->size > 2) {                                              \
 884             return false;                                               \
 885         }                                                               \
 886         return do_3same(s, a, gen_##INSN##_3s);                         \
 887     }
 888
 889 /*
 890  * Some helper functions need to be passed the cpu_env. In order
 891  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 892  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 893  * and which call a NeonGenTwoOpEnvFn().
 894  */
 895 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 896     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 897     {                                                                   \
 898         FUNC(d, cpu_env, n, m);                                         \
 899     }
 900
 901 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 902     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 903     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 904     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 905     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 906                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 907                                 uint32_t oprsz, uint32_t maxsz)         \
 908     {                                                                   \
 909         static const GVecGen3 ops[4] = {                                \
 910             { .fni4 = gen_##INSN##_tramp8 },                            \
 911             { .fni4 = gen_##INSN##_tramp16 },                           \
 912             { .fni4 = gen_##INSN##_tramp32 },                           \
 913             { 0 },                                                      \
 914         };                                                              \
 915         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 916     }                                                                   \
 917     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 918     {                                                                   \
 919         if (a->size > 2) {                                              \
 920             return false;                                               \
 921         }                                                               \
 922         return do_3same(s, a, gen_##INSN##_3s);                         \
 923     }
 924
/* Insns whose 8/16/32-bit helpers can be called directly. */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* Insns whose 8/16/32-bit helpers must be passed cpu_env. */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
 938
 939 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
 940 {
 941     /* Operations handled pairwise 32 bits at a time */
 942     TCGv_i32 tmp, tmp2, tmp3;
 943
 944     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 945         return false;
 946     }
 947
 948     /* UNDEF accesses to D16-D31 if they don't exist. */
 949     if (!dc_isar_feature(aa32_simd_r32, s) &&
 950         ((a->vd | a->vn | a->vm) & 0x10)) {
 951         return false;
 952     }
 953
 954     if (a->size == 3) {
 955         return false;
 956     }
 957
 958     if (!vfp_access_check(s)) {
 959         return true;
 960     }
 961
 962     assert(a->q == 0); /* enforced by decode patterns */
 963
 964     /*
 965      * Note that we have to be careful not to clobber the source operands
 966      * in the "vm == vd" case by storing the result of the first pass too
 967      * early. Since Q is 0 there are always just two passes, so instead
 968      * of a complicated loop over each pass we just unroll.
 969      */
 970     tmp = neon_load_reg(a->vn, 0);
 971     tmp2 = neon_load_reg(a->vn, 1);
 972     fn(tmp, tmp, tmp2);
 973     tcg_temp_free_i32(tmp2);
 974
 975     tmp3 = neon_load_reg(a->vm, 0);
 976     tmp2 = neon_load_reg(a->vm, 1);
 977     fn(tmp3, tmp3, tmp2);
 978     tcg_temp_free_i32(tmp2);
 979
 980     neon_store_reg(a->vd, 0, tmp);
 981     neon_store_reg(a->vd, 1, tmp3);
 982     return true;
 983 }
 984
 985 #define DO_3SAME_PAIR(INSN, func)                                       \
 986     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 987     {                                                                   \
 988         static NeonGenTwoOpFn * const fns[] = {                         \
 989             gen_helper_neon_##func##8,                                  \
 990             gen_helper_neon_##func##16,                                 \
 991             gen_helper_neon_##func##32,                                 \
 992         };                                                              \
 993         if (a->size > 2) {                                              \
 994             return false;                                               \
 995         }                                                               \
 996         return do_3same_pair(s, a, fns[a->size]);                       \
 997     }
 998
/*
 * 32-bit pairwise ops end up the same as the elementwise versions.
 * DO_3SAME_PAIR builds the helper name by token pasting, so provide
 * the "32" names it will look up as aliases for the plain TCG ops.
 */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

/* Instantiate the integer pairwise max/min/add translators. */
DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1011
1012 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1013     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1014     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1015     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1016                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1017                                 uint32_t oprsz, uint32_t maxsz)         \
1018     {                                                                   \
1019         static const GVecGen3 ops[2] = {                                \
1020             { .fni4 = gen_##INSN##_tramp16 },                           \
1021             { .fni4 = gen_##INSN##_tramp32 },                           \
1022         };                                                              \
1023         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1024     }                                                                   \
1025     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1026     {                                                                   \
1027         if (a->size != 1 && a->size != 2) {                             \
1028             return false;                                               \
1029         }                                                               \
1030         return do_3same(s, a, gen_##INSN##_3s);                         \
1031     }
1032
1033 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1034 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1035
1036 static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
1037                         bool reads_vd)
1038 {
1039     /*
1040      * FP operations handled elementwise 32 bits at a time.
1041      * If reads_vd is true then the old value of Vd will be
1042      * loaded before calling the callback function. This is
1043      * used for multiply-accumulate type operations.
1044      */
1045     TCGv_i32 tmp, tmp2;
1046     int pass;
1047
1048     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1049         return false;
1050     }
1051
1052     /* UNDEF accesses to D16-D31 if they don't exist. */
1053     if (!dc_isar_feature(aa32_simd_r32, s) &&
1054         ((a->vd | a->vn | a->vm) & 0x10)) {
1055         return false;
1056     }
1057
1058     if ((a->vn | a->vm | a->vd) & a->q) {
1059         return false;
1060     }
1061
1062     if (!vfp_access_check(s)) {
1063         return true;
1064     }
1065
1066     TCGv_ptr fpstatus = get_fpstatus_ptr(1);
1067     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1068         tmp = neon_load_reg(a->vn, pass);
1069         tmp2 = neon_load_reg(a->vm, pass);
1070         if (reads_vd) {
1071             TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
1072             fn(tmp_rd, tmp, tmp2, fpstatus);
1073             neon_store_reg(a->vd, pass, tmp_rd);
1074             tcg_temp_free_i32(tmp);
1075         } else {
1076             fn(tmp, tmp, tmp2, fpstatus);
1077             neon_store_reg(a->vd, pass, tmp);
1078         }
1079         tcg_temp_free_i32(tmp2);
1080     }
1081     tcg_temp_free_ptr(fpstatus);
1082     return true;
1083 }
1084
1085 /*
1086  * For all the functions using this macro, size == 1 means fp16,
1087  * which is an architecture extension we don't implement yet.
1088  */
1089 #define DO_3S_FP_GVEC(INSN,FUNC)                                        \
1090     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1091                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1092                                 uint32_t oprsz, uint32_t maxsz)         \
1093     {                                                                   \
1094         TCGv_ptr fpst = get_fpstatus_ptr(1);                            \
1095         tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
1096                            oprsz, maxsz, 0, FUNC);                      \
1097         tcg_temp_free_ptr(fpst);                                        \
1098     }                                                                   \
1099     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1100     {                                                                   \
1101         if (a->size != 0) {                                             \
1102             /* TODO fp16 support */                                     \
1103             return false;                                               \
1104         }                                                               \
1105         return do_3same(s, a, gen_##INSN##_3s);                         \
1106     }
1107
1108
1109 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
1110 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
1111 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
1112 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)
1113
1114 /*
1115  * For all the functions using this macro, size == 1 means fp16,
1116  * which is an architecture extension we don't implement yet.
1117  */
1118 #define DO_3S_FP(INSN,FUNC,READS_VD)                                \
1119     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1120     {                                                               \
1121         if (a->size != 0) {                                         \
1122             /* TODO fp16 support */                                 \
1123             return false;                                           \
1124         }                                                           \
1125         return do_3same_fp(s, a, FUNC, READS_VD);                   \
1126     }
1127
1128 DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
1129 DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
1130 DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
1131 DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
1132 DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
1133 DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
1134 DO_3S_FP(VMIN, gen_helper_vfp_mins, false)
1135
1136 static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1137                             TCGv_ptr fpstatus)
1138 {
1139     gen_helper_vfp_muls(vn, vn, vm, fpstatus);
1140     gen_helper_vfp_adds(vd, vd, vn, fpstatus);
1141 }
1142
1143 static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1144                             TCGv_ptr fpstatus)
1145 {
1146     gen_helper_vfp_muls(vn, vn, vm, fpstatus);
1147     gen_helper_vfp_subs(vd, vd, vn, fpstatus);
1148 }
1149
/* Accumulating ops: reads_vd is true so the old Vd value is loaded. */
DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
1152
1153 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1154 {
1155     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1156         return false;
1157     }
1158
1159     if (a->size != 0) {
1160         /* TODO fp16 support */
1161         return false;
1162     }
1163
1164     return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
1165 }
1166
1167 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1168 {
1169     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1170         return false;
1171     }
1172
1173     if (a->size != 0) {
1174         /* TODO fp16 support */
1175         return false;
1176     }
1177
1178     return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
1179 }
1180
1181 WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)
1182
1183 static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
1184                              uint32_t rn_ofs, uint32_t rm_ofs,
1185                              uint32_t oprsz, uint32_t maxsz)
1186 {
1187     static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
1188     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
1189 }
1190
1191 static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
1192 {
1193     if (a->size != 0) {
1194         /* TODO fp16 support */
1195         return false;
1196     }
1197
1198     return do_3same(s, a, gen_VRECPS_fp_3s);
1199 }
1200
1201 WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)
1202
1203 static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
1204                               uint32_t rn_ofs, uint32_t rm_ofs,
1205                               uint32_t oprsz, uint32_t maxsz)
1206 {
1207     static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
1208     tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
1209 }
1210
1211 static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
1212 {
1213     if (a->size != 0) {
1214         /* TODO fp16 support */
1215         return false;
1216     }
1217
1218     return do_3same(s, a, gen_VRSQRTS_fp_3s);
1219 }
1220
1221 static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1222                             TCGv_ptr fpstatus)
1223 {
1224     gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
1225 }
1226
1227 static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
1228 {
1229     if (!dc_isar_feature(aa32_simdfmac, s)) {
1230         return false;
1231     }
1232
1233     if (a->size != 0) {
1234         /* TODO fp16 support */
1235         return false;
1236     }
1237
1238     return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
1239 }
1240
1241 static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
1242                             TCGv_ptr fpstatus)
1243 {
1244     gen_helper_vfp_negs(vn, vn);
1245     gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
1246 }
1247
1248 static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
1249 {
1250     if (!dc_isar_feature(aa32_simdfmac, s)) {
1251         return false;
1252     }
1253
1254     if (a->size != 0) {
1255         /* TODO fp16 support */
1256         return false;
1257     }
1258
1259     return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
1260 }
1261
1262 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
1263 {
1264     /* FP operations handled pairwise 32 bits at a time */
1265     TCGv_i32 tmp, tmp2, tmp3;
1266     TCGv_ptr fpstatus;
1267
1268     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1269         return false;
1270     }
1271
1272     /* UNDEF accesses to D16-D31 if they don't exist. */
1273     if (!dc_isar_feature(aa32_simd_r32, s) &&
1274         ((a->vd | a->vn | a->vm) & 0x10)) {
1275         return false;
1276     }
1277
1278     if (!vfp_access_check(s)) {
1279         return true;
1280     }
1281
1282     assert(a->q == 0); /* enforced by decode patterns */
1283
1284     /*
1285      * Note that we have to be careful not to clobber the source operands
1286      * in the "vm == vd" case by storing the result of the first pass too
1287      * early. Since Q is 0 there are always just two passes, so instead
1288      * of a complicated loop over each pass we just unroll.
1289      */
1290     fpstatus = get_fpstatus_ptr(1);
1291     tmp = neon_load_reg(a->vn, 0);
1292     tmp2 = neon_load_reg(a->vn, 1);
1293     fn(tmp, tmp, tmp2, fpstatus);
1294     tcg_temp_free_i32(tmp2);
1295
1296     tmp3 = neon_load_reg(a->vm, 0);
1297     tmp2 = neon_load_reg(a->vm, 1);
1298     fn(tmp3, tmp3, tmp2, fpstatus);
1299     tcg_temp_free_i32(tmp2);
1300     tcg_temp_free_ptr(fpstatus);
1301
1302     neon_store_reg(a->vd, 0, tmp);
1303     neon_store_reg(a->vd, 1, tmp3);
1304     return true;
1305 }
1306
1307 /*
1308  * For all the functions using this macro, size == 1 means fp16,
1309  * which is an architecture extension we don't implement yet.
1310  */
1311 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1312     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1313     {                                                               \
1314         if (a->size != 0) {                                         \
1315             /* TODO fp16 support */                                 \
1316             return false;                                           \
1317         }                                                           \
1318         return do_3same_fp_pair(s, a, FUNC);                        \
1319     }
1320
1321 DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
1322 DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
1323 DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)
1324
1325 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1326 {
1327     /* Handle a 2-reg-shift insn which can be vectorized. */
1328     int vec_size = a->q ? 16 : 8;
1329     int rd_ofs = neon_reg_offset(a->vd, 0);
1330     int rm_ofs = neon_reg_offset(a->vm, 0);
1331
1332     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1333         return false;
1334     }
1335
1336     /* UNDEF accesses to D16-D31 if they don't exist. */
1337     if (!dc_isar_feature(aa32_simd_r32, s) &&
1338         ((a->vd | a->vm) & 0x10)) {
1339         return false;
1340     }
1341
1342     if ((a->vm | a->vd) & a->q) {
1343         return false;
1344     }
1345
1346     if (!vfp_access_check(s)) {
1347         return true;
1348     }
1349
1350     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1351     return true;
1352 }
1353
1354 #define DO_2SH(INSN, FUNC)                                              \
1355     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1356     {                                                                   \
1357         return do_vector_2sh(s, a, FUNC);                               \
1358     }                                                                   \
1359
1360 DO_2SH(VSHL, tcg_gen_gvec_shli)
1361 DO_2SH(VSLI, gen_gvec_sli)
1362 DO_2SH(VSRI, gen_gvec_sri)
1363 DO_2SH(VSRA_S, gen_gvec_ssra)
1364 DO_2SH(VSRA_U, gen_gvec_usra)
1365 DO_2SH(VRSHR_S, gen_gvec_srshr)
1366 DO_2SH(VRSHR_U, gen_gvec_urshr)
1367 DO_2SH(VRSRA_S, gen_gvec_srsra)
1368 DO_2SH(VRSRA_U, gen_gvec_ursra)
1369
1370 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1371 {
1372     /* Signed shift out of range results in all-sign-bits */
1373     a->shift = MIN(a->shift, (8 << a->size) - 1);
1374     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1375 }
1376
/*
 * Expander with the signature do_vector_2sh() expects which ignores
 * the source offset and shift count and just fills the destination
 * with the immediate 0.
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    const uint64_t zero = 0;

    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, zero);
}
1382
1383 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1384 {
1385     /* Shift out of range is architecturally valid and results in zero. */
1386     if (a->shift >= (8 << a->size)) {
1387         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1388     } else {
1389         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1390     }
1391 }
1392
1393 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1394                              NeonGenTwo64OpEnvFn *fn)
1395 {
1396     /*
1397      * 2-reg-and-shift operations, size == 3 case, where the
1398      * function needs to be passed cpu_env.
1399      */
1400     TCGv_i64 constimm;
1401     int pass;
1402
1403     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1404         return false;
1405     }
1406
1407     /* UNDEF accesses to D16-D31 if they don't exist. */
1408     if (!dc_isar_feature(aa32_simd_r32, s) &&
1409         ((a->vd | a->vm) & 0x10)) {
1410         return false;
1411     }
1412
1413     if ((a->vm | a->vd) & a->q) {
1414         return false;
1415     }
1416
1417     if (!vfp_access_check(s)) {
1418         return true;
1419     }
1420
1421     /*
1422      * To avoid excessive duplication of ops we implement shift
1423      * by immediate using the variable shift operations.
1424      */
1425     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1426
1427     for (pass = 0; pass < a->q + 1; pass++) {
1428         TCGv_i64 tmp = tcg_temp_new_i64();
1429
1430         neon_load_reg64(tmp, a->vm + pass);
1431         fn(tmp, cpu_env, tmp, constimm);
1432         neon_store_reg64(tmp, a->vd + pass);
1433         tcg_temp_free_i64(tmp);
1434     }
1435     tcg_temp_free_i64(constimm);
1436     return true;
1437 }
1438
1439 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1440                              NeonGenTwoOpEnvFn *fn)
1441 {
1442     /*
1443      * 2-reg-and-shift operations, size < 3 case, where the
1444      * helper needs to be passed cpu_env.
1445      */
1446     TCGv_i32 constimm;
1447     int pass;
1448
1449     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1450         return false;
1451     }
1452
1453     /* UNDEF accesses to D16-D31 if they don't exist. */
1454     if (!dc_isar_feature(aa32_simd_r32, s) &&
1455         ((a->vd | a->vm) & 0x10)) {
1456         return false;
1457     }
1458
1459     if ((a->vm | a->vd) & a->q) {
1460         return false;
1461     }
1462
1463     if (!vfp_access_check(s)) {
1464         return true;
1465     }
1466
1467     /*
1468      * To avoid excessive duplication of ops we implement shift
1469      * by immediate using the variable shift operations.
1470      */
1471     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1472
1473     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1474         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
1475         fn(tmp, cpu_env, tmp, constimm);
1476         neon_store_reg(a->vd, pass, tmp);
1477     }
1478     tcg_temp_free_i32(constimm);
1479     return true;
1480 }
1481
1482 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1483     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1484     {                                                                   \
1485         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1486     }                                                                   \
1487     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1488     {                                                                   \
1489         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1490             gen_helper_neon_##FUNC##8,                                  \
1491             gen_helper_neon_##FUNC##16,                                 \
1492             gen_helper_neon_##FUNC##32,                                 \
1493         };                                                              \
1494         assert(a->size < ARRAY_SIZE(fns));                              \
1495         return do_2shift_env_32(s, a, fns[a->size]);                    \
1496     }
1497
1498 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1499 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1500 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1501
1502 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1503                                 NeonGenTwo64OpFn *shiftfn,
1504                                 NeonGenNarrowEnvFn *narrowfn)
1505 {
1506     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1507     TCGv_i64 constimm, rm1, rm2;
1508     TCGv_i32 rd;
1509
1510     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1511         return false;
1512     }
1513
1514     /* UNDEF accesses to D16-D31 if they don't exist. */
1515     if (!dc_isar_feature(aa32_simd_r32, s) &&
1516         ((a->vd | a->vm) & 0x10)) {
1517         return false;
1518     }
1519
1520     if (a->vm & 1) {
1521         return false;
1522     }
1523
1524     if (!vfp_access_check(s)) {
1525         return true;
1526     }
1527
1528     /*
1529      * This is always a right shift, and the shiftfn is always a
1530      * left-shift helper, which thus needs the negated shift count.
1531      */
1532     constimm = tcg_const_i64(-a->shift);
1533     rm1 = tcg_temp_new_i64();
1534     rm2 = tcg_temp_new_i64();
1535
1536     /* Load both inputs first to avoid potential overwrite if rm == rd */
1537     neon_load_reg64(rm1, a->vm);
1538     neon_load_reg64(rm2, a->vm + 1);
1539
1540     shiftfn(rm1, rm1, constimm);
1541     rd = tcg_temp_new_i32();
1542     narrowfn(rd, cpu_env, rm1);
1543     neon_store_reg(a->vd, 0, rd);
1544
1545     shiftfn(rm2, rm2, constimm);
1546     rd = tcg_temp_new_i32();
1547     narrowfn(rd, cpu_env, rm2);
1548     neon_store_reg(a->vd, 1, rd);
1549
1550     tcg_temp_free_i64(rm1);
1551     tcg_temp_free_i64(rm2);
1552     tcg_temp_free_i64(constimm);
1553
1554     return true;
1555 }
1556
1557 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1558                                 NeonGenTwoOpFn *shiftfn,
1559                                 NeonGenNarrowEnvFn *narrowfn)
1560 {
1561     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1562     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1563     TCGv_i64 rtmp;
1564     uint32_t imm;
1565
1566     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1567         return false;
1568     }
1569
1570     /* UNDEF accesses to D16-D31 if they don't exist. */
1571     if (!dc_isar_feature(aa32_simd_r32, s) &&
1572         ((a->vd | a->vm) & 0x10)) {
1573         return false;
1574     }
1575
1576     if (a->vm & 1) {
1577         return false;
1578     }
1579
1580     if (!vfp_access_check(s)) {
1581         return true;
1582     }
1583
1584     /*
1585      * This is always a right shift, and the shiftfn is always a
1586      * left-shift helper, which thus needs the negated shift count
1587      * duplicated into each lane of the immediate value.
1588      */
1589     if (a->size == 1) {
1590         imm = (uint16_t)(-a->shift);
1591         imm |= imm << 16;
1592     } else {
1593         /* size == 2 */
1594         imm = -a->shift;
1595     }
1596     constimm = tcg_const_i32(imm);
1597
1598     /* Load all inputs first to avoid potential overwrite */
1599     rm1 = neon_load_reg(a->vm, 0);
1600     rm2 = neon_load_reg(a->vm, 1);
1601     rm3 = neon_load_reg(a->vm + 1, 0);
1602     rm4 = neon_load_reg(a->vm + 1, 1);
1603     rtmp = tcg_temp_new_i64();
1604
1605     shiftfn(rm1, rm1, constimm);
1606     shiftfn(rm2, rm2, constimm);
1607
1608     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1609     tcg_temp_free_i32(rm2);
1610
1611     narrowfn(rm1, cpu_env, rtmp);
1612     neon_store_reg(a->vd, 0, rm1);
1613
1614     shiftfn(rm3, rm3, constimm);
1615     shiftfn(rm4, rm4, constimm);
1616     tcg_temp_free_i32(constimm);
1617
1618     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1619     tcg_temp_free_i32(rm4);
1620
1621     narrowfn(rm3, cpu_env, rtmp);
1622     tcg_temp_free_i64(rtmp);
1623     neon_store_reg(a->vd, 1, rm3);
1624     return true;
1625 }
1626
/*
 * Define trans_<INSN>_2sh() for a narrowing shift handled by
 * do_2shift_narrow_64(), with FUNC as the shift function and
 * NARROWFUNC as the narrowing function.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s,                     \
                                   arg_2reg_shift *a)                   \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
/*
 * Define trans_<INSN>_2sh() for a narrowing shift handled by
 * do_2shift_narrow_32(), with FUNC as the shift function and
 * NARROWFUNC as the narrowing function.
 */
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s,                     \
                                   arg_2reg_shift *a)                   \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1637
1638 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1639 {
1640     tcg_gen_extrl_i64_i32(dest, src);
1641 }
1642
1643 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1644 {
1645     gen_helper_neon_narrow_u16(dest, src);
1646 }
1647
1648 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1649 {
1650     gen_helper_neon_narrow_u8(dest, src);
1651 }
1652
/*
 * Instantiate the narrowing-shift translators. Each line pairs a
 * shift function with a narrowing function; the _64 names go through
 * do_2shift_narrow_64() and the _32/_16 names through
 * do_2shift_narrow_32().
 */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1683
1684 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1685                          NeonGenWidenFn *widenfn, bool u)
1686 {
1687     TCGv_i64 tmp;
1688     TCGv_i32 rm0, rm1;
1689     uint64_t widen_mask = 0;
1690
1691     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1692         return false;
1693     }
1694
1695     /* UNDEF accesses to D16-D31 if they don't exist. */
1696     if (!dc_isar_feature(aa32_simd_r32, s) &&
1697         ((a->vd | a->vm) & 0x10)) {
1698         return false;
1699     }
1700
1701     if (a->vd & 1) {
1702         return false;
1703     }
1704
1705     if (!vfp_access_check(s)) {
1706         return true;
1707     }
1708
1709     /*
1710      * This is a widen-and-shift operation. The shift is always less
1711      * than the width of the source type, so after widening the input
1712      * vector we can simply shift the whole 64-bit widened register,
1713      * and then clear the potential overflow bits resulting from left
1714      * bits of the narrow input appearing as right bits of the left
1715      * neighbour narrow input. Calculate a mask of bits to clear.
1716      */
1717     if ((a->shift != 0) && (a->size < 2 || u)) {
1718         int esize = 8 << a->size;
1719         widen_mask = MAKE_64BIT_MASK(0, esize);
1720         widen_mask >>= esize - a->shift;
1721         widen_mask = dup_const(a->size + 1, widen_mask);
1722     }
1723
1724     rm0 = neon_load_reg(a->vm, 0);
1725     rm1 = neon_load_reg(a->vm, 1);
1726     tmp = tcg_temp_new_i64();
1727
1728     widenfn(tmp, rm0);
1729     tcg_temp_free_i32(rm0);
1730     if (a->shift != 0) {
1731         tcg_gen_shli_i64(tmp, tmp, a->shift);
1732         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1733     }
1734     neon_store_reg64(tmp, a->vd);
1735
1736     widenfn(tmp, rm1);
1737     tcg_temp_free_i32(rm1);
1738     if (a->shift != 0) {
1739         tcg_gen_shli_i64(tmp, tmp, a->shift);
1740         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1741     }
1742     neon_store_reg64(tmp, a->vd + 1);
1743     tcg_temp_free_i64(tmp);
1744     return true;
1745 }
1746
1747 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1748 {
1749     static NeonGenWidenFn * const widenfn[] = {
1750         gen_helper_neon_widen_s8,
1751         gen_helper_neon_widen_s16,
1752         tcg_gen_ext_i32_i64,
1753     };
1754     return do_vshll_2sh(s, a, widenfn[a->size], false);
1755 }
1756
1757 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1758 {
1759     static NeonGenWidenFn * const widenfn[] = {
1760         gen_helper_neon_widen_u8,
1761         gen_helper_neon_widen_u16,
1762         tcg_gen_extu_i32_i64,
1763     };
1764     return do_vshll_2sh(s, a, widenfn[a->size], true);
1765 }
1766
1767 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1768                       NeonGenTwoSingleOpFn *fn)
1769 {
1770     /* FP operations in 2-reg-and-shift group */
1771     TCGv_i32 tmp, shiftv;
1772     TCGv_ptr fpstatus;
1773     int pass;
1774
1775     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1776         return false;
1777     }
1778
1779     /* UNDEF accesses to D16-D31 if they don't exist. */
1780     if (!dc_isar_feature(aa32_simd_r32, s) &&
1781         ((a->vd | a->vm) & 0x10)) {
1782         return false;
1783     }
1784
1785     if ((a->vm | a->vd) & a->q) {
1786         return false;
1787     }
1788
1789     if (!vfp_access_check(s)) {
1790         return true;
1791     }
1792
1793     fpstatus = get_fpstatus_ptr(1);
1794     shiftv = tcg_const_i32(a->shift);
1795     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1796         tmp = neon_load_reg(a->vm, pass);
1797         fn(tmp, tmp, shiftv, fpstatus);
1798         neon_store_reg(a->vd, pass, tmp);
1799     }
1800     tcg_temp_free_ptr(fpstatus);
1801     tcg_temp_free_i32(shiftv);
1802     return true;
1803 }
1804
1805 #define DO_FP_2SH(INSN, FUNC)                                           \
1806     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1807     {                                                                   \
1808         return do_fp_2sh(s, a, FUNC);                                   \
1809     }
1810
1811 DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
1812 DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
1813 DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
1814 DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
1815
1816 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1817 {
1818     /*
1819      * Expand the encoded constant.
1820      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1821      * We choose to not special-case this and will behave as if a
1822      * valid constant encoding of 0 had been given.
1823      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1824      */
1825     switch (cmode) {
1826     case 0: case 1:
1827         /* no-op */
1828         break;
1829     case 2: case 3:
1830         imm <<= 8;
1831         break;
1832     case 4: case 5:
1833         imm <<= 16;
1834         break;
1835     case 6: case 7:
1836         imm <<= 24;
1837         break;
1838     case 8: case 9:
1839         imm |= imm << 16;
1840         break;
1841     case 10: case 11:
1842         imm = (imm << 8) | (imm << 24);
1843         break;
1844     case 12:
1845         imm = (imm << 8) | 0xff;
1846         break;
1847     case 13:
1848         imm = (imm << 16) | 0xffff;
1849         break;
1850     case 14:
1851         if (op) {
1852             /*
1853              * This is the only case where the top and bottom 32 bits
1854              * of the encoded constant differ.
1855              */
1856             uint64_t imm64 = 0;
1857             int n;
1858
1859             for (n = 0; n < 8; n++) {
1860                 if (imm & (1 << n)) {
1861                     imm64 |= (0xffULL << (n * 8));
1862                 }
1863             }
1864             return imm64;
1865         }
1866         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1867         break;
1868     case 15:
1869         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1870             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1871         break;
1872     }
1873     if (op) {
1874         imm = ~imm;
1875     }
1876     return dup_const(MO_32, imm);
1877 }
1878
1879 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1880                         GVecGen2iFn *fn)
1881 {
1882     uint64_t imm;
1883     int reg_ofs, vec_size;
1884
1885     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1886         return false;
1887     }
1888
1889     /* UNDEF accesses to D16-D31 if they don't exist. */
1890     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1891         return false;
1892     }
1893
1894     if (a->vd & a->q) {
1895         return false;
1896     }
1897
1898     if (!vfp_access_check(s)) {
1899         return true;
1900     }
1901
1902     reg_ofs = neon_reg_offset(a->vd, 0);
1903     vec_size = a->q ? 16 : 8;
1904     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1905
1906     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1907     return true;
1908 }
1909
/*
 * GVecGen2iFn-shaped wrapper for the VMOV (immediate) case: fills the
 * destination with the 64-bit constant @c.  The @vece and @aofs
 * arguments are ignored; the element size is always MO_64.
 */
static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
}
1915
1916 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1917 {
1918     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1919     GVecGen2iFn *fn;
1920
1921     if ((a->cmode & 1) && a->cmode < 12) {
1922         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1923         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1924     } else {
1925         /* There is one unallocated cmode/op combination in this space */
1926         if (a->cmode == 15 && a->op == 1) {
1927             return false;
1928         }
1929         fn = gen_VMOV_1r;
1930     }
1931     return do_1reg_imm(s, a, fn);
1932 }
1933
/*
 * Common code for the prewidening 3-reg-different-lengths insns.
 *
 * @widenfn:   expands a 32-bit half register into a 64-bit vector of
 *             double-width elements; NULL for an unsupported size
 * @opfn:      the 64-bit operation applied to the widened operands;
 *             NULL for an unsupported size
 * @src1_wide: if true, Vn is already double-width and is loaded as two
 *             full 64-bit registers instead of being widened
 *
 * Returns false if the insn should not be handled here, true otherwise
 * (including when vfp_access_check() fails).
 */
static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
                           NeonGenWidenFn *widenfn,
                           NeonGenTwo64OpFn *opfn,
                           bool src1_wide)
{
    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
    TCGv_i64 rn0_64, rn1_64, rm_64;
    TCGv_i32 rm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!widenfn || !opfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* Vd is a Q register; Vn is too when the first source is wide. */
    if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rn0_64 = tcg_temp_new_i64();
    rn1_64 = tcg_temp_new_i64();
    rm_64 = tcg_temp_new_i64();

    /* First pass: low half of the result. */
    if (src1_wide) {
        neon_load_reg64(rn0_64, a->vn);
    } else {
        TCGv_i32 tmp = neon_load_reg(a->vn, 0);
        widenfn(rn0_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    rm = neon_load_reg(a->vm, 0);

    widenfn(rm_64, rm);
    tcg_temp_free_i32(rm);
    opfn(rn0_64, rn0_64, rm_64);

    /*
     * Load second pass inputs before storing the first pass result, to
     * avoid incorrect results if a narrow input overlaps with the result.
     */
    if (src1_wide) {
        neon_load_reg64(rn1_64, a->vn + 1);
    } else {
        TCGv_i32 tmp = neon_load_reg(a->vn, 1);
        widenfn(rn1_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    rm = neon_load_reg(a->vm, 1);

    /* All inputs are now in temporaries, so Vd may be written. */
    neon_store_reg64(rn0_64, a->vd);

    /* Second pass: high half of the result. */
    widenfn(rm_64, rm);
    tcg_temp_free_i32(rm);
    opfn(rn1_64, rn1_64, rm_64);
    neon_store_reg64(rn1_64, a->vd + 1);

    tcg_temp_free_i64(rn0_64);
    tcg_temp_free_i64(rn1_64);
    tcg_temp_free_i64(rm_64);

    return true;
}
2009
2010 #define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
2011     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2012     {                                                                   \
2013         static NeonGenWidenFn * const widenfn[] = {                     \
2014             gen_helper_neon_widen_##S##8,                               \
2015             gen_helper_neon_widen_##S##16,                              \
2016             tcg_gen_##EXT##_i32_i64,                                    \
2017             NULL,                                                       \
2018         };                                                              \
2019         static NeonGenTwo64OpFn * const addfn[] = {                     \
2020             gen_helper_neon_##OP##l_u16,                                \
2021             gen_helper_neon_##OP##l_u32,                                \
2022             tcg_gen_##OP##_i64,                                         \
2023             NULL,                                                       \
2024         };                                                              \
2025         return do_prewiden_3d(s, a, widenfn[a->size],                   \
2026                               addfn[a->size], SRC1WIDE);                \
2027     }
2028
2029 DO_PREWIDEN(VADDL_S, s, ext, add, false)
2030 DO_PREWIDEN(VADDL_U, u, extu, add, false)
2031 DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
2032 DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
2033 DO_PREWIDEN(VADDW_S, s, ext, add, true)
2034 DO_PREWIDEN(VADDW_U, u, extu, add, true)
2035 DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
2036 DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
2037
2038 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
2039                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
2040 {
2041     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
2042     TCGv_i64 rn_64, rm_64;
2043     TCGv_i32 rd0, rd1;
2044
2045     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2046         return false;
2047     }
2048
2049     /* UNDEF accesses to D16-D31 if they don't exist. */
2050     if (!dc_isar_feature(aa32_simd_r32, s) &&
2051         ((a->vd | a->vn | a->vm) & 0x10)) {
2052         return false;
2053     }
2054
2055     if (!opfn || !narrowfn) {
2056         /* size == 3 case, which is an entirely different insn group */
2057         return false;
2058     }
2059
2060     if ((a->vn | a->vm) & 1) {
2061         return false;
2062     }
2063
2064     if (!vfp_access_check(s)) {
2065         return true;
2066     }
2067
2068     rn_64 = tcg_temp_new_i64();
2069     rm_64 = tcg_temp_new_i64();
2070     rd0 = tcg_temp_new_i32();
2071     rd1 = tcg_temp_new_i32();
2072
2073     neon_load_reg64(rn_64, a->vn);
2074     neon_load_reg64(rm_64, a->vm);
2075
2076     opfn(rn_64, rn_64, rm_64);
2077
2078     narrowfn(rd0, rn_64);
2079
2080     neon_load_reg64(rn_64, a->vn + 1);
2081     neon_load_reg64(rm_64, a->vm + 1);
2082
2083     opfn(rn_64, rn_64, rm_64);
2084
2085     narrowfn(rd1, rn_64);
2086
2087     neon_store_reg(a->vd, 0, rd0);
2088     neon_store_reg(a->vd, 1, rd1);
2089
2090     tcg_temp_free_i64(rn_64);
2091     tcg_temp_free_i64(rm_64);
2092
2093     return true;
2094 }
2095
/*
 * Define trans_<INSN>_3d for one narrowing insn.
 *   OP         - pasted into the names of the 64-bit operation
 *   NARROWTYPE - pasted into the 8/16-bit "_high_" narrowing helpers
 *   EXTOP      - narrowing function used for 32-bit results
 * Both tables are indexed by a->size; the NULL slot makes
 * do_narrow_3d() reject size == 3.
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const op_tbl[] = {                    \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        static NeonGenNarrowFn * const narrow_tbl[] = {                 \
            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
            EXTOP,                                                      \
            NULL,                                                       \
        };                                                              \
        NeonGenTwo64OpFn *op_fn = op_tbl[a->size];                      \
        NeonGenNarrowFn *narrow_fn = narrow_tbl[a->size];               \
                                                                        \
        return do_narrow_3d(s, a, op_fn, narrow_fn);                    \
    }
2113
/*
 * Rounding narrow of a 64-bit value to its high 32 bits: add half of
 * the discarded low word (1 << 31) and then take the top half.
 * Note that @rn is modified in place.
 */
static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
{
    tcg_gen_addi_i64(rn, rn, 1u << 31);
    tcg_gen_extrh_i64_i32(rd, rn);
}
2119
/*
 * The plain forms take the high word directly; the rounding forms use
 * the narrow_round helpers and gen_narrow_round_high_u32().
 */
DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2124
2125 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2126                        NeonGenTwoOpWidenFn *opfn,
2127                        NeonGenTwo64OpFn *accfn)
2128 {
2129     /*
2130      * 3-regs different lengths, long operations.
2131      * These perform an operation on two inputs that returns a double-width
2132      * result, and then possibly perform an accumulation operation of
2133      * that result into the double-width destination.
2134      */
2135     TCGv_i64 rd0, rd1, tmp;
2136     TCGv_i32 rn, rm;
2137
2138     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2139         return false;
2140     }
2141
2142     /* UNDEF accesses to D16-D31 if they don't exist. */
2143     if (!dc_isar_feature(aa32_simd_r32, s) &&
2144         ((a->vd | a->vn | a->vm) & 0x10)) {
2145         return false;
2146     }
2147
2148     if (!opfn) {
2149         /* size == 3 case, which is an entirely different insn group */
2150         return false;
2151     }
2152
2153     if (a->vd & 1) {
2154         return false;
2155     }
2156
2157     if (!vfp_access_check(s)) {
2158         return true;
2159     }
2160
2161     rd0 = tcg_temp_new_i64();
2162     rd1 = tcg_temp_new_i64();
2163
2164     rn = neon_load_reg(a->vn, 0);
2165     rm = neon_load_reg(a->vm, 0);
2166     opfn(rd0, rn, rm);
2167     tcg_temp_free_i32(rn);
2168     tcg_temp_free_i32(rm);
2169
2170     rn = neon_load_reg(a->vn, 1);
2171     rm = neon_load_reg(a->vm, 1);
2172     opfn(rd1, rn, rm);
2173     tcg_temp_free_i32(rn);
2174     tcg_temp_free_i32(rm);
2175
2176     /* Don't store results until after all loads: they might overlap */
2177     if (accfn) {
2178         tmp = tcg_temp_new_i64();
2179         neon_load_reg64(tmp, a->vd);
2180         accfn(tmp, tmp, rd0);
2181         neon_store_reg64(tmp, a->vd);
2182         neon_load_reg64(tmp, a->vd + 1);
2183         accfn(tmp, tmp, rd1);
2184         neon_store_reg64(tmp, a->vd + 1);
2185         tcg_temp_free_i64(tmp);
2186     } else {
2187         neon_store_reg64(rd0, a->vd);
2188         neon_store_reg64(rd1, a->vd + 1);
2189     }
2190
2191     tcg_temp_free_i64(rd0);
2192     tcg_temp_free_i64(rd1);
2193
2194     return true;
2195 }
2196
2197 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2198 {
2199     static NeonGenTwoOpWidenFn * const opfn[] = {
2200         gen_helper_neon_abdl_s16,
2201         gen_helper_neon_abdl_s32,
2202         gen_helper_neon_abdl_s64,
2203         NULL,
2204     };
2205
2206     return do_long_3d(s, a, opfn[a->size], NULL);
2207 }
2208
2209 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2210 {
2211     static NeonGenTwoOpWidenFn * const opfn[] = {
2212         gen_helper_neon_abdl_u16,
2213         gen_helper_neon_abdl_u32,
2214         gen_helper_neon_abdl_u64,
2215         NULL,
2216     };
2217
2218     return do_long_3d(s, a, opfn[a->size], NULL);
2219 }
2220
2221 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2222 {
2223     static NeonGenTwoOpWidenFn * const opfn[] = {
2224         gen_helper_neon_abdl_s16,
2225         gen_helper_neon_abdl_s32,
2226         gen_helper_neon_abdl_s64,
2227         NULL,
2228     };
2229     static NeonGenTwo64OpFn * const addfn[] = {
2230         gen_helper_neon_addl_u16,
2231         gen_helper_neon_addl_u32,
2232         tcg_gen_add_i64,
2233         NULL,
2234     };
2235
2236     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2237 }
2238
2239 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2240 {
2241     static NeonGenTwoOpWidenFn * const opfn[] = {
2242         gen_helper_neon_abdl_u16,
2243         gen_helper_neon_abdl_u32,
2244         gen_helper_neon_abdl_u64,
2245         NULL,
2246     };
2247     static NeonGenTwo64OpFn * const addfn[] = {
2248         gen_helper_neon_addl_u16,
2249         gen_helper_neon_addl_u32,
2250         tcg_gen_add_i64,
2251         NULL,
2252     };
2253
2254     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2255 }
2256
2257 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2258 {
2259     TCGv_i32 lo = tcg_temp_new_i32();
2260     TCGv_i32 hi = tcg_temp_new_i32();
2261
2262     tcg_gen_muls2_i32(lo, hi, rn, rm);
2263     tcg_gen_concat_i32_i64(rd, lo, hi);
2264
2265     tcg_temp_free_i32(lo);
2266     tcg_temp_free_i32(hi);
2267 }
2268
2269 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2270 {
2271     TCGv_i32 lo = tcg_temp_new_i32();
2272     TCGv_i32 hi = tcg_temp_new_i32();
2273
2274     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2275     tcg_gen_concat_i32_i64(rd, lo, hi);
2276
2277     tcg_temp_free_i32(lo);
2278     tcg_temp_free_i32(hi);
2279 }
2280
2281 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2282 {
2283     static NeonGenTwoOpWidenFn * const opfn[] = {
2284         gen_helper_neon_mull_s8,
2285         gen_helper_neon_mull_s16,
2286         gen_mull_s32,
2287         NULL,
2288     };
2289
2290     return do_long_3d(s, a, opfn[a->size], NULL);
2291 }
2292
2293 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2294 {
2295     static NeonGenTwoOpWidenFn * const opfn[] = {
2296         gen_helper_neon_mull_u8,
2297         gen_helper_neon_mull_u16,
2298         gen_mull_u32,
2299         NULL,
2300     };
2301
2302     return do_long_3d(s, a, opfn[a->size], NULL);
2303 }
2304
2305 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2306     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2307     {                                                                   \
2308         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2309             gen_helper_neon_##MULL##8,                                  \
2310             gen_helper_neon_##MULL##16,                                 \
2311             gen_##MULL##32,                                             \
2312             NULL,                                                       \
2313         };                                                              \
2314         static NeonGenTwo64OpFn * const accfn[] = {                     \
2315             gen_helper_neon_##ACC##l_u16,                               \
2316             gen_helper_neon_##ACC##l_u32,                               \
2317             tcg_gen_##ACC##_i64,                                        \
2318             NULL,                                                       \
2319         };                                                              \
2320         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2321     }
2322
2323 DO_VMLAL(VMLAL_S,mull_s,add)
2324 DO_VMLAL(VMLAL_U,mull_u,add)
2325 DO_VMLAL(VMLSL_S,mull_s,sub)
2326 DO_VMLAL(VMLSL_U,mull_u,sub)
2327
/*
 * 16-bit VQDMULL step: widening signed multiply, then a saturating add
 * of the product to itself to double it.
 */
static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_helper_neon_mull_s16(rd, rn, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
}
2333
/*
 * 32-bit VQDMULL step: widening signed multiply, then a saturating add
 * of the product to itself to double it.
 */
static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_mull_s32(rd, rn, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
}
2339
2340 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2341 {
2342     static NeonGenTwoOpWidenFn * const opfn[] = {
2343         NULL,
2344         gen_VQDMULL_16,
2345         gen_VQDMULL_32,
2346         NULL,
2347     };
2348
2349     return do_long_3d(s, a, opfn[a->size], NULL);
2350 }
2351
/*
 * Accumulate step for 16-bit VQDMLAL: rd = saturating add of rn and rm,
 * using the s32 saturating-add helper (which takes cpu_env).
 */
static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2356
/*
 * Accumulate step for 32-bit VQDMLAL: rd = saturating add of rn and rm,
 * using the s64 saturating-add helper (which takes cpu_env).
 */
static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2361
2362 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2363 {
2364     static NeonGenTwoOpWidenFn * const opfn[] = {
2365         NULL,
2366         gen_VQDMULL_16,
2367         gen_VQDMULL_32,
2368         NULL,
2369     };
2370     static NeonGenTwo64OpFn * const accfn[] = {
2371         NULL,
2372         gen_VQDMLAL_acc_16,
2373         gen_VQDMLAL_acc_32,
2374         NULL,
2375     };
2376
2377     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2378 }
2379
/*
 * Accumulate step for 16-bit VQDMLSL: negate rm, then saturating-add it
 * to rn.  Note that rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_negl_u32(rm, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2385
/*
 * Accumulate step for 32-bit VQDMLSL: negate rm, then saturating-add it
 * to rn.  Note that rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    tcg_gen_neg_i64(rm, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2391
2392 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2393 {
2394     static NeonGenTwoOpWidenFn * const opfn[] = {
2395         NULL,
2396         gen_VQDMULL_16,
2397         gen_VQDMULL_32,
2398         NULL,
2399     };
2400     static NeonGenTwo64OpFn * const accfn[] = {
2401         NULL,
2402         gen_VQDMLSL_acc_16,
2403         gen_VQDMLSL_acc_32,
2404         NULL,
2405     };
2406
2407     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2408 }
2409
2410 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2411 {
2412     gen_helper_gvec_3 *fn_gvec;
2413
2414     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2415         return false;
2416     }
2417
2418     /* UNDEF accesses to D16-D31 if they don't exist. */
2419     if (!dc_isar_feature(aa32_simd_r32, s) &&
2420         ((a->vd | a->vn | a->vm) & 0x10)) {
2421         return false;
2422     }
2423
2424     if (a->vd & 1) {
2425         return false;
2426     }
2427
2428     switch (a->size) {
2429     case 0:
2430         fn_gvec = gen_helper_neon_pmull_h;
2431         break;
2432     case 2:
2433         if (!dc_isar_feature(aa32_pmull, s)) {
2434             return false;
2435         }
2436         fn_gvec = gen_helper_gvec_pmull_q;
2437         break;
2438     default:
2439         return false;
2440     }
2441
2442     if (!vfp_access_check(s)) {
2443         return true;
2444     }
2445
2446     tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
2447                        neon_reg_offset(a->vn, 0),
2448                        neon_reg_offset(a->vm, 0),
2449                        16, 16, 0, fn_gvec);
2450     return true;
2451 }
2452
2453 static void gen_neon_dup_low16(TCGv_i32 var)
2454 {
2455     TCGv_i32 tmp = tcg_temp_new_i32();
2456     tcg_gen_ext16u_i32(var, var);
2457     tcg_gen_shli_i32(tmp, var, 16);
2458     tcg_gen_or_i32(var, var, tmp);
2459     tcg_temp_free_i32(tmp);
2460 }
2461
2462 static void gen_neon_dup_high16(TCGv_i32 var)
2463 {
2464     TCGv_i32 tmp = tcg_temp_new_i32();
2465     tcg_gen_andi_i32(var, var, 0xffff0000);
2466     tcg_gen_shri_i32(tmp, var, 16);
2467     tcg_gen_or_i32(var, var, tmp);
2468     tcg_temp_free_i32(tmp);
2469 }
2470
2471 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2472 {
2473     TCGv_i32 tmp;
2474     if (size == 1) {
2475         tmp = neon_load_reg(reg & 7, reg >> 4);
2476         if (reg & 8) {
2477             gen_neon_dup_high16(tmp);
2478         } else {
2479             gen_neon_dup_low16(tmp);
2480         }
2481     } else {
2482         tmp = neon_load_reg(reg & 15, reg >> 4);
2483     }
2484     return tmp;
2485 }
2486
2487 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2488                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2489 {
2490     /*
2491      * Two registers and a scalar: perform an operation between
2492      * the input elements and the scalar, and then possibly
2493      * perform an accumulation operation of that result into the
2494      * destination.
2495      */
2496     TCGv_i32 scalar;
2497     int pass;
2498
2499     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2500         return false;
2501     }
2502
2503     /* UNDEF accesses to D16-D31 if they don't exist. */
2504     if (!dc_isar_feature(aa32_simd_r32, s) &&
2505         ((a->vd | a->vn | a->vm) & 0x10)) {
2506         return false;
2507     }
2508
2509     if (!opfn) {
2510         /* Bad size (including size == 3, which is a different insn group) */
2511         return false;
2512     }
2513
2514     if (a->q && ((a->vd | a->vn) & 1)) {
2515         return false;
2516     }
2517
2518     if (!vfp_access_check(s)) {
2519         return true;
2520     }
2521
2522     scalar = neon_get_scalar(a->size, a->vm);
2523
2524     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2525         TCGv_i32 tmp = neon_load_reg(a->vn, pass);
2526         opfn(tmp, tmp, scalar);
2527         if (accfn) {
2528             TCGv_i32 rd = neon_load_reg(a->vd, pass);
2529             accfn(tmp, rd, tmp);
2530             tcg_temp_free_i32(rd);
2531         }
2532         neon_store_reg(a->vd, pass, tmp);
2533     }
2534     tcg_temp_free_i32(scalar);
2535     return true;
2536 }
2537
2538 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2539 {
2540     static NeonGenTwoOpFn * const opfn[] = {
2541         NULL,
2542         gen_helper_neon_mul_u16,
2543         tcg_gen_mul_i32,
2544         NULL,
2545     };
2546
2547     return do_2scalar(s, a, opfn[a->size], NULL);
2548 }
2549
2550 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2551 {
2552     static NeonGenTwoOpFn * const opfn[] = {
2553         NULL,
2554         gen_helper_neon_mul_u16,
2555         tcg_gen_mul_i32,
2556         NULL,
2557     };
2558     static NeonGenTwoOpFn * const accfn[] = {
2559         NULL,
2560         gen_helper_neon_add_u16,
2561         tcg_gen_add_i32,
2562         NULL,
2563     };
2564
2565     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2566 }
2567
2568 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2569 {
2570     static NeonGenTwoOpFn * const opfn[] = {
2571         NULL,
2572         gen_helper_neon_mul_u16,
2573         tcg_gen_mul_i32,
2574         NULL,
2575     };
2576     static NeonGenTwoOpFn * const accfn[] = {
2577         NULL,
2578         gen_helper_neon_sub_u16,
2579         tcg_gen_sub_i32,
2580         NULL,
2581     };
2582
2583     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2584 }
2585
2586 /*
2587  * Rather than have a float-specific version of do_2scalar just for
2588  * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
2589  * a NeonGenTwoOpFn.
2590  */
2591 #define WRAP_FP_FN(WRAPNAME, FUNC)                              \
2592     static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
2593     {                                                           \
2594         TCGv_ptr fpstatus = get_fpstatus_ptr(1);                \
2595         FUNC(rd, rn, rm, fpstatus);                             \
2596         tcg_temp_free_ptr(fpstatus);                            \
2597     }
2598
2599 WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
2600 WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
2601 WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
2602
2603 static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
2604 {
2605     static NeonGenTwoOpFn * const opfn[] = {
2606         NULL,
2607         NULL, /* TODO: fp16 support */
2608         gen_VMUL_F_mul,
2609         NULL,
2610     };
2611
2612     return do_2scalar(s, a, opfn[a->size], NULL);
2613 }
2614
2615 static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
2616 {
2617     static NeonGenTwoOpFn * const opfn[] = {
2618         NULL,
2619         NULL, /* TODO: fp16 support */
2620         gen_VMUL_F_mul,
2621         NULL,
2622     };
2623     static NeonGenTwoOpFn * const accfn[] = {
2624         NULL,
2625         NULL, /* TODO: fp16 support */
2626         gen_VMUL_F_add,
2627         NULL,
2628     };
2629
2630     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2631 }
2632
2633 static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
2634 {
2635     static NeonGenTwoOpFn * const opfn[] = {
2636         NULL,
2637         NULL, /* TODO: fp16 support */
2638         gen_VMUL_F_mul,
2639         NULL,
2640     };
2641     static NeonGenTwoOpFn * const accfn[] = {
2642         NULL,
2643         NULL, /* TODO: fp16 support */
2644         gen_VMUL_F_sub,
2645         NULL,
2646     };
2647
2648     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2649 }
2650
/*
 * NOTE(review): WRAP_ENV_FN is defined elsewhere in the file; presumably
 * it wraps a helper that needs cpu_env so that the result has the
 * NeonGenTwoOpFn signature used by the tables below -- confirm against
 * the macro definition.
 */
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2655
2656 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2657 {
2658     static NeonGenTwoOpFn * const opfn[] = {
2659         NULL,
2660         gen_VQDMULH_16,
2661         gen_VQDMULH_32,
2662         NULL,
2663     };
2664
2665     return do_2scalar(s, a, opfn[a->size], NULL);
2666 }
2667
2668 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2669 {
2670     static NeonGenTwoOpFn * const opfn[] = {
2671         NULL,
2672         gen_VQRDMULH_16,
2673         gen_VQRDMULH_32,
2674         NULL,
2675     };
2676
2677     return do_2scalar(s, a, opfn[a->size], NULL);
2678 }
2679
2680 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2681                             NeonGenThreeOpEnvFn *opfn)
2682 {
2683     /*
2684      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2685      * performs a kind of fused op-then-accumulate using a helper
2686      * function that takes all of rd, rn and the scalar at once.
2687      */
2688     TCGv_i32 scalar;
2689     int pass;
2690
2691     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2692         return false;
2693     }
2694
2695     if (!dc_isar_feature(aa32_rdm, s)) {
2696         return false;
2697     }
2698
2699     /* UNDEF accesses to D16-D31 if they don't exist. */
2700     if (!dc_isar_feature(aa32_simd_r32, s) &&
2701         ((a->vd | a->vn | a->vm) & 0x10)) {
2702         return false;
2703     }
2704
2705     if (!opfn) {
2706         /* Bad size (including size == 3, which is a different insn group) */
2707         return false;
2708     }
2709
2710     if (a->q && ((a->vd | a->vn) & 1)) {
2711         return false;
2712     }
2713
2714     if (!vfp_access_check(s)) {
2715         return true;
2716     }
2717
2718     scalar = neon_get_scalar(a->size, a->vm);
2719
2720     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2721         TCGv_i32 rn = neon_load_reg(a->vn, pass);
2722         TCGv_i32 rd = neon_load_reg(a->vd, pass);
2723         opfn(rd, cpu_env, rn, scalar, rd);
2724         tcg_temp_free_i32(rn);
2725         neon_store_reg(a->vd, pass, rd);
2726     }
2727     tcg_temp_free_i32(scalar);
2728
2729     return true;
2730 }
2731
2732 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2733 {
2734     static NeonGenThreeOpEnvFn *opfn[] = {
2735         NULL,
2736         gen_helper_neon_qrdmlah_s16,
2737         gen_helper_neon_qrdmlah_s32,
2738         NULL,
2739     };
2740     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2741 }
2742
2743 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2744 {
2745     static NeonGenThreeOpEnvFn *opfn[] = {
2746         NULL,
2747         gen_helper_neon_qrdmlsh_s16,
2748         gen_helper_neon_qrdmlsh_s32,
2749         NULL,
2750     };
2751     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2752 }
2753
2754 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2755                             NeonGenTwoOpWidenFn *opfn,
2756                             NeonGenTwo64OpFn *accfn)
2757 {
2758     /*
2759      * Two registers and a scalar, long operations: perform an
2760      * operation on the input elements and the scalar which produces
2761      * a double-width result, and then possibly perform an accumulation
2762      * operation of that result into the destination.
2763      */
2764     TCGv_i32 scalar, rn;
2765     TCGv_i64 rn0_64, rn1_64;
2766
2767     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2768         return false;
2769     }
2770
2771     /* UNDEF accesses to D16-D31 if they don't exist. */
2772     if (!dc_isar_feature(aa32_simd_r32, s) &&
2773         ((a->vd | a->vn | a->vm) & 0x10)) {
2774         return false;
2775     }
2776
2777     if (!opfn) {
2778         /* Bad size (including size == 3, which is a different insn group) */
2779         return false;
2780     }
2781
2782     if (a->vd & 1) {
2783         return false;
2784     }
2785
2786     if (!vfp_access_check(s)) {
2787         return true;
2788     }
2789
2790     scalar = neon_get_scalar(a->size, a->vm);
2791
2792     /* Load all inputs before writing any outputs, in case of overlap */
2793     rn = neon_load_reg(a->vn, 0);
2794     rn0_64 = tcg_temp_new_i64();
2795     opfn(rn0_64, rn, scalar);
2796     tcg_temp_free_i32(rn);
2797
2798     rn = neon_load_reg(a->vn, 1);
2799     rn1_64 = tcg_temp_new_i64();
2800     opfn(rn1_64, rn, scalar);
2801     tcg_temp_free_i32(rn);
2802     tcg_temp_free_i32(scalar);
2803
2804     if (accfn) {
2805         TCGv_i64 t64 = tcg_temp_new_i64();
2806         neon_load_reg64(t64, a->vd);
2807         accfn(t64, t64, rn0_64);
2808         neon_store_reg64(t64, a->vd);
2809         neon_load_reg64(t64, a->vd + 1);
2810         accfn(t64, t64, rn1_64);
2811         neon_store_reg64(t64, a->vd + 1);
2812         tcg_temp_free_i64(t64);
2813     } else {
2814         neon_store_reg64(rn0_64, a->vd);
2815         neon_store_reg64(rn1_64, a->vd + 1);
2816     }
2817     tcg_temp_free_i64(rn0_64);
2818     tcg_temp_free_i64(rn1_64);
2819     return true;
2820 }
2821
2822 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2823 {
2824     static NeonGenTwoOpWidenFn * const opfn[] = {
2825         NULL,
2826         gen_helper_neon_mull_s16,
2827         gen_mull_s32,
2828         NULL,
2829     };
2830
2831     return do_2scalar_long(s, a, opfn[a->size], NULL);
2832 }
2833
2834 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2835 {
2836     static NeonGenTwoOpWidenFn * const opfn[] = {
2837         NULL,
2838         gen_helper_neon_mull_u16,
2839         gen_mull_u32,
2840         NULL,
2841     };
2842
2843     return do_2scalar_long(s, a, opfn[a->size], NULL);
2844 }
2845
2846 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2847     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2848     {                                                                   \
2849         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2850             NULL,                                                       \
2851             gen_helper_neon_##MULL##16,                                 \
2852             gen_##MULL##32,                                             \
2853             NULL,                                                       \
2854         };                                                              \
2855         static NeonGenTwo64OpFn * const accfn[] = {                     \
2856             NULL,                                                       \
2857             gen_helper_neon_##ACC##l_u32,                               \
2858             tcg_gen_##ACC##_i64,                                        \
2859             NULL,                                                       \
2860         };                                                              \
2861         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2862     }
2863
2864 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2865 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2866 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2867 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2868
2869 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2870 {
2871     static NeonGenTwoOpWidenFn * const opfn[] = {
2872         NULL,
2873         gen_VQDMULL_16,
2874         gen_VQDMULL_32,
2875         NULL,
2876     };
2877
2878     return do_2scalar_long(s, a, opfn[a->size], NULL);
2879 }
2880
2881 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2882 {
2883     static NeonGenTwoOpWidenFn * const opfn[] = {
2884         NULL,
2885         gen_VQDMULL_16,
2886         gen_VQDMULL_32,
2887         NULL,
2888     };
2889     static NeonGenTwo64OpFn * const accfn[] = {
2890         NULL,
2891         gen_VQDMLAL_acc_16,
2892         gen_VQDMLAL_acc_32,
2893         NULL,
2894     };
2895
2896     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2897 }
2898
2899 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2900 {
2901     static NeonGenTwoOpWidenFn * const opfn[] = {
2902         NULL,
2903         gen_VQDMULL_16,
2904         gen_VQDMULL_32,
2905         NULL,
2906     };
2907     static NeonGenTwo64OpFn * const accfn[] = {
2908         NULL,
2909         gen_VQDMLSL_acc_16,
2910         gen_VQDMLSL_acc_32,
2911         NULL,
2912     };
2913
2914     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2915 }
2916
2917 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2918 {
2919     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2920         return false;
2921     }
2922
2923     /* UNDEF accesses to D16-D31 if they don't exist. */
2924     if (!dc_isar_feature(aa32_simd_r32, s) &&
2925         ((a->vd | a->vn | a->vm) & 0x10)) {
2926         return false;
2927     }
2928
2929     if ((a->vn | a->vm | a->vd) & a->q) {
2930         return false;
2931     }
2932
2933     if (a->imm > 7 && !a->q) {
2934         return false;
2935     }
2936
2937     if (!vfp_access_check(s)) {
2938         return true;
2939     }
2940
2941     if (!a->q) {
2942         /* Extract 64 bits from <Vm:Vn> */
2943         TCGv_i64 left, right, dest;
2944
2945         left = tcg_temp_new_i64();
2946         right = tcg_temp_new_i64();
2947         dest = tcg_temp_new_i64();
2948
2949         neon_load_reg64(right, a->vn);
2950         neon_load_reg64(left, a->vm);
2951         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2952         neon_store_reg64(dest, a->vd);
2953
2954         tcg_temp_free_i64(left);
2955         tcg_temp_free_i64(right);
2956         tcg_temp_free_i64(dest);
2957     } else {
2958         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2959         TCGv_i64 left, middle, right, destleft, destright;
2960
2961         left = tcg_temp_new_i64();
2962         middle = tcg_temp_new_i64();
2963         right = tcg_temp_new_i64();
2964         destleft = tcg_temp_new_i64();
2965         destright = tcg_temp_new_i64();
2966
2967         if (a->imm < 8) {
2968             neon_load_reg64(right, a->vn);
2969             neon_load_reg64(middle, a->vn + 1);
2970             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2971             neon_load_reg64(left, a->vm);
2972             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2973         } else {
2974             neon_load_reg64(right, a->vn + 1);
2975             neon_load_reg64(middle, a->vm);
2976             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2977             neon_load_reg64(left, a->vm + 1);
2978             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2979         }
2980
2981         neon_store_reg64(destright, a->vd);
2982         neon_store_reg64(destleft, a->vd + 1);
2983
2984         tcg_temp_free_i64(destright);
2985         tcg_temp_free_i64(destleft);
2986         tcg_temp_free_i64(right);
2987         tcg_temp_free_i64(middle);
2988         tcg_temp_free_i64(left);
2989     }
2990     return true;
2991 }
2992
2993 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2994 {
2995     int n;
2996     TCGv_i32 tmp, tmp2, tmp3, tmp4;
2997     TCGv_ptr ptr1;
2998
2999     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3000         return false;
3001     }
3002
3003     /* UNDEF accesses to D16-D31 if they don't exist. */
3004     if (!dc_isar_feature(aa32_simd_r32, s) &&
3005         ((a->vd | a->vn | a->vm) & 0x10)) {
3006         return false;
3007     }
3008
3009     if (!vfp_access_check(s)) {
3010         return true;
3011     }
3012
3013     n = a->len + 1;
3014     if ((a->vn + n) > 32) {
3015         /*
3016          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
3017          * helper function running off the end of the register file.
3018          */
3019         return false;
3020     }
3021     n <<= 3;
3022     if (a->op) {
3023         tmp = neon_load_reg(a->vd, 0);
3024     } else {
3025         tmp = tcg_temp_new_i32();
3026         tcg_gen_movi_i32(tmp, 0);
3027     }
3028     tmp2 = neon_load_reg(a->vm, 0);
3029     ptr1 = vfp_reg_ptr(true, a->vn);
3030     tmp4 = tcg_const_i32(n);
3031     gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
3032     tcg_temp_free_i32(tmp);
3033     if (a->op) {
3034         tmp = neon_load_reg(a->vd, 1);
3035     } else {
3036         tmp = tcg_temp_new_i32();
3037         tcg_gen_movi_i32(tmp, 0);
3038     }
3039     tmp3 = neon_load_reg(a->vm, 1);
3040     gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
3041     tcg_temp_free_i32(tmp4);
3042     tcg_temp_free_ptr(ptr1);
3043     neon_store_reg(a->vd, 0, tmp2);
3044     neon_store_reg(a->vd, 1, tmp3);
3045     tcg_temp_free_i32(tmp);
3046     return true;
3047 }
3048
3049 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
3050 {
3051     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3052         return false;
3053     }
3054
3055     /* UNDEF accesses to D16-D31 if they don't exist. */
3056     if (!dc_isar_feature(aa32_simd_r32, s) &&
3057         ((a->vd | a->vm) & 0x10)) {
3058         return false;
3059     }
3060
3061     if (a->vd & a->q) {
3062         return false;
3063     }
3064
3065     if (!vfp_access_check(s)) {
3066         return true;
3067     }
3068
3069     tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
3070                          neon_element_offset(a->vm, a->index, a->size),
3071                          a->q ? 16 : 8, a->q ? 16 : 8);
3072     return true;
3073 }
3074
3075 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3076 {
3077     int pass, half;
3078
3079     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3080         return false;
3081     }
3082
3083     /* UNDEF accesses to D16-D31 if they don't exist. */
3084     if (!dc_isar_feature(aa32_simd_r32, s) &&
3085         ((a->vd | a->vm) & 0x10)) {
3086         return false;
3087     }
3088
3089     if ((a->vd | a->vm) & a->q) {
3090         return false;
3091     }
3092
3093     if (a->size == 3) {
3094         return false;
3095     }
3096
3097     if (!vfp_access_check(s)) {
3098         return true;
3099     }
3100
3101     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3102         TCGv_i32 tmp[2];
3103
3104         for (half = 0; half < 2; half++) {
3105             tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
3106             switch (a->size) {
3107             case 0:
3108                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3109                 break;
3110             case 1:
3111                 gen_swap_half(tmp[half], tmp[half]);
3112                 break;
3113             case 2:
3114                 break;
3115             default:
3116                 g_assert_not_reached();
3117             }
3118         }
3119         neon_store_reg(a->vd, pass * 2, tmp[1]);
3120         neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
3121     }
3122     return true;
3123 }
3124
3125 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3126                               NeonGenWidenFn *widenfn,
3127                               NeonGenTwo64OpFn *opfn,
3128                               NeonGenTwo64OpFn *accfn)
3129 {
3130     /*
3131      * Pairwise long operations: widen both halves of the pair,
3132      * combine the pairs with the opfn, and then possibly accumulate
3133      * into the destination with the accfn.
3134      */
3135     int pass;
3136
3137     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3138         return false;
3139     }
3140
3141     /* UNDEF accesses to D16-D31 if they don't exist. */
3142     if (!dc_isar_feature(aa32_simd_r32, s) &&
3143         ((a->vd | a->vm) & 0x10)) {
3144         return false;
3145     }
3146
3147     if ((a->vd | a->vm) & a->q) {
3148         return false;
3149     }
3150
3151     if (!widenfn) {
3152         return false;
3153     }
3154
3155     if (!vfp_access_check(s)) {
3156         return true;
3157     }
3158
3159     for (pass = 0; pass < a->q + 1; pass++) {
3160         TCGv_i32 tmp;
3161         TCGv_i64 rm0_64, rm1_64, rd_64;
3162
3163         rm0_64 = tcg_temp_new_i64();
3164         rm1_64 = tcg_temp_new_i64();
3165         rd_64 = tcg_temp_new_i64();
3166         tmp = neon_load_reg(a->vm, pass * 2);
3167         widenfn(rm0_64, tmp);
3168         tcg_temp_free_i32(tmp);
3169         tmp = neon_load_reg(a->vm, pass * 2 + 1);
3170         widenfn(rm1_64, tmp);
3171         tcg_temp_free_i32(tmp);
3172         opfn(rd_64, rm0_64, rm1_64);
3173         tcg_temp_free_i64(rm0_64);
3174         tcg_temp_free_i64(rm1_64);
3175
3176         if (accfn) {
3177             TCGv_i64 tmp64 = tcg_temp_new_i64();
3178             neon_load_reg64(tmp64, a->vd + pass);
3179             accfn(rd_64, tmp64, rd_64);
3180             tcg_temp_free_i64(tmp64);
3181         }
3182         neon_store_reg64(rd_64, a->vd + pass);
3183         tcg_temp_free_i64(rd_64);
3184     }
3185     return true;
3186 }
3187
3188 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3189 {
3190     static NeonGenWidenFn * const widenfn[] = {
3191         gen_helper_neon_widen_s8,
3192         gen_helper_neon_widen_s16,
3193         tcg_gen_ext_i32_i64,
3194         NULL,
3195     };
3196     static NeonGenTwo64OpFn * const opfn[] = {
3197         gen_helper_neon_paddl_u16,
3198         gen_helper_neon_paddl_u32,
3199         tcg_gen_add_i64,
3200         NULL,
3201     };
3202
3203     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3204 }
3205
3206 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3207 {
3208     static NeonGenWidenFn * const widenfn[] = {
3209         gen_helper_neon_widen_u8,
3210         gen_helper_neon_widen_u16,
3211         tcg_gen_extu_i32_i64,
3212         NULL,
3213     };
3214     static NeonGenTwo64OpFn * const opfn[] = {
3215         gen_helper_neon_paddl_u16,
3216         gen_helper_neon_paddl_u32,
3217         tcg_gen_add_i64,
3218         NULL,
3219     };
3220
3221     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3222 }
3223
3224 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3225 {
3226     static NeonGenWidenFn * const widenfn[] = {
3227         gen_helper_neon_widen_s8,
3228         gen_helper_neon_widen_s16,
3229         tcg_gen_ext_i32_i64,
3230         NULL,
3231     };
3232     static NeonGenTwo64OpFn * const opfn[] = {
3233         gen_helper_neon_paddl_u16,
3234         gen_helper_neon_paddl_u32,
3235         tcg_gen_add_i64,
3236         NULL,
3237     };
3238     static NeonGenTwo64OpFn * const accfn[] = {
3239         gen_helper_neon_addl_u16,
3240         gen_helper_neon_addl_u32,
3241         tcg_gen_add_i64,
3242         NULL,
3243     };
3244
3245     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3246                              accfn[a->size]);
3247 }
3248
3249 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3250 {
3251     static NeonGenWidenFn * const widenfn[] = {
3252         gen_helper_neon_widen_u8,
3253         gen_helper_neon_widen_u16,
3254         tcg_gen_extu_i32_i64,
3255         NULL,
3256     };
3257     static NeonGenTwo64OpFn * const opfn[] = {
3258         gen_helper_neon_paddl_u16,
3259         gen_helper_neon_paddl_u32,
3260         tcg_gen_add_i64,
3261         NULL,
3262     };
3263     static NeonGenTwo64OpFn * const accfn[] = {
3264         gen_helper_neon_addl_u16,
3265         gen_helper_neon_addl_u32,
3266         tcg_gen_add_i64,
3267         NULL,
3268     };
3269
3270     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3271                              accfn[a->size]);
3272 }
3273
3274 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3275
3276 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3277                        ZipFn *fn)
3278 {
3279     TCGv_ptr pd, pm;
3280
3281     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3282         return false;
3283     }
3284
3285     /* UNDEF accesses to D16-D31 if they don't exist. */
3286     if (!dc_isar_feature(aa32_simd_r32, s) &&
3287         ((a->vd | a->vm) & 0x10)) {
3288         return false;
3289     }
3290
3291     if ((a->vd | a->vm) & a->q) {
3292         return false;
3293     }
3294
3295     if (!fn) {
3296         /* Bad size or size/q combination */
3297         return false;
3298     }
3299
3300     if (!vfp_access_check(s)) {
3301         return true;
3302     }
3303
3304     pd = vfp_reg_ptr(true, a->vd);
3305     pm = vfp_reg_ptr(true, a->vm);
3306     fn(pd, pm);
3307     tcg_temp_free_ptr(pd);
3308     tcg_temp_free_ptr(pm);
3309     return true;
3310 }
3311
3312 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3313 {
3314     static ZipFn * const fn[2][4] = {
3315         {
3316             gen_helper_neon_unzip8,
3317             gen_helper_neon_unzip16,
3318             NULL,
3319             NULL,
3320         }, {
3321             gen_helper_neon_qunzip8,
3322             gen_helper_neon_qunzip16,
3323             gen_helper_neon_qunzip32,
3324             NULL,
3325         }
3326     };
3327     return do_zip_uzp(s, a, fn[a->q][a->size]);
3328 }
3329
3330 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3331 {
3332     static ZipFn * const fn[2][4] = {
3333         {
3334             gen_helper_neon_zip8,
3335             gen_helper_neon_zip16,
3336             NULL,
3337             NULL,
3338         }, {
3339             gen_helper_neon_qzip8,
3340             gen_helper_neon_qzip16,
3341             gen_helper_neon_qzip32,
3342             NULL,
3343         }
3344     };
3345     return do_zip_uzp(s, a, fn[a->q][a->size]);
3346 }
3347
3348 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3349                      NeonGenNarrowEnvFn *narrowfn)
3350 {
3351     TCGv_i64 rm;
3352     TCGv_i32 rd0, rd1;
3353
3354     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3355         return false;
3356     }
3357
3358     /* UNDEF accesses to D16-D31 if they don't exist. */
3359     if (!dc_isar_feature(aa32_simd_r32, s) &&
3360         ((a->vd | a->vm) & 0x10)) {
3361         return false;
3362     }
3363
3364     if (a->vm & 1) {
3365         return false;
3366     }
3367
3368     if (!narrowfn) {
3369         return false;
3370     }
3371
3372     if (!vfp_access_check(s)) {
3373         return true;
3374     }
3375
3376     rm = tcg_temp_new_i64();
3377     rd0 = tcg_temp_new_i32();
3378     rd1 = tcg_temp_new_i32();
3379
3380     neon_load_reg64(rm, a->vm);
3381     narrowfn(rd0, cpu_env, rm);
3382     neon_load_reg64(rm, a->vm + 1);
3383     narrowfn(rd1, cpu_env, rm);
3384     neon_store_reg(a->vd, 0, rd0);
3385     neon_store_reg(a->vd, 1, rd1);
3386     tcg_temp_free_i64(rm);
3387     return true;
3388 }
3389
3390 #define DO_VMOVN(INSN, FUNC)                                    \
3391     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3392     {                                                           \
3393         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3394             FUNC##8,                                            \
3395             FUNC##16,                                           \
3396             FUNC##32,                                           \
3397             NULL,                                               \
3398         };                                                      \
3399         return do_vmovn(s, a, narrowfn[a->size]);               \
3400     }
3401
3402 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3403 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3404 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3405 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3406
3407 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3408 {
3409     TCGv_i32 rm0, rm1;
3410     TCGv_i64 rd;
3411     static NeonGenWidenFn * const widenfns[] = {
3412         gen_helper_neon_widen_u8,
3413         gen_helper_neon_widen_u16,
3414         tcg_gen_extu_i32_i64,
3415         NULL,
3416     };
3417     NeonGenWidenFn *widenfn = widenfns[a->size];
3418
3419     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3420         return false;
3421     }
3422
3423     /* UNDEF accesses to D16-D31 if they don't exist. */
3424     if (!dc_isar_feature(aa32_simd_r32, s) &&
3425         ((a->vd | a->vm) & 0x10)) {
3426         return false;
3427     }
3428
3429     if (a->vd & 1) {
3430         return false;
3431     }
3432
3433     if (!widenfn) {
3434         return false;
3435     }
3436
3437     if (!vfp_access_check(s)) {
3438         return true;
3439     }
3440
3441     rd = tcg_temp_new_i64();
3442
3443     rm0 = neon_load_reg(a->vm, 0);
3444     rm1 = neon_load_reg(a->vm, 1);
3445
3446     widenfn(rd, rm0);
3447     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3448     neon_store_reg64(rd, a->vd);
3449     widenfn(rd, rm1);
3450     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3451     neon_store_reg64(rd, a->vd + 1);
3452
3453     tcg_temp_free_i64(rd);
3454     tcg_temp_free_i32(rm0);
3455     tcg_temp_free_i32(rm1);
3456     return true;
3457 }
3458
3459 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3460 {
3461     TCGv_ptr fpst;
3462     TCGv_i32 ahp, tmp, tmp2, tmp3;
3463
3464     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3465         !dc_isar_feature(aa32_fp16_spconv, s)) {
3466         return false;
3467     }
3468
3469     /* UNDEF accesses to D16-D31 if they don't exist. */
3470     if (!dc_isar_feature(aa32_simd_r32, s) &&
3471         ((a->vd | a->vm) & 0x10)) {
3472         return false;
3473     }
3474
3475     if ((a->vm & 1) || (a->size != 1)) {
3476         return false;
3477     }
3478
3479     if (!vfp_access_check(s)) {
3480         return true;
3481     }
3482
3483     fpst = get_fpstatus_ptr(true);
3484     ahp = get_ahp_flag();
3485     tmp = neon_load_reg(a->vm, 0);
3486     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3487     tmp2 = neon_load_reg(a->vm, 1);
3488     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3489     tcg_gen_shli_i32(tmp2, tmp2, 16);
3490     tcg_gen_or_i32(tmp2, tmp2, tmp);
3491     tcg_temp_free_i32(tmp);
3492     tmp = neon_load_reg(a->vm, 2);
3493     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3494     tmp3 = neon_load_reg(a->vm, 3);
3495     neon_store_reg(a->vd, 0, tmp2);
3496     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3497     tcg_gen_shli_i32(tmp3, tmp3, 16);
3498     tcg_gen_or_i32(tmp3, tmp3, tmp);
3499     neon_store_reg(a->vd, 1, tmp3);
3500     tcg_temp_free_i32(tmp);
3501     tcg_temp_free_i32(ahp);
3502     tcg_temp_free_ptr(fpst);
3503
3504     return true;
3505 }
3506
3507 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3508 {
3509     TCGv_ptr fpst;
3510     TCGv_i32 ahp, tmp, tmp2, tmp3;
3511
3512     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3513         !dc_isar_feature(aa32_fp16_spconv, s)) {
3514         return false;
3515     }
3516
3517     /* UNDEF accesses to D16-D31 if they don't exist. */
3518     if (!dc_isar_feature(aa32_simd_r32, s) &&
3519         ((a->vd | a->vm) & 0x10)) {
3520         return false;
3521     }
3522
3523     if ((a->vd & 1) || (a->size != 1)) {
3524         return false;
3525     }
3526
3527     if (!vfp_access_check(s)) {
3528         return true;
3529     }
3530
3531     fpst = get_fpstatus_ptr(true);
3532     ahp = get_ahp_flag();
3533     tmp3 = tcg_temp_new_i32();
3534     tmp = neon_load_reg(a->vm, 0);
3535     tmp2 = neon_load_reg(a->vm, 1);
3536     tcg_gen_ext16u_i32(tmp3, tmp);
3537     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3538     neon_store_reg(a->vd, 0, tmp3);
3539     tcg_gen_shri_i32(tmp, tmp, 16);
3540     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3541     neon_store_reg(a->vd, 1, tmp);
3542     tmp3 = tcg_temp_new_i32();
3543     tcg_gen_ext16u_i32(tmp3, tmp2);
3544     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3545     neon_store_reg(a->vd, 2, tmp3);
3546     tcg_gen_shri_i32(tmp2, tmp2, 16);
3547     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3548     neon_store_reg(a->vd, 3, tmp2);
3549     tcg_temp_free_i32(ahp);
3550     tcg_temp_free_ptr(fpst);
3551
3552     return true;
3553 }
3554
3555 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3556 {
3557     int vec_size = a->q ? 16 : 8;
3558     int rd_ofs = neon_reg_offset(a->vd, 0);
3559     int rm_ofs = neon_reg_offset(a->vm, 0);
3560
3561     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3562         return false;
3563     }
3564
3565     /* UNDEF accesses to D16-D31 if they don't exist. */
3566     if (!dc_isar_feature(aa32_simd_r32, s) &&
3567         ((a->vd | a->vm) & 0x10)) {
3568         return false;
3569     }
3570
3571     if (a->size == 3) {
3572         return false;
3573     }
3574
3575     if ((a->vd | a->vm) & a->q) {
3576         return false;
3577     }
3578
3579     if (!vfp_access_check(s)) {
3580         return true;
3581     }
3582
3583     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3584
3585     return true;
3586 }
3587
3588 #define DO_2MISC_VEC(INSN, FN)                                  \
3589     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3590     {                                                           \
3591         return do_2misc_vec(s, a, FN);                          \
3592     }
3593
3594 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3595 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3596 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3597 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3598 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3599 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3600 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3601
3602 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3603 {
3604     if (a->size != 0) {
3605         return false;
3606     }
3607     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3608 }
3609
3610 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3611     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3612                          uint32_t rm_ofs, uint32_t oprsz,               \
3613                          uint32_t maxsz)                                \
3614     {                                                                   \
3615         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3616                            DATA, FUNC);                                 \
3617     }
3618
3619 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3620     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3621                          uint32_t rm_ofs, uint32_t oprsz,               \
3622                          uint32_t maxsz)                                \
3623     {                                                                   \
3624         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3625     }
3626
3627 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3628 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3629 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3630 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3631 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3632 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3633 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3634
3635 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3636     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3637     {                                                           \
3638         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3639             return false;                                       \
3640         }                                                       \
3641         return do_2misc_vec(s, a, gen_##INSN);                  \
3642     }
3643
3644 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3645 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3646 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3647 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3648 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3649 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3650 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3651
3652 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3653 {
3654     int pass;
3655
3656     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3657     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3658         return false;
3659     }
3660
3661     /* UNDEF accesses to D16-D31 if they don't exist. */
3662     if (!dc_isar_feature(aa32_simd_r32, s) &&
3663         ((a->vd | a->vm) & 0x10)) {
3664         return false;
3665     }
3666
3667     if (!fn) {
3668         return false;
3669     }
3670
3671     if ((a->vd | a->vm) & a->q) {
3672         return false;
3673     }
3674
3675     if (!vfp_access_check(s)) {
3676         return true;
3677     }
3678
3679     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3680         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
3681         fn(tmp, tmp);
3682         neon_store_reg(a->vd, pass, tmp);
3683     }
3684
3685     return true;
3686 }
3687
3688 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3689 {
3690     static NeonGenOneOpFn * const fn[] = {
3691         tcg_gen_bswap32_i32,
3692         gen_swap_half,
3693         NULL,
3694         NULL,
3695     };
3696     return do_2misc(s, a, fn[a->size]);
3697 }
3698
3699 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3700 {
3701     if (a->size != 0) {
3702         return false;
3703     }
3704     return do_2misc(s, a, gen_rev16);
3705 }
3706
3707 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3708 {
3709     static NeonGenOneOpFn * const fn[] = {
3710         gen_helper_neon_cls_s8,
3711         gen_helper_neon_cls_s16,
3712         gen_helper_neon_cls_s32,
3713         NULL,
3714     };
3715     return do_2misc(s, a, fn[a->size]);
3716 }
3717
3718 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3719 {
3720     tcg_gen_clzi_i32(rd, rm, 32);
3721 }
3722
3723 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3724 {
3725     static NeonGenOneOpFn * const fn[] = {
3726         gen_helper_neon_clz_u8,
3727         gen_helper_neon_clz_u16,
3728         do_VCLZ_32,
3729         NULL,
3730     };
3731     return do_2misc(s, a, fn[a->size]);
3732 }
3733
3734 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3735 {
3736     if (a->size != 0) {
3737         return false;
3738     }
3739     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3740 }
3741
3742 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3743 {
3744     if (a->size != 2) {
3745         return false;
3746     }
3747     /* TODO: FP16 : size == 1 */
3748     return do_2misc(s, a, gen_helper_vfp_abss);
3749 }
3750
3751 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3752 {
3753     if (a->size != 2) {
3754         return false;
3755     }
3756     /* TODO: FP16 : size == 1 */
3757     return do_2misc(s, a, gen_helper_vfp_negs);
3758 }
3759
3760 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3761 {
3762     if (a->size != 2) {
3763         return false;
3764     }
3765     return do_2misc(s, a, gen_helper_recpe_u32);
3766 }
3767
3768 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3769 {
3770     if (a->size != 2) {
3771         return false;
3772     }
3773     return do_2misc(s, a, gen_helper_rsqrte_u32);
3774 }
3775
3776 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3777     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3778     {                                                   \
3779         FUNC(d, cpu_env, m);                            \
3780     }
3781
3782 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3783 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3784 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3785 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3786 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3787 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3788
3789 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3790 {
3791     static NeonGenOneOpFn * const fn[] = {
3792         gen_VQABS_s8,
3793         gen_VQABS_s16,
3794         gen_VQABS_s32,
3795         NULL,
3796     };
3797     return do_2misc(s, a, fn[a->size]);
3798 }
3799
3800 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3801 {
3802     static NeonGenOneOpFn * const fn[] = {
3803         gen_VQNEG_s8,
3804         gen_VQNEG_s16,
3805         gen_VQNEG_s32,
3806         NULL,
3807     };
3808     return do_2misc(s, a, fn[a->size]);
3809 }
3810
3811 static bool do_2misc_fp(DisasContext *s, arg_2misc *a,
3812                         NeonGenOneSingleOpFn *fn)
3813 {
3814     int pass;
3815     TCGv_ptr fpst;
3816
3817     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3818     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3819         return false;
3820     }
3821
3822     /* UNDEF accesses to D16-D31 if they don't exist. */
3823     if (!dc_isar_feature(aa32_simd_r32, s) &&
3824         ((a->vd | a->vm) & 0x10)) {
3825         return false;
3826     }
3827
3828     if (a->size != 2) {
3829         /* TODO: FP16 will be the size == 1 case */
3830         return false;
3831     }
3832
3833     if ((a->vd | a->vm) & a->q) {
3834         return false;
3835     }
3836
3837     if (!vfp_access_check(s)) {
3838         return true;
3839     }
3840
3841     fpst = get_fpstatus_ptr(1);
3842     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3843         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
3844         fn(tmp, tmp, fpst);
3845         neon_store_reg(a->vd, pass, tmp);
3846     }
3847     tcg_temp_free_ptr(fpst);
3848
3849     return true;
3850 }
3851
3852 #define DO_2MISC_FP(INSN, FUNC)                                 \
3853     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3854     {                                                           \
3855         return do_2misc_fp(s, a, FUNC);                         \
3856     }
3857
3858 DO_2MISC_FP(VRECPE_F, gen_helper_recpe_f32)
3859 DO_2MISC_FP(VRSQRTE_F, gen_helper_rsqrte_f32)
3860 DO_2MISC_FP(VCVT_FS, gen_helper_vfp_sitos)
3861 DO_2MISC_FP(VCVT_FU, gen_helper_vfp_uitos)
3862 DO_2MISC_FP(VCVT_SF, gen_helper_vfp_tosizs)
3863 DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs)
3864
3865 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3866 {
3867     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3868         return false;
3869     }
3870     return do_2misc_fp(s, a, gen_helper_rints_exact);
3871 }
3872
3873 #define WRAP_FP_CMP0_FWD(WRAPNAME, FUNC)                        \
3874     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
3875     {                                                           \
3876         TCGv_i32 zero = tcg_const_i32(0);                       \
3877         FUNC(d, m, zero, fpst);                                 \
3878         tcg_temp_free_i32(zero);                                \
3879     }
3880 #define WRAP_FP_CMP0_REV(WRAPNAME, FUNC)                        \
3881     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
3882     {                                                           \
3883         TCGv_i32 zero = tcg_const_i32(0);                       \
3884         FUNC(d, zero, m, fpst);                                 \
3885         tcg_temp_free_i32(zero);                                \
3886     }
3887
3888 #define DO_FP_CMP0(INSN, FUNC, REV)                             \
3889     WRAP_FP_CMP0_##REV(gen_##INSN, FUNC)                        \
3890     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3891     {                                                           \
3892         return do_2misc_fp(s, a, gen_##INSN);                   \
3893     }
3894
3895 DO_FP_CMP0(VCGT0_F, gen_helper_neon_cgt_f32, FWD)
3896 DO_FP_CMP0(VCGE0_F, gen_helper_neon_cge_f32, FWD)
3897 DO_FP_CMP0(VCEQ0_F, gen_helper_neon_ceq_f32, FWD)
3898 DO_FP_CMP0(VCLE0_F, gen_helper_neon_cge_f32, REV)
3899 DO_FP_CMP0(VCLT0_F, gen_helper_neon_cgt_f32, REV)
3900
3901 static bool do_vrint(DisasContext *s, arg_2misc *a, int rmode)
3902 {
3903     /*
3904      * Handle a VRINT* operation by iterating 32 bits at a time,
3905      * with a specified rounding mode in operation.
3906      */
3907     int pass;
3908     TCGv_ptr fpst;
3909     TCGv_i32 tcg_rmode;
3910
3911     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3912         !arm_dc_feature(s, ARM_FEATURE_V8)) {
3913         return false;
3914     }
3915
3916     /* UNDEF accesses to D16-D31 if they don't exist. */
3917     if (!dc_isar_feature(aa32_simd_r32, s) &&
3918         ((a->vd | a->vm) & 0x10)) {
3919         return false;
3920     }
3921
3922     if (a->size != 2) {
3923         /* TODO: FP16 will be the size == 1 case */
3924         return false;
3925     }
3926
3927     if ((a->vd | a->vm) & a->q) {
3928         return false;
3929     }
3930
3931     if (!vfp_access_check(s)) {
3932         return true;
3933     }
3934
3935     fpst = get_fpstatus_ptr(1);
3936     tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
3937     gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
3938     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3939         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
3940         gen_helper_rints(tmp, tmp, fpst);
3941         neon_store_reg(a->vd, pass, tmp);
3942     }
3943     gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
3944     tcg_temp_free_i32(tcg_rmode);
3945     tcg_temp_free_ptr(fpst);
3946
3947     return true;
3948 }
3949
3950 #define DO_VRINT(INSN, RMODE)                                   \
3951     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3952     {                                                           \
3953         return do_vrint(s, a, RMODE);                           \
3954     }
3955
3956 DO_VRINT(VRINTN, FPROUNDING_TIEEVEN)
3957 DO_VRINT(VRINTA, FPROUNDING_TIEAWAY)
3958 DO_VRINT(VRINTZ, FPROUNDING_ZERO)
3959 DO_VRINT(VRINTM, FPROUNDING_NEGINF)
3960 DO_VRINT(VRINTP, FPROUNDING_POSINF)
3961
3962 static bool do_vcvt(DisasContext *s, arg_2misc *a, int rmode, bool is_signed)
3963 {
3964     /*
3965      * Handle a VCVT* operation by iterating 32 bits at a time,
3966      * with a specified rounding mode in operation.
3967      */
3968     int pass;
3969     TCGv_ptr fpst;
3970     TCGv_i32 tcg_rmode, tcg_shift;
3971
3972     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3973         !arm_dc_feature(s, ARM_FEATURE_V8)) {
3974         return false;
3975     }
3976
3977     /* UNDEF accesses to D16-D31 if they don't exist. */
3978     if (!dc_isar_feature(aa32_simd_r32, s) &&
3979         ((a->vd | a->vm) & 0x10)) {
3980         return false;
3981     }
3982
3983     if (a->size != 2) {
3984         /* TODO: FP16 will be the size == 1 case */
3985         return false;
3986     }
3987
3988     if ((a->vd | a->vm) & a->q) {
3989         return false;
3990     }
3991
3992     if (!vfp_access_check(s)) {
3993         return true;
3994     }
3995
3996     fpst = get_fpstatus_ptr(1);
3997     tcg_shift = tcg_const_i32(0);
3998     tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
3999     gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
4000     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4001         TCGv_i32 tmp = neon_load_reg(a->vm, pass);
4002         if (is_signed) {
4003             gen_helper_vfp_tosls(tmp, tmp, tcg_shift, fpst);
4004         } else {
4005             gen_helper_vfp_touls(tmp, tmp, tcg_shift, fpst);
4006         }
4007         neon_store_reg(a->vd, pass, tmp);
4008     }
4009     gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
4010     tcg_temp_free_i32(tcg_rmode);
4011     tcg_temp_free_i32(tcg_shift);
4012     tcg_temp_free_ptr(fpst);
4013
4014     return true;
4015 }
4016
4017 #define DO_VCVT(INSN, RMODE, SIGNED)                            \
4018     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
4019     {                                                           \
4020         return do_vcvt(s, a, RMODE, SIGNED);                    \
4021     }
4022
4023 DO_VCVT(VCVTAU, FPROUNDING_TIEAWAY, false)
4024 DO_VCVT(VCVTAS, FPROUNDING_TIEAWAY, true)
4025 DO_VCVT(VCVTNU, FPROUNDING_TIEEVEN, false)
4026 DO_VCVT(VCVTNS, FPROUNDING_TIEEVEN, true)
4027 DO_VCVT(VCVTPU, FPROUNDING_POSINF, false)
4028 DO_VCVT(VCVTPS, FPROUNDING_POSINF, true)
4029 DO_VCVT(VCVTMU, FPROUNDING_NEGINF, false)
4030 DO_VCVT(VCVTMS, FPROUNDING_NEGINF, true)
4031
4032 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
4033 {
4034     TCGv_i64 rm, rd;
4035     int pass;
4036
4037     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
4038         return false;
4039     }
4040
4041     /* UNDEF accesses to D16-D31 if they don't exist. */
4042     if (!dc_isar_feature(aa32_simd_r32, s) &&
4043         ((a->vd | a->vm) & 0x10)) {
4044         return false;
4045     }
4046
4047     if (a->size != 0) {
4048         return false;
4049     }
4050
4051     if ((a->vd | a->vm) & a->q) {
4052         return false;
4053     }
4054
4055     if (!vfp_access_check(s)) {
4056         return true;
4057     }
4058
4059     rm = tcg_temp_new_i64();
4060     rd = tcg_temp_new_i64();
4061     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
4062         neon_load_reg64(rm, a->vm + pass);
4063         neon_load_reg64(rd, a->vd + pass);
4064         neon_store_reg64(rm, a->vd + pass);
4065         neon_store_reg64(rd, a->vm + pass);
4066     }
4067     tcg_temp_free_i64(rm);
4068     tcg_temp_free_i64(rd);
4069
4070     return true;
4071 }
4072 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
4073 {
4074     TCGv_i32 rd, tmp;
4075
4076     rd = tcg_temp_new_i32();
4077     tmp = tcg_temp_new_i32();
4078
4079     tcg_gen_shli_i32(rd, t0, 8);
4080     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
4081     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
4082     tcg_gen_or_i32(rd, rd, tmp);
4083
4084     tcg_gen_shri_i32(t1, t1, 8);
4085     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
4086     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
4087     tcg_gen_or_i32(t1, t1, tmp);
4088     tcg_gen_mov_i32(t0, rd);
4089
4090     tcg_temp_free_i32(tmp);
4091     tcg_temp_free_i32(rd);
4092 }
4093
4094 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
4095 {
4096     TCGv_i32 rd, tmp;
4097
4098     rd = tcg_temp_new_i32();
4099     tmp = tcg_temp_new_i32();
4100
4101     tcg_gen_shli_i32(rd, t0, 16);
4102     tcg_gen_andi_i32(tmp, t1, 0xffff);
4103     tcg_gen_or_i32(rd, rd, tmp);
4104     tcg_gen_shri_i32(t1, t1, 16);
4105     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
4106     tcg_gen_or_i32(t1, t1, tmp);
4107     tcg_gen_mov_i32(t0, rd);
4108
4109     tcg_temp_free_i32(tmp);
4110     tcg_temp_free_i32(rd);
4111 }
4112
4113 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
4114 {
4115     TCGv_i32 tmp, tmp2;
4116     int pass;
4117
4118     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
4119         return false;
4120     }
4121
4122     /* UNDEF accesses to D16-D31 if they don't exist. */
4123     if (!dc_isar_feature(aa32_simd_r32, s) &&
4124         ((a->vd | a->vm) & 0x10)) {
4125         return false;
4126     }
4127
4128     if ((a->vd | a->vm) & a->q) {
4129         return false;
4130     }
4131
4132     if (a->size == 3) {
4133         return false;
4134     }
4135
4136     if (!vfp_access_check(s)) {
4137         return true;
4138     }
4139
4140     if (a->size == 2) {
4141         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
4142             tmp = neon_load_reg(a->vm, pass);
4143             tmp2 = neon_load_reg(a->vd, pass + 1);
4144             neon_store_reg(a->vm, pass, tmp2);
4145             neon_store_reg(a->vd, pass + 1, tmp);
4146         }
4147     } else {
4148         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4149             tmp = neon_load_reg(a->vm, pass);
4150             tmp2 = neon_load_reg(a->vd, pass);
4151             if (a->size == 0) {
4152                 gen_neon_trn_u8(tmp, tmp2);
4153             } else {
4154                 gen_neon_trn_u16(tmp, tmp2);
4155             }
4156             neon_store_reg(a->vm, pass, tmp2);
4157             neon_store_reg(a->vd, pass, tmp);
4158         }
4159     }
4160     return true;
4161 }