target/arm/tcg/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "translate.h"
  25 #include "translate-a32.h"
  26
  27 /* Include the generated Neon decoder */
  28 #include "decode-neon-dp.c.inc"
  29 #include "decode-neon-ls.c.inc"
  30 #include "decode-neon-shared.c.inc"
  31
  32 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  33 {
  34     TCGv_ptr ret = tcg_temp_new_ptr();
  35     tcg_gen_addi_ptr(ret, tcg_env, vfp_reg_offset(dp, reg));
  36     return ret;
  37 }
  38
  39 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  40 {
  41     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  42
  43     switch (mop) {
  44     case MO_UB:
  45         tcg_gen_ld8u_i32(var, tcg_env, offset);
  46         break;
  47     case MO_UW:
  48         tcg_gen_ld16u_i32(var, tcg_env, offset);
  49         break;
  50     case MO_UL:
  51         tcg_gen_ld_i32(var, tcg_env, offset);
  52         break;
  53     default:
  54         g_assert_not_reached();
  55     }
  56 }
  57
  58 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  59 {
  60     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  61
  62     switch (mop) {
  63     case MO_UB:
  64         tcg_gen_ld8u_i64(var, tcg_env, offset);
  65         break;
  66     case MO_UW:
  67         tcg_gen_ld16u_i64(var, tcg_env, offset);
  68         break;
  69     case MO_UL:
  70         tcg_gen_ld32u_i64(var, tcg_env, offset);
  71         break;
  72     case MO_UQ:
  73         tcg_gen_ld_i64(var, tcg_env, offset);
  74         break;
  75     default:
  76         g_assert_not_reached();
  77     }
  78 }
  79
  80 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  81 {
  82     long offset = neon_element_offset(reg, ele, size);
  83
  84     switch (size) {
  85     case MO_8:
  86         tcg_gen_st8_i32(var, tcg_env, offset);
  87         break;
  88     case MO_16:
  89         tcg_gen_st16_i32(var, tcg_env, offset);
  90         break;
  91     case MO_32:
  92         tcg_gen_st_i32(var, tcg_env, offset);
  93         break;
  94     default:
  95         g_assert_not_reached();
  96     }
  97 }
  98
  99 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 100 {
 101     long offset = neon_element_offset(reg, ele, size);
 102
 103     switch (size) {
 104     case MO_8:
 105         tcg_gen_st8_i64(var, tcg_env, offset);
 106         break;
 107     case MO_16:
 108         tcg_gen_st16_i64(var, tcg_env, offset);
 109         break;
 110     case MO_32:
 111         tcg_gen_st32_i64(var, tcg_env, offset);
 112         break;
 113     case MO_64:
 114         tcg_gen_st_i64(var, tcg_env, offset);
 115         break;
 116     default:
 117         g_assert_not_reached();
 118     }
 119 }
 120
 121 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
 122                          int data, gen_helper_gvec_4 *fn_gvec)
 123 {
 124     /* UNDEF accesses to D16-D31 if they don't exist. */
 125     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 126         return false;
 127     }
 128
 129     /*
 130      * UNDEF accesses to odd registers for each bit of Q.
 131      * Q will be 0b111 for all Q-reg instructions, otherwise
 132      * when we have mixed Q- and D-reg inputs.
 133      */
 134     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 135         return false;
 136     }
 137
 138     if (!vfp_access_check(s)) {
 139         return true;
 140     }
 141
 142     int opr_sz = q ? 16 : 8;
 143     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
 144                        vfp_reg_offset(1, vn),
 145                        vfp_reg_offset(1, vm),
 146                        vfp_reg_offset(1, vd),
 147                        opr_sz, opr_sz, data, fn_gvec);
 148     return true;
 149 }
 150
 151 static bool do_neon_ddda_env(DisasContext *s, int q, int vd, int vn, int vm,
 152                              int data, gen_helper_gvec_4_ptr *fn_gvec)
 153 {
 154     /* UNDEF accesses to D16-D31 if they don't exist. */
 155     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 156         return false;
 157     }
 158
 159     /*
 160      * UNDEF accesses to odd registers for each bit of Q.
 161      * Q will be 0b111 for all Q-reg instructions, otherwise
 162      * when we have mixed Q- and D-reg inputs.
 163      */
 164     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 165         return false;
 166     }
 167
 168     if (!vfp_access_check(s)) {
 169         return true;
 170     }
 171
 172     int opr_sz = q ? 16 : 8;
 173     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 174                        vfp_reg_offset(1, vn),
 175                        vfp_reg_offset(1, vm),
 176                        vfp_reg_offset(1, vd),
 177                        tcg_env,
 178                        opr_sz, opr_sz, data, fn_gvec);
 179     return true;
 180 }
 181
 182 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
 183                               int data, ARMFPStatusFlavour fp_flavour,
 184                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
 185 {
 186     /* UNDEF accesses to D16-D31 if they don't exist. */
 187     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 188         return false;
 189     }
 190
 191     /*
 192      * UNDEF accesses to odd registers for each bit of Q.
 193      * Q will be 0b111 for all Q-reg instructions, otherwise
 194      * when we have mixed Q- and D-reg inputs.
 195      */
 196     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 197         return false;
 198     }
 199
 200     if (!vfp_access_check(s)) {
 201         return true;
 202     }
 203
 204     int opr_sz = q ? 16 : 8;
 205     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
 206
 207     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 208                        vfp_reg_offset(1, vn),
 209                        vfp_reg_offset(1, vm),
 210                        vfp_reg_offset(1, vd),
 211                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
 212     return true;
 213 }
 214
 215 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 216 {
 217     if (!dc_isar_feature(aa32_vcma, s)) {
 218         return false;
 219     }
 220     if (a->size == MO_16) {
 221         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 222             return false;
 223         }
 224         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 225                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
 226     }
 227     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 228                              FPST_STD, gen_helper_gvec_fcmlas);
 229 }
 230
 231 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 232 {
 233     int opr_sz;
 234     TCGv_ptr fpst;
 235     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 236
 237     if (!dc_isar_feature(aa32_vcma, s)
 238         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 239         return false;
 240     }
 241
 242     /* UNDEF accesses to D16-D31 if they don't exist. */
 243     if (!dc_isar_feature(aa32_simd_r32, s) &&
 244         ((a->vd | a->vn | a->vm) & 0x10)) {
 245         return false;
 246     }
 247
 248     if ((a->vn | a->vm | a->vd) & a->q) {
 249         return false;
 250     }
 251
 252     if (!vfp_access_check(s)) {
 253         return true;
 254     }
 255
 256     opr_sz = (1 + a->q) * 8;
 257     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 258     fn_gvec_ptr = (a->size == MO_16) ?
 259         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 260     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 261                        vfp_reg_offset(1, a->vn),
 262                        vfp_reg_offset(1, a->vm),
 263                        fpst, opr_sz, opr_sz, a->rot,
 264                        fn_gvec_ptr);
 265     return true;
 266 }
 267
 268 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 269 {
 270     if (!dc_isar_feature(aa32_dp, s)) {
 271         return false;
 272     }
 273     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 274                         gen_helper_gvec_sdot_b);
 275 }
 276
 277 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 278 {
 279     if (!dc_isar_feature(aa32_dp, s)) {
 280         return false;
 281     }
 282     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 283                         gen_helper_gvec_udot_b);
 284 }
 285
 286 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 287 {
 288     if (!dc_isar_feature(aa32_i8mm, s)) {
 289         return false;
 290     }
 291     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 292                         gen_helper_gvec_usdot_b);
 293 }
 294
 295 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 296 {
 297     if (!dc_isar_feature(aa32_bf16, s)) {
 298         return false;
 299     }
 300     return do_neon_ddda_env(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 301                             gen_helper_gvec_bfdot);
 302 }
 303
 304 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 305 {
 306     int opr_sz;
 307
 308     if (!dc_isar_feature(aa32_fhm, s)) {
 309         return false;
 310     }
 311
 312     /* UNDEF accesses to D16-D31 if they don't exist. */
 313     if (!dc_isar_feature(aa32_simd_r32, s) &&
 314         (a->vd & 0x10)) {
 315         return false;
 316     }
 317
 318     if (a->vd & a->q) {
 319         return false;
 320     }
 321
 322     if (!vfp_access_check(s)) {
 323         return true;
 324     }
 325
 326     opr_sz = (1 + a->q) * 8;
 327     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 328                        vfp_reg_offset(a->q, a->vn),
 329                        vfp_reg_offset(a->q, a->vm),
 330                        tcg_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 331                        gen_helper_gvec_fmlal_a32);
 332     return true;
 333 }
 334
 335 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 336 {
 337     int data = (a->index << 2) | a->rot;
 338
 339     if (!dc_isar_feature(aa32_vcma, s)) {
 340         return false;
 341     }
 342     if (a->size == MO_16) {
 343         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 344             return false;
 345         }
 346         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 347                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 348     }
 349     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 350                              FPST_STD, gen_helper_gvec_fcmlas_idx);
 351 }
 352
 353 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 354 {
 355     if (!dc_isar_feature(aa32_dp, s)) {
 356         return false;
 357     }
 358     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 359                         gen_helper_gvec_sdot_idx_b);
 360 }
 361
 362 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 363 {
 364     if (!dc_isar_feature(aa32_dp, s)) {
 365         return false;
 366     }
 367     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 368                         gen_helper_gvec_udot_idx_b);
 369 }
 370
 371 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 372 {
 373     if (!dc_isar_feature(aa32_i8mm, s)) {
 374         return false;
 375     }
 376     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 377                         gen_helper_gvec_usdot_idx_b);
 378 }
 379
 380 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 381 {
 382     if (!dc_isar_feature(aa32_i8mm, s)) {
 383         return false;
 384     }
 385     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 386                         gen_helper_gvec_sudot_idx_b);
 387 }
 388
 389 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 390 {
 391     if (!dc_isar_feature(aa32_bf16, s)) {
 392         return false;
 393     }
 394     return do_neon_ddda_env(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 395                             gen_helper_gvec_bfdot_idx);
 396 }
 397
 398 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 399 {
 400     int opr_sz;
 401
 402     if (!dc_isar_feature(aa32_fhm, s)) {
 403         return false;
 404     }
 405
 406     /* UNDEF accesses to D16-D31 if they don't exist. */
 407     if (!dc_isar_feature(aa32_simd_r32, s) &&
 408         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 409         return false;
 410     }
 411
 412     if (a->vd & a->q) {
 413         return false;
 414     }
 415
 416     if (!vfp_access_check(s)) {
 417         return true;
 418     }
 419
 420     opr_sz = (1 + a->q) * 8;
 421     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 422                        vfp_reg_offset(a->q, a->vn),
 423                        vfp_reg_offset(a->q, a->rm),
 424                        tcg_env, opr_sz, opr_sz,
 425                        (a->index << 2) | a->s, /* is_2 == 0 */
 426                        gen_helper_gvec_fmlal_idx_a32);
 427     return true;
 428 }
 429
 430 static struct {
 431     int nregs;
 432     int interleave;
 433     int spacing;
 434 } const neon_ls_element_type[11] = {
 435     {1, 4, 1},
 436     {1, 4, 2},
 437     {4, 1, 1},
 438     {2, 2, 2},
 439     {1, 3, 1},
 440     {1, 3, 2},
 441     {3, 1, 1},
 442     {1, 1, 1},
 443     {1, 2, 1},
 444     {1, 2, 2},
 445     {2, 1, 1}
 446 };
 447
 448 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 449                                       int stride)
 450 {
 451     if (rm != 15) {
 452         TCGv_i32 base;
 453
 454         base = load_reg(s, rn);
 455         if (rm == 13) {
 456             tcg_gen_addi_i32(base, base, stride);
 457         } else {
 458             TCGv_i32 index;
 459             index = load_reg(s, rm);
 460             tcg_gen_add_i32(base, base, index);
 461         }
 462         store_reg(s, rn, base);
 463     }
 464 }
 465
 466 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 467 {
 468     /* Neon load/store multiple structures */
 469     int nregs, interleave, spacing, reg, n;
 470     MemOp mop, align, endian;
 471     int mmu_idx = get_mem_index(s);
 472     int size = a->size;
 473     TCGv_i64 tmp64;
 474     TCGv_i32 addr;
 475
 476     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 477         return false;
 478     }
 479
 480     /* UNDEF accesses to D16-D31 if they don't exist */
 481     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 482         return false;
 483     }
 484     if (a->itype > 10) {
 485         return false;
 486     }
 487     /* Catch UNDEF cases for bad values of align field */
 488     switch (a->itype & 0xc) {
 489     case 4:
 490         if (a->align >= 2) {
 491             return false;
 492         }
 493         break;
 494     case 8:
 495         if (a->align == 3) {
 496             return false;
 497         }
 498         break;
 499     default:
 500         break;
 501     }
 502     nregs = neon_ls_element_type[a->itype].nregs;
 503     interleave = neon_ls_element_type[a->itype].interleave;
 504     spacing = neon_ls_element_type[a->itype].spacing;
 505     if (size == 3 && (interleave | spacing) != 1) {
 506         return false;
 507     }
 508
 509     if (!vfp_access_check(s)) {
 510         return true;
 511     }
 512
 513     /* For our purposes, bytes are always little-endian.  */
 514     endian = s->be_data;
 515     if (size == 0) {
 516         endian = MO_LE;
 517     }
 518
 519     /* Enforce alignment requested by the instruction */
 520     if (a->align) {
 521         align = pow2_align(a->align + 2); /* 4 ** a->align */
 522     } else {
 523         align = s->align_mem ? MO_ALIGN : 0;
 524     }
 525
 526     /*
 527      * Consecutive little-endian elements from a single register
 528      * can be promoted to a larger little-endian operation.
 529      */
 530     if (interleave == 1 && endian == MO_LE) {
 531         /* Retain any natural alignment. */
 532         if (align == MO_ALIGN) {
 533             align = pow2_align(size);
 534         }
 535         size = 3;
 536     }
 537
 538     tmp64 = tcg_temp_new_i64();
 539     addr = tcg_temp_new_i32();
 540     load_reg_var(s, addr, a->rn);
 541
 542     mop = endian | size | align;
 543     for (reg = 0; reg < nregs; reg++) {
 544         for (n = 0; n < 8 >> size; n++) {
 545             int xs;
 546             for (xs = 0; xs < interleave; xs++) {
 547                 int tt = a->vd + reg + spacing * xs;
 548
 549                 if (a->l) {
 550                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 551                     neon_store_element64(tt, n, size, tmp64);
 552                 } else {
 553                     neon_load_element64(tmp64, tt, n, size);
 554                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 555                 }
 556                 tcg_gen_addi_i32(addr, addr, 1 << size);
 557
 558                 /* Subsequent memory operations inherit alignment */
 559                 mop &= ~MO_AMASK;
 560             }
 561         }
 562     }
 563
 564     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 565     return true;
 566 }
 567
 568 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 569 {
 570     /* Neon load single structure to all lanes */
 571     int reg, stride, vec_size;
 572     int vd = a->vd;
 573     int size = a->size;
 574     int nregs = a->n + 1;
 575     TCGv_i32 addr, tmp;
 576     MemOp mop, align;
 577
 578     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 579         return false;
 580     }
 581
 582     /* UNDEF accesses to D16-D31 if they don't exist */
 583     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 584         return false;
 585     }
 586
 587     align = 0;
 588     if (size == 3) {
 589         if (nregs != 4 || a->a == 0) {
 590             return false;
 591         }
 592         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 593         size = MO_32;
 594         align = MO_ALIGN_16;
 595     } else if (a->a) {
 596         switch (nregs) {
 597         case 1:
 598             if (size == 0) {
 599                 return false;
 600             }
 601             align = MO_ALIGN;
 602             break;
 603         case 2:
 604             align = pow2_align(size + 1);
 605             break;
 606         case 3:
 607             return false;
 608         case 4:
 609             if (size == 2) {
 610                 align = pow2_align(3);
 611             } else {
 612                 align = pow2_align(size + 2);
 613             }
 614             break;
 615         default:
 616             g_assert_not_reached();
 617         }
 618     }
 619
 620     if (!vfp_access_check(s)) {
 621         return true;
 622     }
 623
 624     /*
 625      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 626      * VLD2/3/4 to all lanes: T bit indicates register stride.
 627      */
 628     stride = a->t ? 2 : 1;
 629     vec_size = nregs == 1 ? stride * 8 : 8;
 630     mop = size | align;
 631     tmp = tcg_temp_new_i32();
 632     addr = tcg_temp_new_i32();
 633     load_reg_var(s, addr, a->rn);
 634     for (reg = 0; reg < nregs; reg++) {
 635         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 636         if ((vd & 1) && vec_size == 16) {
 637             /*
 638              * We cannot write 16 bytes at once because the
 639              * destination is unaligned.
 640              */
 641             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 642                                  8, 8, tmp);
 643             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 644                              neon_full_reg_offset(vd), 8, 8);
 645         } else {
 646             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 647                                  vec_size, vec_size, tmp);
 648         }
 649         tcg_gen_addi_i32(addr, addr, 1 << size);
 650         vd += stride;
 651
 652         /* Subsequent memory operations inherit alignment */
 653         mop &= ~MO_AMASK;
 654     }
 655
 656     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 657
 658     return true;
 659 }
 660
 661 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 662 {
 663     /* Neon load/store single structure to one lane */
 664     int reg;
 665     int nregs = a->n + 1;
 666     int vd = a->vd;
 667     TCGv_i32 addr, tmp;
 668     MemOp mop;
 669
 670     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 671         return false;
 672     }
 673
 674     /* UNDEF accesses to D16-D31 if they don't exist */
 675     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 676         return false;
 677     }
 678
 679     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 680     switch (nregs) {
 681     case 1:
 682         if (a->stride != 1) {
 683             return false;
 684         }
 685         if (((a->align & (1 << a->size)) != 0) ||
 686             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 687             return false;
 688         }
 689         break;
 690     case 2:
 691         if (a->size == 2 && (a->align & 2) != 0) {
 692             return false;
 693         }
 694         break;
 695     case 3:
 696         if (a->align != 0) {
 697             return false;
 698         }
 699         break;
 700     case 4:
 701         if (a->size == 2 && a->align == 3) {
 702             return false;
 703         }
 704         break;
 705     default:
 706         g_assert_not_reached();
 707     }
 708     if ((vd + a->stride * (nregs - 1)) > 31) {
 709         /*
 710          * Attempts to write off the end of the register file are
 711          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 712          * access off the end of the array that holds the register data.
 713          */
 714         return false;
 715     }
 716
 717     if (!vfp_access_check(s)) {
 718         return true;
 719     }
 720
 721     /* Pick up SCTLR settings */
 722     mop = finalize_memop(s, a->size);
 723
 724     if (a->align) {
 725         MemOp align_op;
 726
 727         switch (nregs) {
 728         case 1:
 729             /* For VLD1, use natural alignment. */
 730             align_op = MO_ALIGN;
 731             break;
 732         case 2:
 733             /* For VLD2, use double alignment. */
 734             align_op = pow2_align(a->size + 1);
 735             break;
 736         case 4:
 737             if (a->size == MO_32) {
 738                 /*
 739                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 740                  * quad alignment; align = 3 is rejected above.
 741                  */
 742                 align_op = pow2_align(a->size + a->align);
 743             } else {
 744                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 745                 align_op = pow2_align(a->size + 2);
 746             }
 747             break;
 748         default:
 749             /* For VLD3, the alignment field is zero and rejected above. */
 750             g_assert_not_reached();
 751         }
 752
 753         mop = (mop & ~MO_AMASK) | align_op;
 754     }
 755
 756     tmp = tcg_temp_new_i32();
 757     addr = tcg_temp_new_i32();
 758     load_reg_var(s, addr, a->rn);
 759
 760     for (reg = 0; reg < nregs; reg++) {
 761         if (a->l) {
 762             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 763             neon_store_element(vd, a->reg_idx, a->size, tmp);
 764         } else { /* Store */
 765             neon_load_element(tmp, vd, a->reg_idx, a->size);
 766             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 767         }
 768         vd += a->stride;
 769         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 770
 771         /* Subsequent memory operations inherit alignment */
 772         mop &= ~MO_AMASK;
 773     }
 774
 775     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 776
 777     return true;
 778 }
 779
 780 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 781 {
 782     int vec_size = a->q ? 16 : 8;
 783     int rd_ofs = neon_full_reg_offset(a->vd);
 784     int rn_ofs = neon_full_reg_offset(a->vn);
 785     int rm_ofs = neon_full_reg_offset(a->vm);
 786
 787     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 788         return false;
 789     }
 790
 791     /* UNDEF accesses to D16-D31 if they don't exist. */
 792     if (!dc_isar_feature(aa32_simd_r32, s) &&
 793         ((a->vd | a->vn | a->vm) & 0x10)) {
 794         return false;
 795     }
 796
 797     if ((a->vn | a->vm | a->vd) & a->q) {
 798         return false;
 799     }
 800
 801     if (!vfp_access_check(s)) {
 802         return true;
 803     }
 804
 805     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 806     return true;
 807 }
 808
 809 #define DO_3SAME(INSN, FUNC)                                            \
 810     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 811     {                                                                   \
 812         return do_3same(s, a, FUNC);                                    \
 813     }
 814
 815 DO_3SAME(VADD, tcg_gen_gvec_add)
 816 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 817 DO_3SAME(VAND, tcg_gen_gvec_and)
 818 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 819 DO_3SAME(VORR, tcg_gen_gvec_or)
 820 DO_3SAME(VORN, tcg_gen_gvec_orc)
 821 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 822 DO_3SAME(VSHL_S, gen_gvec_sshl)
 823 DO_3SAME(VSHL_U, gen_gvec_ushl)
 824 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 825 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 826 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 827 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 828 DO_3SAME(VRSHL_S, gen_gvec_srshl)
 829 DO_3SAME(VRSHL_U, gen_gvec_urshl)
 830 DO_3SAME(VQSHL_S, gen_neon_sqshl)
 831 DO_3SAME(VQSHL_U, gen_neon_uqshl)
 832 DO_3SAME(VQRSHL_S, gen_neon_sqrshl)
 833 DO_3SAME(VQRSHL_U, gen_neon_uqrshl)
 834
 835 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 836 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 837     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 838                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 839                                 uint32_t oprsz, uint32_t maxsz)         \
 840     {                                                                   \
 841         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 842     }                                                                   \
 843     DO_3SAME(INSN, gen_##INSN##_3s)
 844
 845 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 846 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 847 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 848
 849 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 850     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 851     {                                                                   \
 852         if (a->size == 3) {                                             \
 853             return false;                                               \
 854         }                                                               \
 855         return do_3same(s, a, FUNC);                                    \
 856     }
 857
 858 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 859 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 860 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 861 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 862 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 863 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 864 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 865 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 866 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 867 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 868 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 869 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 870 DO_3SAME_NO_SZ_3(VPADD, gen_gvec_addp)
 871 DO_3SAME_NO_SZ_3(VPMAX_S, gen_gvec_smaxp)
 872 DO_3SAME_NO_SZ_3(VPMIN_S, gen_gvec_sminp)
 873 DO_3SAME_NO_SZ_3(VPMAX_U, gen_gvec_umaxp)
 874 DO_3SAME_NO_SZ_3(VPMIN_U, gen_gvec_uminp)
 875 DO_3SAME_NO_SZ_3(VHADD_S, gen_gvec_shadd)
 876 DO_3SAME_NO_SZ_3(VHADD_U, gen_gvec_uhadd)
 877 DO_3SAME_NO_SZ_3(VHSUB_S, gen_gvec_shsub)
 878 DO_3SAME_NO_SZ_3(VHSUB_U, gen_gvec_uhsub)
 879 DO_3SAME_NO_SZ_3(VRHADD_S, gen_gvec_srhadd)
 880 DO_3SAME_NO_SZ_3(VRHADD_U, gen_gvec_urhadd)
 881
 882 #define DO_3SAME_CMP(INSN, COND)                                        \
 883     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 884                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 885                                 uint32_t oprsz, uint32_t maxsz)         \
 886     {                                                                   \
 887         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 888     }                                                                   \
 889     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 890
 891 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 892 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 893 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 894 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 895 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 896
 897 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 898     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 899                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 900     {                                                                      \
 901         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 902     }
 903
 904 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 905
 906 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 907 {
 908     if (a->size != 0) {
 909         return false;
 910     }
 911     return do_3same(s, a, gen_VMUL_p_3s);
 912 }
 913
 914 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 915     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 916     {                                                                   \
 917         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 918             return false;                                               \
 919         }                                                               \
 920         if (a->size != 1 && a->size != 2) {                             \
 921             return false;                                               \
 922         }                                                               \
 923         return do_3same(s, a, FUNC);                                    \
 924     }
 925
 926 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 927 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 928
 929 #define DO_SHA1(NAME, FUNC)                                             \
 930     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 931     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 932     {                                                                   \
 933         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 934             return false;                                               \
 935         }                                                               \
 936         return do_3same(s, a, gen_##NAME##_3s);                         \
 937     }
 938
 939 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 940 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 941 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 942 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 943
 944 #define DO_SHA2(NAME, FUNC)                                             \
 945     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 946     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 947     {                                                                   \
 948         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 949             return false;                                               \
 950         }                                                               \
 951         return do_3same(s, a, gen_##NAME##_3s);                         \
 952     }
 953
 954 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 955 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 956 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 957
 958 /*
 959  * Some helper functions need to be passed the tcg_env. In order
 960  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 961  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 962  * and which call a NeonGenTwoOpEnvFn().
 963  */
 964 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 965     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 966     {                                                                   \
 967         FUNC(d, tcg_env, n, m);                                         \
 968     }
 969
 970 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
 971     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 972     { return a->size >= 1 && a->size <= 2 && do_3same(s, a, FUNC); }
 973
 974 DO_3SAME_VQDMULH(VQDMULH, gen_gvec_sqdmulh_qc)
 975 DO_3SAME_VQDMULH(VQRDMULH, gen_gvec_sqrdmulh_qc)
 976
 977 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
 978     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
 979                          uint32_t rn_ofs, uint32_t rm_ofs,              \
 980                          uint32_t oprsz, uint32_t maxsz)                \
 981     {                                                                   \
 982         TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
 983         tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
 984                            oprsz, maxsz, 0, FUNC);                      \
 985     }
 986
 987 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
 988     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
 989     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
 990     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
 991     {                                                                   \
 992         if (a->size == MO_16) {                                         \
 993             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
 994                 return false;                                           \
 995             }                                                           \
 996             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
 997         }                                                               \
 998         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
 999     }
1000
1001
1002 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1003 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1004 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1005 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1006 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1007 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1008 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1009 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1010 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1011 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1012 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1013 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1014 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1015 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1016 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1017 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1018 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1019 DO_3S_FP_GVEC(VPADD, gen_helper_gvec_faddp_s, gen_helper_gvec_faddp_h)
1020 DO_3S_FP_GVEC(VPMAX, gen_helper_gvec_fmaxp_s, gen_helper_gvec_fmaxp_h)
1021 DO_3S_FP_GVEC(VPMIN, gen_helper_gvec_fminp_s, gen_helper_gvec_fminp_h)
1022
1023 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1024 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1025 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1026 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1027
1028 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1029 {
1030     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1031         return false;
1032     }
1033
1034     if (a->size == MO_16) {
1035         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1036             return false;
1037         }
1038         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1039     }
1040     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1041 }
1042
1043 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1044 {
1045     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1046         return false;
1047     }
1048
1049     if (a->size == MO_16) {
1050         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1051             return false;
1052         }
1053         return do_3same(s, a, gen_VMINNM_fp16_3s);
1054     }
1055     return do_3same(s, a, gen_VMINNM_fp32_3s);
1056 }
1057
1058 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1059 {
1060     /* Handle a 2-reg-shift insn which can be vectorized. */
1061     int vec_size = a->q ? 16 : 8;
1062     int rd_ofs = neon_full_reg_offset(a->vd);
1063     int rm_ofs = neon_full_reg_offset(a->vm);
1064
1065     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1066         return false;
1067     }
1068
1069     /* UNDEF accesses to D16-D31 if they don't exist. */
1070     if (!dc_isar_feature(aa32_simd_r32, s) &&
1071         ((a->vd | a->vm) & 0x10)) {
1072         return false;
1073     }
1074
1075     if ((a->vm | a->vd) & a->q) {
1076         return false;
1077     }
1078
1079     if (!vfp_access_check(s)) {
1080         return true;
1081     }
1082
1083     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1084     return true;
1085 }
1086
1087 #define DO_2SH(INSN, FUNC)                                              \
1088     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1089     {                                                                   \
1090         return do_vector_2sh(s, a, FUNC);                               \
1091     }                                                                   \
1092
1093 DO_2SH(VSHL, tcg_gen_gvec_shli)
1094 DO_2SH(VSLI, gen_gvec_sli)
1095 DO_2SH(VSRI, gen_gvec_sri)
1096 DO_2SH(VSRA_S, gen_gvec_ssra)
1097 DO_2SH(VSRA_U, gen_gvec_usra)
1098 DO_2SH(VRSHR_S, gen_gvec_srshr)
1099 DO_2SH(VRSHR_U, gen_gvec_urshr)
1100 DO_2SH(VRSRA_S, gen_gvec_srsra)
1101 DO_2SH(VRSRA_U, gen_gvec_ursra)
1102 DO_2SH(VSHR_S, gen_gvec_sshr)
1103 DO_2SH(VSHR_U, gen_gvec_ushr)
1104 DO_2SH(VQSHLU, gen_neon_sqshlui)
1105 DO_2SH(VQSHL_U, gen_neon_uqshli)
1106 DO_2SH(VQSHL_S, gen_neon_sqshli)
1107
1108 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1109                                 NeonGenTwo64OpFn *shiftfn,
1110                                 NeonGenOne64OpEnvFn *narrowfn)
1111 {
1112     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1113     TCGv_i64 constimm, rm1, rm2, rd;
1114
1115     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1116         return false;
1117     }
1118
1119     /* UNDEF accesses to D16-D31 if they don't exist. */
1120     if (!dc_isar_feature(aa32_simd_r32, s) &&
1121         ((a->vd | a->vm) & 0x10)) {
1122         return false;
1123     }
1124
1125     if (a->vm & 1) {
1126         return false;
1127     }
1128
1129     if (!vfp_access_check(s)) {
1130         return true;
1131     }
1132
1133     /*
1134      * This is always a right shift, and the shiftfn is always a
1135      * left-shift helper, which thus needs the negated shift count.
1136      */
1137     constimm = tcg_constant_i64(-a->shift);
1138     rm1 = tcg_temp_new_i64();
1139     rm2 = tcg_temp_new_i64();
1140     rd = tcg_temp_new_i64();
1141
1142     /* Load both inputs first to avoid potential overwrite if rm == rd */
1143     read_neon_element64(rm1, a->vm, 0, MO_64);
1144     read_neon_element64(rm2, a->vm, 1, MO_64);
1145
1146     shiftfn(rm1, rm1, constimm);
1147     narrowfn(rd, tcg_env, rm1);
1148     write_neon_element64(rd, a->vd, 0, MO_32);
1149
1150     shiftfn(rm2, rm2, constimm);
1151     narrowfn(rd, tcg_env, rm2);
1152     write_neon_element64(rd, a->vd, 1, MO_32);
1153
1154     return true;
1155 }
1156
1157 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1158                                 NeonGenTwoOpFn *shiftfn,
1159                                 NeonGenOne64OpEnvFn *narrowfn)
1160 {
1161     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1162     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1163     TCGv_i64 rtmp;
1164     uint32_t imm;
1165
1166     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1167         return false;
1168     }
1169
1170     /* UNDEF accesses to D16-D31 if they don't exist. */
1171     if (!dc_isar_feature(aa32_simd_r32, s) &&
1172         ((a->vd | a->vm) & 0x10)) {
1173         return false;
1174     }
1175
1176     if (a->vm & 1) {
1177         return false;
1178     }
1179
1180     if (!vfp_access_check(s)) {
1181         return true;
1182     }
1183
1184     /*
1185      * This is always a right shift, and the shiftfn is always a
1186      * left-shift helper, which thus needs the negated shift count
1187      * duplicated into each lane of the immediate value.
1188      */
1189     if (a->size == 1) {
1190         imm = (uint16_t)(-a->shift);
1191         imm |= imm << 16;
1192     } else {
1193         /* size == 2 */
1194         imm = -a->shift;
1195     }
1196     constimm = tcg_constant_i32(imm);
1197
1198     /* Load all inputs first to avoid potential overwrite */
1199     rm1 = tcg_temp_new_i32();
1200     rm2 = tcg_temp_new_i32();
1201     rm3 = tcg_temp_new_i32();
1202     rm4 = tcg_temp_new_i32();
1203     read_neon_element32(rm1, a->vm, 0, MO_32);
1204     read_neon_element32(rm2, a->vm, 1, MO_32);
1205     read_neon_element32(rm3, a->vm, 2, MO_32);
1206     read_neon_element32(rm4, a->vm, 3, MO_32);
1207     rtmp = tcg_temp_new_i64();
1208
1209     shiftfn(rm1, rm1, constimm);
1210     shiftfn(rm2, rm2, constimm);
1211
1212     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1213
1214     narrowfn(rtmp, tcg_env, rtmp);
1215     write_neon_element64(rtmp, a->vd, 0, MO_32);
1216
1217     shiftfn(rm3, rm3, constimm);
1218     shiftfn(rm4, rm4, constimm);
1219
1220     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1221
1222     narrowfn(rtmp, tcg_env, rtmp);
1223     write_neon_element64(rtmp, a->vd, 1, MO_32);
1224     return true;
1225 }
1226
1227 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1228     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1229     {                                                                   \
1230         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1231     }
1232 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1233     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1234     {                                                                   \
1235         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1236     }
1237
1238 static void gen_neon_narrow_u32(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
1239 {
1240     tcg_gen_ext32u_i64(dest, src);
1241 }
1242
1243 static void gen_neon_narrow_u16(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
1244 {
1245     gen_helper_neon_narrow_u16(dest, src);
1246 }
1247
1248 static void gen_neon_narrow_u8(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
1249 {
1250     gen_helper_neon_narrow_u8(dest, src);
1251 }
1252
1253 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1254 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1255 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1256
1257 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1258 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1259 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1260
1261 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1262 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1263 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1264
1265 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1266 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1267 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1268 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1269 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1270 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1271
1272 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1273 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1274 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1275
1276 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1277 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1278 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1279
1280 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1281 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1282 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1283
1284 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1285                          NeonGenWidenFn *widenfn, bool u)
1286 {
1287     TCGv_i64 tmp;
1288     TCGv_i32 rm0, rm1;
1289     uint64_t widen_mask = 0;
1290
1291     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1292         return false;
1293     }
1294
1295     /* UNDEF accesses to D16-D31 if they don't exist. */
1296     if (!dc_isar_feature(aa32_simd_r32, s) &&
1297         ((a->vd | a->vm) & 0x10)) {
1298         return false;
1299     }
1300
1301     if (a->vd & 1) {
1302         return false;
1303     }
1304
1305     if (!vfp_access_check(s)) {
1306         return true;
1307     }
1308
1309     /*
1310      * This is a widen-and-shift operation. The shift is always less
1311      * than the width of the source type, so after widening the input
1312      * vector we can simply shift the whole 64-bit widened register,
1313      * and then clear the potential overflow bits resulting from left
1314      * bits of the narrow input appearing as right bits of the left
1315      * neighbour narrow input. Calculate a mask of bits to clear.
1316      */
1317     if ((a->shift != 0) && (a->size < 2 || u)) {
1318         int esize = 8 << a->size;
1319         widen_mask = MAKE_64BIT_MASK(0, esize);
1320         widen_mask >>= esize - a->shift;
1321         widen_mask = dup_const(a->size + 1, widen_mask);
1322     }
1323
1324     rm0 = tcg_temp_new_i32();
1325     rm1 = tcg_temp_new_i32();
1326     read_neon_element32(rm0, a->vm, 0, MO_32);
1327     read_neon_element32(rm1, a->vm, 1, MO_32);
1328     tmp = tcg_temp_new_i64();
1329
1330     widenfn(tmp, rm0);
1331     if (a->shift != 0) {
1332         tcg_gen_shli_i64(tmp, tmp, a->shift);
1333         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1334     }
1335     write_neon_element64(tmp, a->vd, 0, MO_64);
1336
1337     widenfn(tmp, rm1);
1338     if (a->shift != 0) {
1339         tcg_gen_shli_i64(tmp, tmp, a->shift);
1340         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1341     }
1342     write_neon_element64(tmp, a->vd, 1, MO_64);
1343     return true;
1344 }
1345
1346 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1347 {
1348     static NeonGenWidenFn * const widenfn[] = {
1349         gen_helper_neon_widen_s8,
1350         gen_helper_neon_widen_s16,
1351         tcg_gen_ext_i32_i64,
1352     };
1353     return do_vshll_2sh(s, a, widenfn[a->size], false);
1354 }
1355
1356 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1357 {
1358     static NeonGenWidenFn * const widenfn[] = {
1359         gen_helper_neon_widen_u8,
1360         gen_helper_neon_widen_u16,
1361         tcg_gen_extu_i32_i64,
1362     };
1363     return do_vshll_2sh(s, a, widenfn[a->size], true);
1364 }
1365
1366 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1367                       gen_helper_gvec_2_ptr *fn)
1368 {
1369     /* FP operations in 2-reg-and-shift group */
1370     int vec_size = a->q ? 16 : 8;
1371     int rd_ofs = neon_full_reg_offset(a->vd);
1372     int rm_ofs = neon_full_reg_offset(a->vm);
1373     TCGv_ptr fpst;
1374
1375     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1376         return false;
1377     }
1378
1379     if (a->size == MO_16) {
1380         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1381             return false;
1382         }
1383     }
1384
1385     /* UNDEF accesses to D16-D31 if they don't exist. */
1386     if (!dc_isar_feature(aa32_simd_r32, s) &&
1387         ((a->vd | a->vm) & 0x10)) {
1388         return false;
1389     }
1390
1391     if ((a->vm | a->vd) & a->q) {
1392         return false;
1393     }
1394
1395     if (!vfp_access_check(s)) {
1396         return true;
1397     }
1398
1399     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1400     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1401     return true;
1402 }
1403
1404 #define DO_FP_2SH(INSN, FUNC)                                           \
1405     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1406     {                                                                   \
1407         return do_fp_2sh(s, a, FUNC);                                   \
1408     }
1409
1410 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1411 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1412 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1413 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1414
1415 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1416 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1417 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1418 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1419
1420 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1421                         GVecGen2iFn *fn)
1422 {
1423     uint64_t imm;
1424     int reg_ofs, vec_size;
1425
1426     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1427         return false;
1428     }
1429
1430     /* UNDEF accesses to D16-D31 if they don't exist. */
1431     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1432         return false;
1433     }
1434
1435     if (a->vd & a->q) {
1436         return false;
1437     }
1438
1439     if (!vfp_access_check(s)) {
1440         return true;
1441     }
1442
1443     reg_ofs = neon_full_reg_offset(a->vd);
1444     vec_size = a->q ? 16 : 8;
1445     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1446
1447     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1448     return true;
1449 }
1450
1451 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1452                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1453 {
1454     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1455 }
1456
1457 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1458 {
1459     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1460     GVecGen2iFn *fn;
1461
1462     if ((a->cmode & 1) && a->cmode < 12) {
1463         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1464         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1465     } else {
1466         /* There is one unallocated cmode/op combination in this space */
1467         if (a->cmode == 15 && a->op == 1) {
1468             return false;
1469         }
1470         fn = gen_VMOV_1r;
1471     }
1472     return do_1reg_imm(s, a, fn);
1473 }
1474
1475 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1476                            NeonGenWidenFn *widenfn,
1477                            NeonGenTwo64OpFn *opfn,
1478                            int src1_mop, int src2_mop)
1479 {
1480     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1481     TCGv_i64 rn0_64, rn1_64, rm_64;
1482
1483     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1484         return false;
1485     }
1486
1487     /* UNDEF accesses to D16-D31 if they don't exist. */
1488     if (!dc_isar_feature(aa32_simd_r32, s) &&
1489         ((a->vd | a->vn | a->vm) & 0x10)) {
1490         return false;
1491     }
1492
1493     if (!opfn) {
1494         /* size == 3 case, which is an entirely different insn group */
1495         return false;
1496     }
1497
1498     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1499         return false;
1500     }
1501
1502     if (!vfp_access_check(s)) {
1503         return true;
1504     }
1505
1506     rn0_64 = tcg_temp_new_i64();
1507     rn1_64 = tcg_temp_new_i64();
1508     rm_64 = tcg_temp_new_i64();
1509
1510     if (src1_mop >= 0) {
1511         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1512     } else {
1513         TCGv_i32 tmp = tcg_temp_new_i32();
1514         read_neon_element32(tmp, a->vn, 0, MO_32);
1515         widenfn(rn0_64, tmp);
1516     }
1517     if (src2_mop >= 0) {
1518         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1519     } else {
1520         TCGv_i32 tmp = tcg_temp_new_i32();
1521         read_neon_element32(tmp, a->vm, 0, MO_32);
1522         widenfn(rm_64, tmp);
1523     }
1524
1525     opfn(rn0_64, rn0_64, rm_64);
1526
1527     /*
1528      * Load second pass inputs before storing the first pass result, to
1529      * avoid incorrect results if a narrow input overlaps with the result.
1530      */
1531     if (src1_mop >= 0) {
1532         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1533     } else {
1534         TCGv_i32 tmp = tcg_temp_new_i32();
1535         read_neon_element32(tmp, a->vn, 1, MO_32);
1536         widenfn(rn1_64, tmp);
1537     }
1538     if (src2_mop >= 0) {
1539         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1540     } else {
1541         TCGv_i32 tmp = tcg_temp_new_i32();
1542         read_neon_element32(tmp, a->vm, 1, MO_32);
1543         widenfn(rm_64, tmp);
1544     }
1545
1546     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1547
1548     opfn(rn1_64, rn1_64, rm_64);
1549     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1550
1551     return true;
1552 }
1553
1554 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1555     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1556     {                                                                   \
1557         static NeonGenWidenFn * const widenfn[] = {                     \
1558             gen_helper_neon_widen_##S##8,                               \
1559             gen_helper_neon_widen_##S##16,                              \
1560             NULL, NULL,                                                 \
1561         };                                                              \
1562         static NeonGenTwo64OpFn * const addfn[] = {                     \
1563             gen_helper_neon_##OP##l_u16,                                \
1564             gen_helper_neon_##OP##l_u32,                                \
1565             tcg_gen_##OP##_i64,                                         \
1566             NULL,                                                       \
1567         };                                                              \
1568         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1569         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1570                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1571                               narrow_mop);                              \
1572     }
1573
1574 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1575 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1576 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1577 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1578 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1579 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1580 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1581 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1582
1583 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1584                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1585 {
1586     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1587     TCGv_i64 rn_64, rm_64;
1588     TCGv_i32 rd0, rd1;
1589
1590     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1591         return false;
1592     }
1593
1594     /* UNDEF accesses to D16-D31 if they don't exist. */
1595     if (!dc_isar_feature(aa32_simd_r32, s) &&
1596         ((a->vd | a->vn | a->vm) & 0x10)) {
1597         return false;
1598     }
1599
1600     if (!opfn || !narrowfn) {
1601         /* size == 3 case, which is an entirely different insn group */
1602         return false;
1603     }
1604
1605     if ((a->vn | a->vm) & 1) {
1606         return false;
1607     }
1608
1609     if (!vfp_access_check(s)) {
1610         return true;
1611     }
1612
1613     rn_64 = tcg_temp_new_i64();
1614     rm_64 = tcg_temp_new_i64();
1615     rd0 = tcg_temp_new_i32();
1616     rd1 = tcg_temp_new_i32();
1617
1618     read_neon_element64(rn_64, a->vn, 0, MO_64);
1619     read_neon_element64(rm_64, a->vm, 0, MO_64);
1620
1621     opfn(rn_64, rn_64, rm_64);
1622
1623     narrowfn(rd0, rn_64);
1624
1625     read_neon_element64(rn_64, a->vn, 1, MO_64);
1626     read_neon_element64(rm_64, a->vm, 1, MO_64);
1627
1628     opfn(rn_64, rn_64, rm_64);
1629
1630     narrowfn(rd1, rn_64);
1631
1632     write_neon_element32(rd0, a->vd, 0, MO_32);
1633     write_neon_element32(rd1, a->vd, 1, MO_32);
1634
1635     return true;
1636 }
1637
1638 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1639     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1640     {                                                                   \
1641         static NeonGenTwo64OpFn * const addfn[] = {                     \
1642             gen_helper_neon_##OP##l_u16,                                \
1643             gen_helper_neon_##OP##l_u32,                                \
1644             tcg_gen_##OP##_i64,                                         \
1645             NULL,                                                       \
1646         };                                                              \
1647         static NeonGenNarrowFn * const narrowfn[] = {                   \
1648             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1649             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1650             EXTOP,                                                      \
1651             NULL,                                                       \
1652         };                                                              \
1653         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1654     }
1655
1656 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1657 {
1658     tcg_gen_addi_i64(rn, rn, 1u << 31);
1659     tcg_gen_extrh_i64_i32(rd, rn);
1660 }
1661
1662 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1663 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1664 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1665 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1666
1667 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1668                        NeonGenTwoOpWidenFn *opfn,
1669                        NeonGenTwo64OpFn *accfn)
1670 {
1671     /*
1672      * 3-regs different lengths, long operations.
1673      * These perform an operation on two inputs that returns a double-width
1674      * result, and then possibly perform an accumulation operation of
1675      * that result into the double-width destination.
1676      */
1677     TCGv_i64 rd0, rd1, tmp;
1678     TCGv_i32 rn, rm;
1679
1680     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1681         return false;
1682     }
1683
1684     /* UNDEF accesses to D16-D31 if they don't exist. */
1685     if (!dc_isar_feature(aa32_simd_r32, s) &&
1686         ((a->vd | a->vn | a->vm) & 0x10)) {
1687         return false;
1688     }
1689
1690     if (!opfn) {
1691         /* size == 3 case, which is an entirely different insn group */
1692         return false;
1693     }
1694
1695     if (a->vd & 1) {
1696         return false;
1697     }
1698
1699     if (!vfp_access_check(s)) {
1700         return true;
1701     }
1702
1703     rd0 = tcg_temp_new_i64();
1704     rd1 = tcg_temp_new_i64();
1705
1706     rn = tcg_temp_new_i32();
1707     rm = tcg_temp_new_i32();
1708     read_neon_element32(rn, a->vn, 0, MO_32);
1709     read_neon_element32(rm, a->vm, 0, MO_32);
1710     opfn(rd0, rn, rm);
1711
1712     read_neon_element32(rn, a->vn, 1, MO_32);
1713     read_neon_element32(rm, a->vm, 1, MO_32);
1714     opfn(rd1, rn, rm);
1715
1716     /* Don't store results until after all loads: they might overlap */
1717     if (accfn) {
1718         tmp = tcg_temp_new_i64();
1719         read_neon_element64(tmp, a->vd, 0, MO_64);
1720         accfn(rd0, tmp, rd0);
1721         read_neon_element64(tmp, a->vd, 1, MO_64);
1722         accfn(rd1, tmp, rd1);
1723     }
1724
1725     write_neon_element64(rd0, a->vd, 0, MO_64);
1726     write_neon_element64(rd1, a->vd, 1, MO_64);
1727
1728     return true;
1729 }
1730
1731 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
1732 {
1733     static NeonGenTwoOpWidenFn * const opfn[] = {
1734         gen_helper_neon_abdl_s16,
1735         gen_helper_neon_abdl_s32,
1736         gen_helper_neon_abdl_s64,
1737         NULL,
1738     };
1739
1740     return do_long_3d(s, a, opfn[a->size], NULL);
1741 }
1742
1743 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
1744 {
1745     static NeonGenTwoOpWidenFn * const opfn[] = {
1746         gen_helper_neon_abdl_u16,
1747         gen_helper_neon_abdl_u32,
1748         gen_helper_neon_abdl_u64,
1749         NULL,
1750     };
1751
1752     return do_long_3d(s, a, opfn[a->size], NULL);
1753 }
1754
1755 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
1756 {
1757     static NeonGenTwoOpWidenFn * const opfn[] = {
1758         gen_helper_neon_abdl_s16,
1759         gen_helper_neon_abdl_s32,
1760         gen_helper_neon_abdl_s64,
1761         NULL,
1762     };
1763     static NeonGenTwo64OpFn * const addfn[] = {
1764         gen_helper_neon_addl_u16,
1765         gen_helper_neon_addl_u32,
1766         tcg_gen_add_i64,
1767         NULL,
1768     };
1769
1770     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1771 }
1772
1773 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
1774 {
1775     static NeonGenTwoOpWidenFn * const opfn[] = {
1776         gen_helper_neon_abdl_u16,
1777         gen_helper_neon_abdl_u32,
1778         gen_helper_neon_abdl_u64,
1779         NULL,
1780     };
1781     static NeonGenTwo64OpFn * const addfn[] = {
1782         gen_helper_neon_addl_u16,
1783         gen_helper_neon_addl_u32,
1784         tcg_gen_add_i64,
1785         NULL,
1786     };
1787
1788     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
1789 }
1790
1791 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1792 {
1793     TCGv_i32 lo = tcg_temp_new_i32();
1794     TCGv_i32 hi = tcg_temp_new_i32();
1795
1796     tcg_gen_muls2_i32(lo, hi, rn, rm);
1797     tcg_gen_concat_i32_i64(rd, lo, hi);
1798 }
1799
1800 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1801 {
1802     TCGv_i32 lo = tcg_temp_new_i32();
1803     TCGv_i32 hi = tcg_temp_new_i32();
1804
1805     tcg_gen_mulu2_i32(lo, hi, rn, rm);
1806     tcg_gen_concat_i32_i64(rd, lo, hi);
1807 }
1808
1809 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
1810 {
1811     static NeonGenTwoOpWidenFn * const opfn[] = {
1812         gen_helper_neon_mull_s8,
1813         gen_helper_neon_mull_s16,
1814         gen_mull_s32,
1815         NULL,
1816     };
1817
1818     return do_long_3d(s, a, opfn[a->size], NULL);
1819 }
1820
1821 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
1822 {
1823     static NeonGenTwoOpWidenFn * const opfn[] = {
1824         gen_helper_neon_mull_u8,
1825         gen_helper_neon_mull_u16,
1826         gen_mull_u32,
1827         NULL,
1828     };
1829
1830     return do_long_3d(s, a, opfn[a->size], NULL);
1831 }
1832
1833 #define DO_VMLAL(INSN,MULL,ACC)                                         \
1834     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1835     {                                                                   \
1836         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
1837             gen_helper_neon_##MULL##8,                                  \
1838             gen_helper_neon_##MULL##16,                                 \
1839             gen_##MULL##32,                                             \
1840             NULL,                                                       \
1841         };                                                              \
1842         static NeonGenTwo64OpFn * const accfn[] = {                     \
1843             gen_helper_neon_##ACC##l_u16,                               \
1844             gen_helper_neon_##ACC##l_u32,                               \
1845             tcg_gen_##ACC##_i64,                                        \
1846             NULL,                                                       \
1847         };                                                              \
1848         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
1849     }
1850
1851 DO_VMLAL(VMLAL_S,mull_s,add)
1852 DO_VMLAL(VMLAL_U,mull_u,add)
1853 DO_VMLAL(VMLSL_S,mull_s,sub)
1854 DO_VMLAL(VMLSL_U,mull_u,sub)
1855
1856 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1857 {
1858     gen_helper_neon_mull_s16(rd, rn, rm);
1859     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rd, rd);
1860 }
1861
1862 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
1863 {
1864     gen_mull_s32(rd, rn, rm);
1865     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rd, rd);
1866 }
1867
1868 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
1869 {
1870     static NeonGenTwoOpWidenFn * const opfn[] = {
1871         NULL,
1872         gen_VQDMULL_16,
1873         gen_VQDMULL_32,
1874         NULL,
1875     };
1876
1877     return do_long_3d(s, a, opfn[a->size], NULL);
1878 }
1879
1880 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1881 {
1882     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
1883 }
1884
1885 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1886 {
1887     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
1888 }
1889
1890 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
1891 {
1892     static NeonGenTwoOpWidenFn * const opfn[] = {
1893         NULL,
1894         gen_VQDMULL_16,
1895         gen_VQDMULL_32,
1896         NULL,
1897     };
1898     static NeonGenTwo64OpFn * const accfn[] = {
1899         NULL,
1900         gen_VQDMLAL_acc_16,
1901         gen_VQDMLAL_acc_32,
1902         NULL,
1903     };
1904
1905     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
1906 }
1907
1908 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1909 {
1910     gen_helper_neon_negl_u32(rm, rm);
1911     gen_helper_neon_addl_saturate_s32(rd, tcg_env, rn, rm);
1912 }
1913
1914 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
1915 {
1916     tcg_gen_neg_i64(rm, rm);
1917     gen_helper_neon_addl_saturate_s64(rd, tcg_env, rn, rm);
1918 }
1919
1920 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
1921 {
1922     static NeonGenTwoOpWidenFn * const opfn[] = {
1923         NULL,
1924         gen_VQDMULL_16,
1925         gen_VQDMULL_32,
1926         NULL,
1927     };
1928     static NeonGenTwo64OpFn * const accfn[] = {
1929         NULL,
1930         gen_VQDMLSL_acc_16,
1931         gen_VQDMLSL_acc_32,
1932         NULL,
1933     };
1934
1935     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
1936 }
1937
1938 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
1939 {
1940     gen_helper_gvec_3 *fn_gvec;
1941
1942     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1943         return false;
1944     }
1945
1946     /* UNDEF accesses to D16-D31 if they don't exist. */
1947     if (!dc_isar_feature(aa32_simd_r32, s) &&
1948         ((a->vd | a->vn | a->vm) & 0x10)) {
1949         return false;
1950     }
1951
1952     if (a->vd & 1) {
1953         return false;
1954     }
1955
1956     switch (a->size) {
1957     case 0:
1958         fn_gvec = gen_helper_neon_pmull_h;
1959         break;
1960     case 2:
1961         if (!dc_isar_feature(aa32_pmull, s)) {
1962             return false;
1963         }
1964         fn_gvec = gen_helper_gvec_pmull_q;
1965         break;
1966     default:
1967         return false;
1968     }
1969
1970     if (!vfp_access_check(s)) {
1971         return true;
1972     }
1973
1974     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
1975                        neon_full_reg_offset(a->vn),
1976                        neon_full_reg_offset(a->vm),
1977                        16, 16, 0, fn_gvec);
1978     return true;
1979 }
1980
1981 static void gen_neon_dup_low16(TCGv_i32 var)
1982 {
1983     TCGv_i32 tmp = tcg_temp_new_i32();
1984     tcg_gen_ext16u_i32(var, var);
1985     tcg_gen_shli_i32(tmp, var, 16);
1986     tcg_gen_or_i32(var, var, tmp);
1987 }
1988
1989 static void gen_neon_dup_high16(TCGv_i32 var)
1990 {
1991     TCGv_i32 tmp = tcg_temp_new_i32();
1992     tcg_gen_andi_i32(var, var, 0xffff0000);
1993     tcg_gen_shri_i32(tmp, var, 16);
1994     tcg_gen_or_i32(var, var, tmp);
1995 }
1996
1997 static inline TCGv_i32 neon_get_scalar(int size, int reg)
1998 {
1999     TCGv_i32 tmp = tcg_temp_new_i32();
2000     if (size == MO_16) {
2001         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2002         if (reg & 8) {
2003             gen_neon_dup_high16(tmp);
2004         } else {
2005             gen_neon_dup_low16(tmp);
2006         }
2007     } else {
2008         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2009     }
2010     return tmp;
2011 }
2012
2013 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2014                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2015 {
2016     /*
2017      * Two registers and a scalar: perform an operation between
2018      * the input elements and the scalar, and then possibly
2019      * perform an accumulation operation of that result into the
2020      * destination.
2021      */
2022     TCGv_i32 scalar, tmp;
2023     int pass;
2024
2025     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2026         return false;
2027     }
2028
2029     /* UNDEF accesses to D16-D31 if they don't exist. */
2030     if (!dc_isar_feature(aa32_simd_r32, s) &&
2031         ((a->vd | a->vn | a->vm) & 0x10)) {
2032         return false;
2033     }
2034
2035     if (!opfn) {
2036         /* Bad size (including size == 3, which is a different insn group) */
2037         return false;
2038     }
2039
2040     if (a->q && ((a->vd | a->vn) & 1)) {
2041         return false;
2042     }
2043
2044     if (!vfp_access_check(s)) {
2045         return true;
2046     }
2047
2048     scalar = neon_get_scalar(a->size, a->vm);
2049     tmp = tcg_temp_new_i32();
2050
2051     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2052         read_neon_element32(tmp, a->vn, pass, MO_32);
2053         opfn(tmp, tmp, scalar);
2054         if (accfn) {
2055             TCGv_i32 rd = tcg_temp_new_i32();
2056             read_neon_element32(rd, a->vd, pass, MO_32);
2057             accfn(tmp, rd, tmp);
2058         }
2059         write_neon_element32(tmp, a->vd, pass, MO_32);
2060     }
2061     return true;
2062 }
2063
2064 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2065 {
2066     static NeonGenTwoOpFn * const opfn[] = {
2067         NULL,
2068         gen_helper_neon_mul_u16,
2069         tcg_gen_mul_i32,
2070         NULL,
2071     };
2072
2073     return do_2scalar(s, a, opfn[a->size], NULL);
2074 }
2075
2076 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2077 {
2078     static NeonGenTwoOpFn * const opfn[] = {
2079         NULL,
2080         gen_helper_neon_mul_u16,
2081         tcg_gen_mul_i32,
2082         NULL,
2083     };
2084     static NeonGenTwoOpFn * const accfn[] = {
2085         NULL,
2086         gen_helper_neon_add_u16,
2087         tcg_gen_add_i32,
2088         NULL,
2089     };
2090
2091     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2092 }
2093
2094 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2095 {
2096     static NeonGenTwoOpFn * const opfn[] = {
2097         NULL,
2098         gen_helper_neon_mul_u16,
2099         tcg_gen_mul_i32,
2100         NULL,
2101     };
2102     static NeonGenTwoOpFn * const accfn[] = {
2103         NULL,
2104         gen_helper_neon_sub_u16,
2105         tcg_gen_sub_i32,
2106         NULL,
2107     };
2108
2109     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2110 }
2111
2112 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2113                               gen_helper_gvec_3_ptr *fn)
2114 {
2115     /* Two registers and a scalar, using gvec */
2116     int vec_size = a->q ? 16 : 8;
2117     int rd_ofs = neon_full_reg_offset(a->vd);
2118     int rn_ofs = neon_full_reg_offset(a->vn);
2119     int rm_ofs;
2120     int idx;
2121     TCGv_ptr fpstatus;
2122
2123     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2124         return false;
2125     }
2126
2127     /* UNDEF accesses to D16-D31 if they don't exist. */
2128     if (!dc_isar_feature(aa32_simd_r32, s) &&
2129         ((a->vd | a->vn | a->vm) & 0x10)) {
2130         return false;
2131     }
2132
2133     if (!fn) {
2134         /* Bad size (including size == 3, which is a different insn group) */
2135         return false;
2136     }
2137
2138     if (a->q && ((a->vd | a->vn) & 1)) {
2139         return false;
2140     }
2141
2142     if (!vfp_access_check(s)) {
2143         return true;
2144     }
2145
2146     /* a->vm is M:Vm, which encodes both register and index */
2147     idx = extract32(a->vm, a->size + 2, 2);
2148     a->vm = extract32(a->vm, 0, a->size + 2);
2149     rm_ofs = neon_full_reg_offset(a->vm);
2150
2151     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2152     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2153                        vec_size, vec_size, idx, fn);
2154     return true;
2155 }
2156
2157 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2158     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2159     {                                                                   \
2160         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2161             NULL,                                                       \
2162             gen_helper_##FUNC##_h,                                      \
2163             gen_helper_##FUNC##_s,                                      \
2164             NULL,                                                       \
2165         };                                                              \
2166         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2167             return false;                                               \
2168         }                                                               \
2169         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2170     }
2171
2172 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2173 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2174 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2175
2176 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2177 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2178 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2179 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2180
2181 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2182 {
2183     static NeonGenTwoOpFn * const opfn[] = {
2184         NULL,
2185         gen_VQDMULH_16,
2186         gen_VQDMULH_32,
2187         NULL,
2188     };
2189
2190     return do_2scalar(s, a, opfn[a->size], NULL);
2191 }
2192
2193 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2194 {
2195     static NeonGenTwoOpFn * const opfn[] = {
2196         NULL,
2197         gen_VQRDMULH_16,
2198         gen_VQRDMULH_32,
2199         NULL,
2200     };
2201
2202     return do_2scalar(s, a, opfn[a->size], NULL);
2203 }
2204
2205 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2206                             NeonGenThreeOpEnvFn *opfn)
2207 {
2208     /*
2209      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2210      * performs a kind of fused op-then-accumulate using a helper
2211      * function that takes all of rd, rn and the scalar at once.
2212      */
2213     TCGv_i32 scalar, rn, rd;
2214     int pass;
2215
2216     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2217         return false;
2218     }
2219
2220     if (!dc_isar_feature(aa32_rdm, s)) {
2221         return false;
2222     }
2223
2224     /* UNDEF accesses to D16-D31 if they don't exist. */
2225     if (!dc_isar_feature(aa32_simd_r32, s) &&
2226         ((a->vd | a->vn | a->vm) & 0x10)) {
2227         return false;
2228     }
2229
2230     if (!opfn) {
2231         /* Bad size (including size == 3, which is a different insn group) */
2232         return false;
2233     }
2234
2235     if (a->q && ((a->vd | a->vn) & 1)) {
2236         return false;
2237     }
2238
2239     if (!vfp_access_check(s)) {
2240         return true;
2241     }
2242
2243     scalar = neon_get_scalar(a->size, a->vm);
2244     rn = tcg_temp_new_i32();
2245     rd = tcg_temp_new_i32();
2246
2247     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2248         read_neon_element32(rn, a->vn, pass, MO_32);
2249         read_neon_element32(rd, a->vd, pass, MO_32);
2250         opfn(rd, tcg_env, rn, scalar, rd);
2251         write_neon_element32(rd, a->vd, pass, MO_32);
2252     }
2253     return true;
2254 }
2255
2256 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2257 {
2258     static NeonGenThreeOpEnvFn *opfn[] = {
2259         NULL,
2260         gen_helper_neon_qrdmlah_s16,
2261         gen_helper_neon_qrdmlah_s32,
2262         NULL,
2263     };
2264     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2265 }
2266
2267 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2268 {
2269     static NeonGenThreeOpEnvFn *opfn[] = {
2270         NULL,
2271         gen_helper_neon_qrdmlsh_s16,
2272         gen_helper_neon_qrdmlsh_s32,
2273         NULL,
2274     };
2275     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2276 }
2277
2278 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2279                             NeonGenTwoOpWidenFn *opfn,
2280                             NeonGenTwo64OpFn *accfn)
2281 {
2282     /*
2283      * Two registers and a scalar, long operations: perform an
2284      * operation on the input elements and the scalar which produces
2285      * a double-width result, and then possibly perform an accumulation
2286      * operation of that result into the destination.
2287      */
2288     TCGv_i32 scalar, rn;
2289     TCGv_i64 rn0_64, rn1_64;
2290
2291     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2292         return false;
2293     }
2294
2295     /* UNDEF accesses to D16-D31 if they don't exist. */
2296     if (!dc_isar_feature(aa32_simd_r32, s) &&
2297         ((a->vd | a->vn | a->vm) & 0x10)) {
2298         return false;
2299     }
2300
2301     if (!opfn) {
2302         /* Bad size (including size == 3, which is a different insn group) */
2303         return false;
2304     }
2305
2306     if (a->vd & 1) {
2307         return false;
2308     }
2309
2310     if (!vfp_access_check(s)) {
2311         return true;
2312     }
2313
2314     scalar = neon_get_scalar(a->size, a->vm);
2315
2316     /* Load all inputs before writing any outputs, in case of overlap */
2317     rn = tcg_temp_new_i32();
2318     read_neon_element32(rn, a->vn, 0, MO_32);
2319     rn0_64 = tcg_temp_new_i64();
2320     opfn(rn0_64, rn, scalar);
2321
2322     read_neon_element32(rn, a->vn, 1, MO_32);
2323     rn1_64 = tcg_temp_new_i64();
2324     opfn(rn1_64, rn, scalar);
2325
2326     if (accfn) {
2327         TCGv_i64 t64 = tcg_temp_new_i64();
2328         read_neon_element64(t64, a->vd, 0, MO_64);
2329         accfn(rn0_64, t64, rn0_64);
2330         read_neon_element64(t64, a->vd, 1, MO_64);
2331         accfn(rn1_64, t64, rn1_64);
2332     }
2333
2334     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2335     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2336     return true;
2337 }
2338
2339 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2340 {
2341     static NeonGenTwoOpWidenFn * const opfn[] = {
2342         NULL,
2343         gen_helper_neon_mull_s16,
2344         gen_mull_s32,
2345         NULL,
2346     };
2347
2348     return do_2scalar_long(s, a, opfn[a->size], NULL);
2349 }
2350
2351 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2352 {
2353     static NeonGenTwoOpWidenFn * const opfn[] = {
2354         NULL,
2355         gen_helper_neon_mull_u16,
2356         gen_mull_u32,
2357         NULL,
2358     };
2359
2360     return do_2scalar_long(s, a, opfn[a->size], NULL);
2361 }
2362
2363 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2364     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2365     {                                                                   \
2366         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2367             NULL,                                                       \
2368             gen_helper_neon_##MULL##16,                                 \
2369             gen_##MULL##32,                                             \
2370             NULL,                                                       \
2371         };                                                              \
2372         static NeonGenTwo64OpFn * const accfn[] = {                     \
2373             NULL,                                                       \
2374             gen_helper_neon_##ACC##l_u32,                               \
2375             tcg_gen_##ACC##_i64,                                        \
2376             NULL,                                                       \
2377         };                                                              \
2378         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2379     }
2380
2381 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2382 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2383 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2384 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2385
2386 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2387 {
2388     static NeonGenTwoOpWidenFn * const opfn[] = {
2389         NULL,
2390         gen_VQDMULL_16,
2391         gen_VQDMULL_32,
2392         NULL,
2393     };
2394
2395     return do_2scalar_long(s, a, opfn[a->size], NULL);
2396 }
2397
2398 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2399 {
2400     static NeonGenTwoOpWidenFn * const opfn[] = {
2401         NULL,
2402         gen_VQDMULL_16,
2403         gen_VQDMULL_32,
2404         NULL,
2405     };
2406     static NeonGenTwo64OpFn * const accfn[] = {
2407         NULL,
2408         gen_VQDMLAL_acc_16,
2409         gen_VQDMLAL_acc_32,
2410         NULL,
2411     };
2412
2413     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2414 }
2415
2416 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2417 {
2418     static NeonGenTwoOpWidenFn * const opfn[] = {
2419         NULL,
2420         gen_VQDMULL_16,
2421         gen_VQDMULL_32,
2422         NULL,
2423     };
2424     static NeonGenTwo64OpFn * const accfn[] = {
2425         NULL,
2426         gen_VQDMLSL_acc_16,
2427         gen_VQDMLSL_acc_32,
2428         NULL,
2429     };
2430
2431     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2432 }
2433
2434 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2435 {
2436     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2437         return false;
2438     }
2439
2440     /* UNDEF accesses to D16-D31 if they don't exist. */
2441     if (!dc_isar_feature(aa32_simd_r32, s) &&
2442         ((a->vd | a->vn | a->vm) & 0x10)) {
2443         return false;
2444     }
2445
2446     if ((a->vn | a->vm | a->vd) & a->q) {
2447         return false;
2448     }
2449
2450     if (a->imm > 7 && !a->q) {
2451         return false;
2452     }
2453
2454     if (!vfp_access_check(s)) {
2455         return true;
2456     }
2457
2458     if (!a->q) {
2459         /* Extract 64 bits from <Vm:Vn> */
2460         TCGv_i64 left, right, dest;
2461
2462         left = tcg_temp_new_i64();
2463         right = tcg_temp_new_i64();
2464         dest = tcg_temp_new_i64();
2465
2466         read_neon_element64(right, a->vn, 0, MO_64);
2467         read_neon_element64(left, a->vm, 0, MO_64);
2468         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2469         write_neon_element64(dest, a->vd, 0, MO_64);
2470     } else {
2471         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2472         TCGv_i64 left, middle, right, destleft, destright;
2473
2474         left = tcg_temp_new_i64();
2475         middle = tcg_temp_new_i64();
2476         right = tcg_temp_new_i64();
2477         destleft = tcg_temp_new_i64();
2478         destright = tcg_temp_new_i64();
2479
2480         if (a->imm < 8) {
2481             read_neon_element64(right, a->vn, 0, MO_64);
2482             read_neon_element64(middle, a->vn, 1, MO_64);
2483             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2484             read_neon_element64(left, a->vm, 0, MO_64);
2485             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2486         } else {
2487             read_neon_element64(right, a->vn, 1, MO_64);
2488             read_neon_element64(middle, a->vm, 0, MO_64);
2489             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2490             read_neon_element64(left, a->vm, 1, MO_64);
2491             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2492         }
2493
2494         write_neon_element64(destright, a->vd, 0, MO_64);
2495         write_neon_element64(destleft, a->vd, 1, MO_64);
2496     }
2497     return true;
2498 }
2499
2500 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2501 {
2502     TCGv_i64 val, def;
2503     TCGv_i32 desc;
2504
2505     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2506         return false;
2507     }
2508
2509     /* UNDEF accesses to D16-D31 if they don't exist. */
2510     if (!dc_isar_feature(aa32_simd_r32, s) &&
2511         ((a->vd | a->vn | a->vm) & 0x10)) {
2512         return false;
2513     }
2514
2515     if ((a->vn + a->len + 1) > 32) {
2516         /*
2517          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2518          * helper function running off the end of the register file.
2519          */
2520         return false;
2521     }
2522
2523     if (!vfp_access_check(s)) {
2524         return true;
2525     }
2526
2527     desc = tcg_constant_i32((a->vn << 2) | a->len);
2528     def = tcg_temp_new_i64();
2529     if (a->op) {
2530         read_neon_element64(def, a->vd, 0, MO_64);
2531     } else {
2532         tcg_gen_movi_i64(def, 0);
2533     }
2534     val = tcg_temp_new_i64();
2535     read_neon_element64(val, a->vm, 0, MO_64);
2536
2537     gen_helper_neon_tbl(val, tcg_env, desc, val, def);
2538     write_neon_element64(val, a->vd, 0, MO_64);
2539     return true;
2540 }
2541
2542 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2543 {
2544     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2545         return false;
2546     }
2547
2548     /* UNDEF accesses to D16-D31 if they don't exist. */
2549     if (!dc_isar_feature(aa32_simd_r32, s) &&
2550         ((a->vd | a->vm) & 0x10)) {
2551         return false;
2552     }
2553
2554     if (a->vd & a->q) {
2555         return false;
2556     }
2557
2558     if (!vfp_access_check(s)) {
2559         return true;
2560     }
2561
2562     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2563                          neon_element_offset(a->vm, a->index, a->size),
2564                          a->q ? 16 : 8, a->q ? 16 : 8);
2565     return true;
2566 }
2567
2568 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2569 {
2570     int pass, half;
2571     TCGv_i32 tmp[2];
2572
2573     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2574         return false;
2575     }
2576
2577     /* UNDEF accesses to D16-D31 if they don't exist. */
2578     if (!dc_isar_feature(aa32_simd_r32, s) &&
2579         ((a->vd | a->vm) & 0x10)) {
2580         return false;
2581     }
2582
2583     if ((a->vd | a->vm) & a->q) {
2584         return false;
2585     }
2586
2587     if (a->size == 3) {
2588         return false;
2589     }
2590
2591     if (!vfp_access_check(s)) {
2592         return true;
2593     }
2594
2595     tmp[0] = tcg_temp_new_i32();
2596     tmp[1] = tcg_temp_new_i32();
2597
2598     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2599         for (half = 0; half < 2; half++) {
2600             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2601             switch (a->size) {
2602             case 0:
2603                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2604                 break;
2605             case 1:
2606                 gen_swap_half(tmp[half], tmp[half]);
2607                 break;
2608             case 2:
2609                 break;
2610             default:
2611                 g_assert_not_reached();
2612             }
2613         }
2614         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2615         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2616     }
2617     return true;
2618 }
2619
2620 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2621                               NeonGenWidenFn *widenfn,
2622                               NeonGenTwo64OpFn *opfn,
2623                               NeonGenTwo64OpFn *accfn)
2624 {
2625     /*
2626      * Pairwise long operations: widen both halves of the pair,
2627      * combine the pairs with the opfn, and then possibly accumulate
2628      * into the destination with the accfn.
2629      */
2630     int pass;
2631
2632     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2633         return false;
2634     }
2635
2636     /* UNDEF accesses to D16-D31 if they don't exist. */
2637     if (!dc_isar_feature(aa32_simd_r32, s) &&
2638         ((a->vd | a->vm) & 0x10)) {
2639         return false;
2640     }
2641
2642     if ((a->vd | a->vm) & a->q) {
2643         return false;
2644     }
2645
2646     if (!widenfn) {
2647         return false;
2648     }
2649
2650     if (!vfp_access_check(s)) {
2651         return true;
2652     }
2653
2654     for (pass = 0; pass < a->q + 1; pass++) {
2655         TCGv_i32 tmp;
2656         TCGv_i64 rm0_64, rm1_64, rd_64;
2657
2658         rm0_64 = tcg_temp_new_i64();
2659         rm1_64 = tcg_temp_new_i64();
2660         rd_64 = tcg_temp_new_i64();
2661
2662         tmp = tcg_temp_new_i32();
2663         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2664         widenfn(rm0_64, tmp);
2665         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2666         widenfn(rm1_64, tmp);
2667
2668         opfn(rd_64, rm0_64, rm1_64);
2669
2670         if (accfn) {
2671             TCGv_i64 tmp64 = tcg_temp_new_i64();
2672             read_neon_element64(tmp64, a->vd, pass, MO_64);
2673             accfn(rd_64, tmp64, rd_64);
2674         }
2675         write_neon_element64(rd_64, a->vd, pass, MO_64);
2676     }
2677     return true;
2678 }
2679
2680 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2681 {
2682     static NeonGenWidenFn * const widenfn[] = {
2683         gen_helper_neon_widen_s8,
2684         gen_helper_neon_widen_s16,
2685         tcg_gen_ext_i32_i64,
2686         NULL,
2687     };
2688     static NeonGenTwo64OpFn * const opfn[] = {
2689         gen_helper_neon_paddl_u16,
2690         gen_helper_neon_paddl_u32,
2691         tcg_gen_add_i64,
2692         NULL,
2693     };
2694
2695     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2696 }
2697
2698 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
2699 {
2700     static NeonGenWidenFn * const widenfn[] = {
2701         gen_helper_neon_widen_u8,
2702         gen_helper_neon_widen_u16,
2703         tcg_gen_extu_i32_i64,
2704         NULL,
2705     };
2706     static NeonGenTwo64OpFn * const opfn[] = {
2707         gen_helper_neon_paddl_u16,
2708         gen_helper_neon_paddl_u32,
2709         tcg_gen_add_i64,
2710         NULL,
2711     };
2712
2713     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
2714 }
2715
2716 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
2717 {
2718     static NeonGenWidenFn * const widenfn[] = {
2719         gen_helper_neon_widen_s8,
2720         gen_helper_neon_widen_s16,
2721         tcg_gen_ext_i32_i64,
2722         NULL,
2723     };
2724     static NeonGenTwo64OpFn * const opfn[] = {
2725         gen_helper_neon_paddl_u16,
2726         gen_helper_neon_paddl_u32,
2727         tcg_gen_add_i64,
2728         NULL,
2729     };
2730     static NeonGenTwo64OpFn * const accfn[] = {
2731         gen_helper_neon_addl_u16,
2732         gen_helper_neon_addl_u32,
2733         tcg_gen_add_i64,
2734         NULL,
2735     };
2736
2737     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2738                              accfn[a->size]);
2739 }
2740
2741 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
2742 {
2743     static NeonGenWidenFn * const widenfn[] = {
2744         gen_helper_neon_widen_u8,
2745         gen_helper_neon_widen_u16,
2746         tcg_gen_extu_i32_i64,
2747         NULL,
2748     };
2749     static NeonGenTwo64OpFn * const opfn[] = {
2750         gen_helper_neon_paddl_u16,
2751         gen_helper_neon_paddl_u32,
2752         tcg_gen_add_i64,
2753         NULL,
2754     };
2755     static NeonGenTwo64OpFn * const accfn[] = {
2756         gen_helper_neon_addl_u16,
2757         gen_helper_neon_addl_u32,
2758         tcg_gen_add_i64,
2759         NULL,
2760     };
2761
2762     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
2763                              accfn[a->size]);
2764 }
2765
2766 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
2767
2768 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
2769                        ZipFn *fn)
2770 {
2771     TCGv_ptr pd, pm;
2772
2773     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2774         return false;
2775     }
2776
2777     /* UNDEF accesses to D16-D31 if they don't exist. */
2778     if (!dc_isar_feature(aa32_simd_r32, s) &&
2779         ((a->vd | a->vm) & 0x10)) {
2780         return false;
2781     }
2782
2783     if ((a->vd | a->vm) & a->q) {
2784         return false;
2785     }
2786
2787     if (!fn) {
2788         /* Bad size or size/q combination */
2789         return false;
2790     }
2791
2792     if (!vfp_access_check(s)) {
2793         return true;
2794     }
2795
2796     pd = vfp_reg_ptr(true, a->vd);
2797     pm = vfp_reg_ptr(true, a->vm);
2798     fn(pd, pm);
2799     return true;
2800 }
2801
2802 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
2803 {
2804     static ZipFn * const fn[2][4] = {
2805         {
2806             gen_helper_neon_unzip8,
2807             gen_helper_neon_unzip16,
2808             NULL,
2809             NULL,
2810         }, {
2811             gen_helper_neon_qunzip8,
2812             gen_helper_neon_qunzip16,
2813             gen_helper_neon_qunzip32,
2814             NULL,
2815         }
2816     };
2817     return do_zip_uzp(s, a, fn[a->q][a->size]);
2818 }
2819
2820 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
2821 {
2822     static ZipFn * const fn[2][4] = {
2823         {
2824             gen_helper_neon_zip8,
2825             gen_helper_neon_zip16,
2826             NULL,
2827             NULL,
2828         }, {
2829             gen_helper_neon_qzip8,
2830             gen_helper_neon_qzip16,
2831             gen_helper_neon_qzip32,
2832             NULL,
2833         }
2834     };
2835     return do_zip_uzp(s, a, fn[a->q][a->size]);
2836 }
2837
2838 static bool do_vmovn(DisasContext *s, arg_2misc *a,
2839                      NeonGenOne64OpEnvFn *narrowfn)
2840 {
2841     TCGv_i64 rm, rd0, rd1;
2842
2843     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2844         return false;
2845     }
2846
2847     /* UNDEF accesses to D16-D31 if they don't exist. */
2848     if (!dc_isar_feature(aa32_simd_r32, s) &&
2849         ((a->vd | a->vm) & 0x10)) {
2850         return false;
2851     }
2852
2853     if (a->vm & 1) {
2854         return false;
2855     }
2856
2857     if (!narrowfn) {
2858         return false;
2859     }
2860
2861     if (!vfp_access_check(s)) {
2862         return true;
2863     }
2864
2865     rm = tcg_temp_new_i64();
2866     rd0 = tcg_temp_new_i64();
2867     rd1 = tcg_temp_new_i64();
2868
2869     read_neon_element64(rm, a->vm, 0, MO_64);
2870     narrowfn(rd0, tcg_env, rm);
2871     read_neon_element64(rm, a->vm, 1, MO_64);
2872     narrowfn(rd1, tcg_env, rm);
2873     write_neon_element64(rd0, a->vd, 0, MO_32);
2874     write_neon_element64(rd1, a->vd, 1, MO_32);
2875     return true;
2876 }
2877
2878 #define DO_VMOVN(INSN, FUNC)                                    \
2879     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
2880     {                                                           \
2881         static NeonGenOne64OpEnvFn * const narrowfn[] = {       \
2882             FUNC##8,                                            \
2883             FUNC##16,                                           \
2884             FUNC##32,                                           \
2885             NULL,                                               \
2886         };                                                      \
2887         return do_vmovn(s, a, narrowfn[a->size]);               \
2888     }
2889
2890 DO_VMOVN(VMOVN, gen_neon_narrow_u)
2891 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
2892 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
2893 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
2894
2895 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
2896 {
2897     TCGv_i32 rm0, rm1;
2898     TCGv_i64 rd;
2899     static NeonGenWidenFn * const widenfns[] = {
2900         gen_helper_neon_widen_u8,
2901         gen_helper_neon_widen_u16,
2902         tcg_gen_extu_i32_i64,
2903         NULL,
2904     };
2905     NeonGenWidenFn *widenfn = widenfns[a->size];
2906
2907     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2908         return false;
2909     }
2910
2911     /* UNDEF accesses to D16-D31 if they don't exist. */
2912     if (!dc_isar_feature(aa32_simd_r32, s) &&
2913         ((a->vd | a->vm) & 0x10)) {
2914         return false;
2915     }
2916
2917     if (a->vd & 1) {
2918         return false;
2919     }
2920
2921     if (!widenfn) {
2922         return false;
2923     }
2924
2925     if (!vfp_access_check(s)) {
2926         return true;
2927     }
2928
2929     rd = tcg_temp_new_i64();
2930     rm0 = tcg_temp_new_i32();
2931     rm1 = tcg_temp_new_i32();
2932
2933     read_neon_element32(rm0, a->vm, 0, MO_32);
2934     read_neon_element32(rm1, a->vm, 1, MO_32);
2935
2936     widenfn(rd, rm0);
2937     tcg_gen_shli_i64(rd, rd, 8 << a->size);
2938     write_neon_element64(rd, a->vd, 0, MO_64);
2939     widenfn(rd, rm1);
2940     tcg_gen_shli_i64(rd, rd, 8 << a->size);
2941     write_neon_element64(rd, a->vd, 1, MO_64);
2942     return true;
2943 }
2944
2945 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
2946 {
2947     TCGv_ptr fpst;
2948     TCGv_i64 tmp;
2949     TCGv_i32 dst0, dst1;
2950
2951     if (!dc_isar_feature(aa32_bf16, s)) {
2952         return false;
2953     }
2954
2955     /* UNDEF accesses to D16-D31 if they don't exist. */
2956     if (!dc_isar_feature(aa32_simd_r32, s) &&
2957         ((a->vd | a->vm) & 0x10)) {
2958         return false;
2959     }
2960
2961     if ((a->vm & 1) || (a->size != 1)) {
2962         return false;
2963     }
2964
2965     if (!vfp_access_check(s)) {
2966         return true;
2967     }
2968
2969     fpst = fpstatus_ptr(FPST_STD);
2970     tmp = tcg_temp_new_i64();
2971     dst0 = tcg_temp_new_i32();
2972     dst1 = tcg_temp_new_i32();
2973
2974     read_neon_element64(tmp, a->vm, 0, MO_64);
2975     gen_helper_bfcvt_pair(dst0, tmp, fpst);
2976
2977     read_neon_element64(tmp, a->vm, 1, MO_64);
2978     gen_helper_bfcvt_pair(dst1, tmp, fpst);
2979
2980     write_neon_element32(dst0, a->vd, 0, MO_32);
2981     write_neon_element32(dst1, a->vd, 1, MO_32);
2982     return true;
2983 }
2984
2985 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
2986 {
2987     TCGv_ptr fpst;
2988     TCGv_i32 ahp, tmp, tmp2, tmp3;
2989
2990     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
2991         !dc_isar_feature(aa32_fp16_spconv, s)) {
2992         return false;
2993     }
2994
2995     /* UNDEF accesses to D16-D31 if they don't exist. */
2996     if (!dc_isar_feature(aa32_simd_r32, s) &&
2997         ((a->vd | a->vm) & 0x10)) {
2998         return false;
2999     }
3000
3001     if ((a->vm & 1) || (a->size != 1)) {
3002         return false;
3003     }
3004
3005     if (!vfp_access_check(s)) {
3006         return true;
3007     }
3008
3009     fpst = fpstatus_ptr(FPST_STD);
3010     ahp = get_ahp_flag();
3011     tmp = tcg_temp_new_i32();
3012     read_neon_element32(tmp, a->vm, 0, MO_32);
3013     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3014     tmp2 = tcg_temp_new_i32();
3015     read_neon_element32(tmp2, a->vm, 1, MO_32);
3016     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3017     tcg_gen_shli_i32(tmp2, tmp2, 16);
3018     tcg_gen_or_i32(tmp2, tmp2, tmp);
3019     read_neon_element32(tmp, a->vm, 2, MO_32);
3020     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3021     tmp3 = tcg_temp_new_i32();
3022     read_neon_element32(tmp3, a->vm, 3, MO_32);
3023     write_neon_element32(tmp2, a->vd, 0, MO_32);
3024     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3025     tcg_gen_shli_i32(tmp3, tmp3, 16);
3026     tcg_gen_or_i32(tmp3, tmp3, tmp);
3027     write_neon_element32(tmp3, a->vd, 1, MO_32);
3028     return true;
3029 }
3030
3031 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3032 {
3033     TCGv_ptr fpst;
3034     TCGv_i32 ahp, tmp, tmp2, tmp3;
3035
3036     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3037         !dc_isar_feature(aa32_fp16_spconv, s)) {
3038         return false;
3039     }
3040
3041     /* UNDEF accesses to D16-D31 if they don't exist. */
3042     if (!dc_isar_feature(aa32_simd_r32, s) &&
3043         ((a->vd | a->vm) & 0x10)) {
3044         return false;
3045     }
3046
3047     if ((a->vd & 1) || (a->size != 1)) {
3048         return false;
3049     }
3050
3051     if (!vfp_access_check(s)) {
3052         return true;
3053     }
3054
3055     fpst = fpstatus_ptr(FPST_STD);
3056     ahp = get_ahp_flag();
3057     tmp3 = tcg_temp_new_i32();
3058     tmp2 = tcg_temp_new_i32();
3059     tmp = tcg_temp_new_i32();
3060     read_neon_element32(tmp, a->vm, 0, MO_32);
3061     read_neon_element32(tmp2, a->vm, 1, MO_32);
3062     tcg_gen_ext16u_i32(tmp3, tmp);
3063     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3064     write_neon_element32(tmp3, a->vd, 0, MO_32);
3065     tcg_gen_shri_i32(tmp, tmp, 16);
3066     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3067     write_neon_element32(tmp, a->vd, 1, MO_32);
3068     tcg_gen_ext16u_i32(tmp3, tmp2);
3069     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3070     write_neon_element32(tmp3, a->vd, 2, MO_32);
3071     tcg_gen_shri_i32(tmp2, tmp2, 16);
3072     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3073     write_neon_element32(tmp2, a->vd, 3, MO_32);
3074     return true;
3075 }
3076
3077 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3078 {
3079     int vec_size = a->q ? 16 : 8;
3080     int rd_ofs = neon_full_reg_offset(a->vd);
3081     int rm_ofs = neon_full_reg_offset(a->vm);
3082
3083     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3084         return false;
3085     }
3086
3087     /* UNDEF accesses to D16-D31 if they don't exist. */
3088     if (!dc_isar_feature(aa32_simd_r32, s) &&
3089         ((a->vd | a->vm) & 0x10)) {
3090         return false;
3091     }
3092
3093     if (a->size == 3) {
3094         return false;
3095     }
3096
3097     if ((a->vd | a->vm) & a->q) {
3098         return false;
3099     }
3100
3101     if (!vfp_access_check(s)) {
3102         return true;
3103     }
3104
3105     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3106
3107     return true;
3108 }
3109
3110 #define DO_2MISC_VEC(INSN, FN)                                  \
3111     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3112     {                                                           \
3113         return do_2misc_vec(s, a, FN);                          \
3114     }
3115
3116 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3117 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3118 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3119 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3120 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3121 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3122 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3123
3124 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3125 {
3126     if (a->size != 0) {
3127         return false;
3128     }
3129     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3130 }
3131
3132 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3133     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3134                          uint32_t rm_ofs, uint32_t oprsz,               \
3135                          uint32_t maxsz)                                \
3136     {                                                                   \
3137         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3138                            DATA, FUNC);                                 \
3139     }
3140
3141 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3142     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3143                          uint32_t rm_ofs, uint32_t oprsz,               \
3144                          uint32_t maxsz)                                \
3145     {                                                                   \
3146         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3147     }
3148
3149 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3150 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aesd, 0)
3151 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3152 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesimc, 0)
3153 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3154 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3155 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3156
3157 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3158     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3159     {                                                           \
3160         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3161             return false;                                       \
3162         }                                                       \
3163         return do_2misc_vec(s, a, gen_##INSN);                  \
3164     }
3165
3166 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3167 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3168 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3169 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3170 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3171 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3172 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3173
3174 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3175 {
3176     TCGv_i32 tmp;
3177     int pass;
3178
3179     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3180     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3181         return false;
3182     }
3183
3184     /* UNDEF accesses to D16-D31 if they don't exist. */
3185     if (!dc_isar_feature(aa32_simd_r32, s) &&
3186         ((a->vd | a->vm) & 0x10)) {
3187         return false;
3188     }
3189
3190     if (!fn) {
3191         return false;
3192     }
3193
3194     if ((a->vd | a->vm) & a->q) {
3195         return false;
3196     }
3197
3198     if (!vfp_access_check(s)) {
3199         return true;
3200     }
3201
3202     tmp = tcg_temp_new_i32();
3203     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3204         read_neon_element32(tmp, a->vm, pass, MO_32);
3205         fn(tmp, tmp);
3206         write_neon_element32(tmp, a->vd, pass, MO_32);
3207     }
3208     return true;
3209 }
3210
3211 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3212 {
3213     static NeonGenOneOpFn * const fn[] = {
3214         tcg_gen_bswap32_i32,
3215         gen_swap_half,
3216         NULL,
3217         NULL,
3218     };
3219     return do_2misc(s, a, fn[a->size]);
3220 }
3221
3222 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3223 {
3224     if (a->size != 0) {
3225         return false;
3226     }
3227     return do_2misc(s, a, gen_rev16);
3228 }
3229
3230 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3231 {
3232     static NeonGenOneOpFn * const fn[] = {
3233         gen_helper_neon_cls_s8,
3234         gen_helper_neon_cls_s16,
3235         gen_helper_neon_cls_s32,
3236         NULL,
3237     };
3238     return do_2misc(s, a, fn[a->size]);
3239 }
3240
3241 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3242 {
3243     tcg_gen_clzi_i32(rd, rm, 32);
3244 }
3245
3246 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3247 {
3248     static NeonGenOneOpFn * const fn[] = {
3249         gen_helper_neon_clz_u8,
3250         gen_helper_neon_clz_u16,
3251         do_VCLZ_32,
3252         NULL,
3253     };
3254     return do_2misc(s, a, fn[a->size]);
3255 }
3256
3257 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3258 {
3259     if (a->size != 0) {
3260         return false;
3261     }
3262     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3263 }
3264
3265 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3266                        uint32_t oprsz, uint32_t maxsz)
3267 {
3268     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3269                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3270                       oprsz, maxsz);
3271 }
3272
3273 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3274 {
3275     if (a->size == MO_16) {
3276         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3277             return false;
3278         }
3279     } else if (a->size != MO_32) {
3280         return false;
3281     }
3282     return do_2misc_vec(s, a, gen_VABS_F);
3283 }
3284
3285 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3286                        uint32_t oprsz, uint32_t maxsz)
3287 {
3288     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3289                       vece == MO_16 ? 0x8000 : 0x80000000,
3290                       oprsz, maxsz);
3291 }
3292
3293 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3294 {
3295     if (a->size == MO_16) {
3296         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3297             return false;
3298         }
3299     } else if (a->size != MO_32) {
3300         return false;
3301     }
3302     return do_2misc_vec(s, a, gen_VNEG_F);
3303 }
3304
3305 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3306 {
3307     if (a->size != 2) {
3308         return false;
3309     }
3310     return do_2misc(s, a, gen_helper_recpe_u32);
3311 }
3312
3313 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3314 {
3315     if (a->size != 2) {
3316         return false;
3317     }
3318     return do_2misc(s, a, gen_helper_rsqrte_u32);
3319 }
3320
3321 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3322     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3323     {                                                   \
3324         FUNC(d, tcg_env, m);                            \
3325     }
3326
3327 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3328 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3329 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3330 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3331 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3332 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3333
3334 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3335 {
3336     static NeonGenOneOpFn * const fn[] = {
3337         gen_VQABS_s8,
3338         gen_VQABS_s16,
3339         gen_VQABS_s32,
3340         NULL,
3341     };
3342     return do_2misc(s, a, fn[a->size]);
3343 }
3344
3345 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3346 {
3347     static NeonGenOneOpFn * const fn[] = {
3348         gen_VQNEG_s8,
3349         gen_VQNEG_s16,
3350         gen_VQNEG_s32,
3351         NULL,
3352     };
3353     return do_2misc(s, a, fn[a->size]);
3354 }
3355
3356 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3357     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3358                            uint32_t rm_ofs,                             \
3359                            uint32_t oprsz, uint32_t maxsz)              \
3360     {                                                                   \
3361         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3362             NULL, HFUNC, SFUNC, NULL,                                   \
3363         };                                                              \
3364         TCGv_ptr fpst;                                                  \
3365         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3366         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3367                            fns[vece]);                                  \
3368     }                                                                   \
3369     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3370     {                                                                   \
3371         if (a->size == MO_16) {                                         \
3372             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3373                 return false;                                           \
3374             }                                                           \
3375         } else if (a->size != MO_32) {                                  \
3376             return false;                                               \
3377         }                                                               \
3378         return do_2misc_vec(s, a, gen_##INSN);                          \
3379     }
3380
3381 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3382 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3383 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3384 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3385 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3386 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3387 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3388 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3389 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3390 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3391 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3392
3393 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3394
3395 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3396 {
3397     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3398         return false;
3399     }
3400     return trans_VRINTX_impl(s, a);
3401 }
3402
3403 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3404     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3405                            uint32_t rm_ofs,                             \
3406                            uint32_t oprsz, uint32_t maxsz)              \
3407     {                                                                   \
3408         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3409             NULL,                                                       \
3410             gen_helper_gvec_##OP##h,                                    \
3411             gen_helper_gvec_##OP##s,                                    \
3412             NULL,                                                       \
3413         };                                                              \
3414         TCGv_ptr fpst;                                                  \
3415         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3416         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3417                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3418     }                                                                   \
3419     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3420     {                                                                   \
3421         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3422             return false;                                               \
3423         }                                                               \
3424         if (a->size == MO_16) {                                         \
3425             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3426                 return false;                                           \
3427             }                                                           \
3428         } else if (a->size != MO_32) {                                  \
3429             return false;                                               \
3430         }                                                               \
3431         return do_2misc_vec(s, a, gen_##INSN);                          \
3432     }
3433
3434 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3435 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3436 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3437 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3438 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3439 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3440 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3441 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3442
3443 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3444 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3445 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3446 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3447 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3448
3449 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3450 {
3451     TCGv_i64 rm, rd;
3452     int pass;
3453
3454     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3455         return false;
3456     }
3457
3458     /* UNDEF accesses to D16-D31 if they don't exist. */
3459     if (!dc_isar_feature(aa32_simd_r32, s) &&
3460         ((a->vd | a->vm) & 0x10)) {
3461         return false;
3462     }
3463
3464     if (a->size != 0) {
3465         return false;
3466     }
3467
3468     if ((a->vd | a->vm) & a->q) {
3469         return false;
3470     }
3471
3472     if (!vfp_access_check(s)) {
3473         return true;
3474     }
3475
3476     rm = tcg_temp_new_i64();
3477     rd = tcg_temp_new_i64();
3478     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3479         read_neon_element64(rm, a->vm, pass, MO_64);
3480         read_neon_element64(rd, a->vd, pass, MO_64);
3481         write_neon_element64(rm, a->vd, pass, MO_64);
3482         write_neon_element64(rd, a->vm, pass, MO_64);
3483     }
3484     return true;
3485 }
3486
3487 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3488 {
3489     TCGv_i32 rd, tmp;
3490
3491     rd = tcg_temp_new_i32();
3492     tmp = tcg_temp_new_i32();
3493
3494     tcg_gen_shli_i32(rd, t0, 8);
3495     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3496     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3497     tcg_gen_or_i32(rd, rd, tmp);
3498
3499     tcg_gen_shri_i32(t1, t1, 8);
3500     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3501     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3502     tcg_gen_or_i32(t1, t1, tmp);
3503     tcg_gen_mov_i32(t0, rd);
3504 }
3505
3506 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3507 {
3508     TCGv_i32 rd, tmp;
3509
3510     rd = tcg_temp_new_i32();
3511     tmp = tcg_temp_new_i32();
3512
3513     tcg_gen_shli_i32(rd, t0, 16);
3514     tcg_gen_andi_i32(tmp, t1, 0xffff);
3515     tcg_gen_or_i32(rd, rd, tmp);
3516     tcg_gen_shri_i32(t1, t1, 16);
3517     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3518     tcg_gen_or_i32(t1, t1, tmp);
3519     tcg_gen_mov_i32(t0, rd);
3520 }
3521
3522 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3523 {
3524     TCGv_i32 tmp, tmp2;
3525     int pass;
3526
3527     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3528         return false;
3529     }
3530
3531     /* UNDEF accesses to D16-D31 if they don't exist. */
3532     if (!dc_isar_feature(aa32_simd_r32, s) &&
3533         ((a->vd | a->vm) & 0x10)) {
3534         return false;
3535     }
3536
3537     if ((a->vd | a->vm) & a->q) {
3538         return false;
3539     }
3540
3541     if (a->size == 3) {
3542         return false;
3543     }
3544
3545     if (!vfp_access_check(s)) {
3546         return true;
3547     }
3548
3549     tmp = tcg_temp_new_i32();
3550     tmp2 = tcg_temp_new_i32();
3551     if (a->size == MO_32) {
3552         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3553             read_neon_element32(tmp, a->vm, pass, MO_32);
3554             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3555             write_neon_element32(tmp2, a->vm, pass, MO_32);
3556             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3557         }
3558     } else {
3559         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3560             read_neon_element32(tmp, a->vm, pass, MO_32);
3561             read_neon_element32(tmp2, a->vd, pass, MO_32);
3562             if (a->size == MO_8) {
3563                 gen_neon_trn_u8(tmp, tmp2);
3564             } else {
3565                 gen_neon_trn_u16(tmp, tmp2);
3566             }
3567             write_neon_element32(tmp2, a->vm, pass, MO_32);
3568             write_neon_element32(tmp, a->vd, pass, MO_32);
3569         }
3570     }
3571     return true;
3572 }
3573
3574 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
3575 {
3576     if (!dc_isar_feature(aa32_i8mm, s)) {
3577         return false;
3578     }
3579     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3580                         gen_helper_gvec_smmla_b);
3581 }
3582
3583 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
3584 {
3585     if (!dc_isar_feature(aa32_i8mm, s)) {
3586         return false;
3587     }
3588     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3589                         gen_helper_gvec_ummla_b);
3590 }
3591
3592 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
3593 {
3594     if (!dc_isar_feature(aa32_i8mm, s)) {
3595         return false;
3596     }
3597     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3598                         gen_helper_gvec_usmmla_b);
3599 }
3600
3601 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
3602 {
3603     if (!dc_isar_feature(aa32_bf16, s)) {
3604         return false;
3605     }
3606     return do_neon_ddda_env(s, 7, a->vd, a->vn, a->vm, 0,
3607                             gen_helper_gvec_bfmmla);
3608 }
3609
3610 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
3611 {
3612     if (!dc_isar_feature(aa32_bf16, s)) {
3613         return false;
3614     }
3615     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
3616                              gen_helper_gvec_bfmlal);
3617 }
3618
3619 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
3620 {
3621     if (!dc_isar_feature(aa32_bf16, s)) {
3622         return false;
3623     }
3624     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
3625                              (a->index << 1) | a->q, FPST_STD,
3626                              gen_helper_gvec_bfmlal_idx);
3627 }