1 /* -*- mode: C; c-basic-offset: 3; -*- */
3 /*--------------------------------------------------------------------*/
4 /*--- begin guest_arm64_toIR.c ---*/
5 /*--------------------------------------------------------------------*/
8 This file is part of Valgrind, a dynamic binary instrumentation
11 Copyright (C) 2013-2017 OpenWorks
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 /* KNOWN LIMITATIONS 2014-Nov-16
32 * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
34 Also FP comparison "unordered" .. is implemented as normal FP
37 Both should be fixed. They behave incorrectly in the presence of
40 FMULX is treated the same as FMUL. That's also not correct.
42 * Floating multiply-add (etc) insns. Are split into a multiply and
43 an add, and so suffer double rounding and hence sometimes the
44 least significant mantissa bit is incorrect. Fix: use the IR
45 multiply-add IROps instead.
47 * FRINTA, FRINTN are kludged .. they just round to nearest. No special
48 handling for the "ties" case. FRINTX might be dubious too.
50 * Ditto FCVTXN. No idea what "round to odd" means. This implementation
51 just rounds to nearest.
54 /* "Special" instructions.
56 This instruction decoder can decode four special instructions
57 which mean nothing natively (are no-ops as far as regs/mem are
58 concerned) but have meaning for supporting Valgrind. A special
59 instruction is flagged by a 16-byte preamble:
61 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
62 (ror x12, x12, #3; ror x12, x12, #13
63 ror x12, x12, #51; ror x12, x12, #61)
65 Following that, one of the following 3 are allowed
66 (standard interpretation in parentheses):
68 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
69 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
70 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
71 AA090129 (orr x9,x9,x9) IR injection
73 Any other bytes following the 16-byte preamble are illegal and
74 constitute a failure in instruction decoding. This all assumes
75 that the preamble will never occur except in specific code
76 fragments designed for Valgrind to catch.
79 /* Translates ARM64 code to IR. */
81 #include "libvex_basictypes.h"
82 #include "libvex_ir.h"
84 #include "libvex_guest_arm64.h"
86 #include "main_util.h"
87 #include "main_globals.h"
88 #include "guest_generic_bb_to_IR.h"
89 #include "guest_arm64_defs.h"
92 /*------------------------------------------------------------*/
94 /*------------------------------------------------------------*/
96 /* These are set at the start of the translation of a instruction, so
97 that we don't have to pass them around endlessly. CONST means does
98 not change during translation of the instruction.
101 /* CONST: what is the host's endianness? We need to know this in
102 order to do sub-register accesses to the SIMD/FP registers
104 static VexEndness host_endness
;
106 /* CONST: The guest address for the instruction currently being
108 static Addr64 guest_PC_curr_instr
;
110 /* MOD: The IRSB* into which we're generating code. */
114 /*------------------------------------------------------------*/
115 /*--- Debugging output ---*/
116 /*------------------------------------------------------------*/
118 #define DIP(format, args...) \
119 if (vex_traceflags & VEX_TRACE_FE) \
120 vex_printf(format, ## args)
122 #define DIS(buf, format, args...) \
123 if (vex_traceflags & VEX_TRACE_FE) \
124 vex_sprintf(buf, format, ## args)
127 /*------------------------------------------------------------*/
128 /*--- Helper bits and pieces for deconstructing the ---*/
129 /*--- arm insn stream. ---*/
130 /*------------------------------------------------------------*/
132 /* Do a little-endian load of a 32-bit word, regardless of the
133 endianness of the underlying host. */
134 static inline UInt
getUIntLittleEndianly ( const UChar
* p
)
144 /* Sign extend a N-bit value up to 64 bits, by copying
145 bit N-1 into all higher positions. */
146 static ULong
sx_to_64 ( ULong x
, UInt n
)
148 vassert(n
> 1 && n
< 64);
155 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
156 //ZZ endianness of the underlying host. */
157 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
160 //ZZ w = (w << 8) | p[1];
161 //ZZ w = (w << 8) | p[0];
165 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
166 //ZZ vassert(sh >= 0 && sh < 32);
170 //ZZ return (x << (32-sh)) | (x >> sh);
173 //ZZ static Int popcount32 ( UInt x )
176 //ZZ for (i = 0; i < 32; i++) {
183 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
185 //ZZ UInt mask = 1 << ix;
187 //ZZ x |= ((b << ix) & mask);
191 #define BITS2(_b1,_b0) \
192 (((_b1) << 1) | (_b0))
194 #define BITS3(_b2,_b1,_b0) \
195 (((_b2) << 2) | ((_b1) << 1) | (_b0))
197 #define BITS4(_b3,_b2,_b1,_b0) \
198 (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
200 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
201 ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
202 | BITS4((_b3),(_b2),(_b1),(_b0)))
204 #define BITS5(_b4,_b3,_b2,_b1,_b0) \
205 (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
206 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
207 (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
208 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
209 (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
211 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
213 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
215 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
216 (((_b9) << 9) | ((_b8) << 8) \
217 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
219 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
221 | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
223 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
225 | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
227 #define X00 BITS2(0,0)
228 #define X01 BITS2(0,1)
229 #define X10 BITS2(1,0)
230 #define X11 BITS2(1,1)
232 // produces _uint[_bMax:_bMin]
233 #define SLICE_UInt(_uint,_bMax,_bMin) \
234 (( ((UInt)(_uint)) >> (_bMin)) \
235 & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
238 /*------------------------------------------------------------*/
239 /*--- Helper bits and pieces for creating IR fragments. ---*/
240 /*------------------------------------------------------------*/
242 static IRExpr
* mkV128 ( UShort w
)
244 return IRExpr_Const(IRConst_V128(w
));
247 static IRExpr
* mkU64 ( ULong i
)
249 return IRExpr_Const(IRConst_U64(i
));
252 static IRExpr
* mkU32 ( UInt i
)
254 return IRExpr_Const(IRConst_U32(i
));
257 static IRExpr
* mkU16 ( UInt i
)
260 return IRExpr_Const(IRConst_U16(i
));
263 static IRExpr
* mkU8 ( UInt i
)
266 return IRExpr_Const(IRConst_U8( (UChar
)i
));
269 static IRExpr
* mkexpr ( IRTemp tmp
)
271 return IRExpr_RdTmp(tmp
);
274 static IRExpr
* unop ( IROp op
, IRExpr
* a
)
276 return IRExpr_Unop(op
, a
);
279 static IRExpr
* binop ( IROp op
, IRExpr
* a1
, IRExpr
* a2
)
281 return IRExpr_Binop(op
, a1
, a2
);
284 static IRExpr
* triop ( IROp op
, IRExpr
* a1
, IRExpr
* a2
, IRExpr
* a3
)
286 return IRExpr_Triop(op
, a1
, a2
, a3
);
289 static IRExpr
* qop ( IROp op
, IRExpr
* a1
, IRExpr
* a2
,
290 IRExpr
* a3
, IRExpr
* a4
)
292 return IRExpr_Qop(op
, a1
, a2
, a3
, a4
);
295 static IRExpr
* loadLE ( IRType ty
, IRExpr
* addr
)
297 return IRExpr_Load(Iend_LE
, ty
, addr
);
300 /* Add a statement to the list held by "irbb". */
301 static void stmt ( IRStmt
* st
)
303 addStmtToIRSB( irsb
, st
);
306 static void assign ( IRTemp dst
, IRExpr
* e
)
308 stmt( IRStmt_WrTmp(dst
, e
) );
311 static void storeLE ( IRExpr
* addr
, IRExpr
* data
)
313 stmt( IRStmt_Store(Iend_LE
, addr
, data
) );
316 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
318 //ZZ if (guardT == IRTemp_INVALID) {
319 //ZZ /* unconditional */
320 //ZZ storeLE(addr, data);
322 //ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data,
323 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
327 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
328 //ZZ IRExpr* addr, IRExpr* alt,
329 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
331 //ZZ if (guardT == IRTemp_INVALID) {
332 //ZZ /* unconditional */
333 //ZZ IRExpr* loaded = NULL;
335 //ZZ case ILGop_Ident32:
336 //ZZ loaded = loadLE(Ity_I32, addr); break;
337 //ZZ case ILGop_8Uto32:
338 //ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
339 //ZZ case ILGop_8Sto32:
340 //ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
341 //ZZ case ILGop_16Uto32:
342 //ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
343 //ZZ case ILGop_16Sto32:
344 //ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
348 //ZZ vassert(loaded != NULL);
349 //ZZ assign(dst, loaded);
351 //ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the
352 //ZZ loaded data before putting the data in 'dst'. If the load
353 //ZZ does not take place, 'alt' is placed directly in 'dst'. */
354 //ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
355 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
359 /* Generate a new temporary of the given type. */
360 static IRTemp
newTemp ( IRType ty
)
362 vassert(isPlausibleIRType(ty
));
363 return newIRTemp( irsb
->tyenv
, ty
);
366 /* This is used in many places, so the brevity is an advantage. */
367 static IRTemp
newTempV128(void)
369 return newTemp(Ity_V128
);
372 /* Initialise V128 temporaries en masse. */
374 void newTempsV128_2(IRTemp
* t1
, IRTemp
* t2
)
376 vassert(t1
&& *t1
== IRTemp_INVALID
);
377 vassert(t2
&& *t2
== IRTemp_INVALID
);
383 void newTempsV128_3(IRTemp
* t1
, IRTemp
* t2
, IRTemp
* t3
)
385 vassert(t1
&& *t1
== IRTemp_INVALID
);
386 vassert(t2
&& *t2
== IRTemp_INVALID
);
387 vassert(t3
&& *t3
== IRTemp_INVALID
);
394 void newTempsV128_4(IRTemp
* t1
, IRTemp
* t2
, IRTemp
* t3
, IRTemp
* t4
)
396 vassert(t1
&& *t1
== IRTemp_INVALID
);
397 vassert(t2
&& *t2
== IRTemp_INVALID
);
398 vassert(t3
&& *t3
== IRTemp_INVALID
);
399 vassert(t4
&& *t4
== IRTemp_INVALID
);
407 void newTempsV128_7(IRTemp
* t1
, IRTemp
* t2
, IRTemp
* t3
,
408 IRTemp
* t4
, IRTemp
* t5
, IRTemp
* t6
, IRTemp
* t7
)
410 vassert(t1
&& *t1
== IRTemp_INVALID
);
411 vassert(t2
&& *t2
== IRTemp_INVALID
);
412 vassert(t3
&& *t3
== IRTemp_INVALID
);
413 vassert(t4
&& *t4
== IRTemp_INVALID
);
414 vassert(t5
&& *t5
== IRTemp_INVALID
);
415 vassert(t6
&& *t6
== IRTemp_INVALID
);
416 vassert(t7
&& *t7
== IRTemp_INVALID
);
426 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
427 //ZZ IRRoundingMode. */
428 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
430 //ZZ return mkU32(Irrm_NEAREST);
433 //ZZ /* Generate an expression for SRC rotated right by ROT. */
434 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
436 //ZZ vassert(rot >= 0 && rot < 32);
438 //ZZ return mkexpr(src);
441 //ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
442 //ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
445 //ZZ static IRExpr* mkU128 ( ULong i )
447 //ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
450 //ZZ /* Generate a 4-aligned version of the given expression if
451 //ZZ the given condition is true. Else return it unchanged. */
452 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
455 //ZZ return binop(Iop_And32, e, mkU32(~3));
460 /* Other IR construction helpers. */
461 static IROp
mkAND ( IRType ty
) {
463 case Ity_I32
: return Iop_And32
;
464 case Ity_I64
: return Iop_And64
;
465 default: vpanic("mkAND");
469 static IROp
mkOR ( IRType ty
) {
471 case Ity_I32
: return Iop_Or32
;
472 case Ity_I64
: return Iop_Or64
;
473 default: vpanic("mkOR");
477 static IROp
mkXOR ( IRType ty
) {
479 case Ity_I32
: return Iop_Xor32
;
480 case Ity_I64
: return Iop_Xor64
;
481 default: vpanic("mkXOR");
485 static IROp
mkSHL ( IRType ty
) {
487 case Ity_I32
: return Iop_Shl32
;
488 case Ity_I64
: return Iop_Shl64
;
489 default: vpanic("mkSHL");
493 static IROp
mkSHR ( IRType ty
) {
495 case Ity_I32
: return Iop_Shr32
;
496 case Ity_I64
: return Iop_Shr64
;
497 default: vpanic("mkSHR");
501 static IROp
mkSAR ( IRType ty
) {
503 case Ity_I32
: return Iop_Sar32
;
504 case Ity_I64
: return Iop_Sar64
;
505 default: vpanic("mkSAR");
509 static IROp
mkNOT ( IRType ty
) {
511 case Ity_I32
: return Iop_Not32
;
512 case Ity_I64
: return Iop_Not64
;
513 default: vpanic("mkNOT");
517 static IROp
mkADD ( IRType ty
) {
519 case Ity_I32
: return Iop_Add32
;
520 case Ity_I64
: return Iop_Add64
;
521 default: vpanic("mkADD");
525 static IROp
mkSUB ( IRType ty
) {
527 case Ity_I32
: return Iop_Sub32
;
528 case Ity_I64
: return Iop_Sub64
;
529 default: vpanic("mkSUB");
533 static IROp
mkADDF ( IRType ty
) {
535 case Ity_F16
: return Iop_AddF16
;
536 case Ity_F32
: return Iop_AddF32
;
537 case Ity_F64
: return Iop_AddF64
;
538 default: vpanic("mkADDF");
542 static IROp
mkFMADDF ( IRType ty
) {
544 case Ity_F32
: return Iop_MAddF32
;
545 case Ity_F64
: return Iop_MAddF64
;
546 default: vpanic("mkFMADDF");
550 static IROp
mkFMSUBF ( IRType ty
) {
552 case Ity_F32
: return Iop_MSubF32
;
553 case Ity_F64
: return Iop_MSubF64
;
554 default: vpanic("mkFMSUBF");
558 static IROp
mkSUBF ( IRType ty
) {
560 case Ity_F16
: return Iop_SubF16
;
561 case Ity_F32
: return Iop_SubF32
;
562 case Ity_F64
: return Iop_SubF64
;
563 default: vpanic("mkSUBF");
567 static IROp
mkMULF ( IRType ty
) {
569 case Ity_F32
: return Iop_MulF32
;
570 case Ity_F64
: return Iop_MulF64
;
571 default: vpanic("mkMULF");
575 static IROp
mkDIVF ( IRType ty
) {
577 case Ity_F32
: return Iop_DivF32
;
578 case Ity_F64
: return Iop_DivF64
;
579 default: vpanic("mkDIVF");
583 static IROp
mkNEGF ( IRType ty
) {
585 case Ity_F16
: return Iop_NegF16
;
586 case Ity_F32
: return Iop_NegF32
;
587 case Ity_F64
: return Iop_NegF64
;
588 default: vpanic("mkNEGF");
592 static IROp
mkABSF ( IRType ty
) {
594 case Ity_F16
: return Iop_AbsF16
;
595 case Ity_F32
: return Iop_AbsF32
;
596 case Ity_F64
: return Iop_AbsF64
;
597 default: vpanic("mkABSF");
601 static IROp
mkSQRTF ( IRType ty
) {
603 case Ity_F16
: return Iop_SqrtF16
;
604 case Ity_F32
: return Iop_SqrtF32
;
605 case Ity_F64
: return Iop_SqrtF64
;
606 default: vpanic("mkSQRTF");
610 static IROp
mkVecADD ( UInt size
) {
612 = { Iop_Add8x16
, Iop_Add16x8
, Iop_Add32x4
, Iop_Add64x2
};
617 static IROp
mkVecQADDU ( UInt size
) {
619 = { Iop_QAdd8Ux16
, Iop_QAdd16Ux8
, Iop_QAdd32Ux4
, Iop_QAdd64Ux2
};
624 static IROp
mkVecQADDS ( UInt size
) {
626 = { Iop_QAdd8Sx16
, Iop_QAdd16Sx8
, Iop_QAdd32Sx4
, Iop_QAdd64Sx2
};
631 static IROp
mkVecQADDEXTSUSATUU ( UInt size
) {
633 = { Iop_QAddExtSUsatUU8x16
, Iop_QAddExtSUsatUU16x8
,
634 Iop_QAddExtSUsatUU32x4
, Iop_QAddExtSUsatUU64x2
};
639 static IROp
mkVecQADDEXTUSSATSS ( UInt size
) {
641 = { Iop_QAddExtUSsatSS8x16
, Iop_QAddExtUSsatSS16x8
,
642 Iop_QAddExtUSsatSS32x4
, Iop_QAddExtUSsatSS64x2
};
647 static IROp
mkVecSUB ( UInt size
) {
649 = { Iop_Sub8x16
, Iop_Sub16x8
, Iop_Sub32x4
, Iop_Sub64x2
};
654 static IROp
mkVecQSUBU ( UInt size
) {
656 = { Iop_QSub8Ux16
, Iop_QSub16Ux8
, Iop_QSub32Ux4
, Iop_QSub64Ux2
};
661 static IROp
mkVecQSUBS ( UInt size
) {
663 = { Iop_QSub8Sx16
, Iop_QSub16Sx8
, Iop_QSub32Sx4
, Iop_QSub64Sx2
};
668 static IROp
mkVecSARN ( UInt size
) {
670 = { Iop_SarN8x16
, Iop_SarN16x8
, Iop_SarN32x4
, Iop_SarN64x2
};
675 static IROp
mkVecSHRN ( UInt size
) {
677 = { Iop_ShrN8x16
, Iop_ShrN16x8
, Iop_ShrN32x4
, Iop_ShrN64x2
};
682 static IROp
mkVecSHLN ( UInt size
) {
684 = { Iop_ShlN8x16
, Iop_ShlN16x8
, Iop_ShlN32x4
, Iop_ShlN64x2
};
689 static IROp
mkVecCATEVENLANES ( UInt size
) {
691 = { Iop_CatEvenLanes8x16
, Iop_CatEvenLanes16x8
,
692 Iop_CatEvenLanes32x4
, Iop_InterleaveLO64x2
};
697 static IROp
mkVecCATODDLANES ( UInt size
) {
699 = { Iop_CatOddLanes8x16
, Iop_CatOddLanes16x8
,
700 Iop_CatOddLanes32x4
, Iop_InterleaveHI64x2
};
705 static IROp
mkVecINTERLEAVELO ( UInt size
) {
707 = { Iop_InterleaveLO8x16
, Iop_InterleaveLO16x8
,
708 Iop_InterleaveLO32x4
, Iop_InterleaveLO64x2
};
713 static IROp
mkVecINTERLEAVEHI ( UInt size
) {
715 = { Iop_InterleaveHI8x16
, Iop_InterleaveHI16x8
,
716 Iop_InterleaveHI32x4
, Iop_InterleaveHI64x2
};
721 static IROp
mkVecMAXU ( UInt size
) {
723 = { Iop_Max8Ux16
, Iop_Max16Ux8
, Iop_Max32Ux4
, Iop_Max64Ux2
};
728 static IROp
mkVecMAXS ( UInt size
) {
730 = { Iop_Max8Sx16
, Iop_Max16Sx8
, Iop_Max32Sx4
, Iop_Max64Sx2
};
735 static IROp
mkVecMINU ( UInt size
) {
737 = { Iop_Min8Ux16
, Iop_Min16Ux8
, Iop_Min32Ux4
, Iop_Min64Ux2
};
742 static IROp
mkVecMINS ( UInt size
) {
744 = { Iop_Min8Sx16
, Iop_Min16Sx8
, Iop_Min32Sx4
, Iop_Min64Sx2
};
749 static IROp
mkVecMUL ( UInt size
) {
751 = { Iop_Mul8x16
, Iop_Mul16x8
, Iop_Mul32x4
, Iop_INVALID
};
756 static IROp
mkVecMULLU ( UInt sizeNarrow
) {
758 = { Iop_Mull8Ux8
, Iop_Mull16Ux4
, Iop_Mull32Ux2
, Iop_INVALID
};
759 vassert(sizeNarrow
< 3);
760 return ops
[sizeNarrow
];
763 static IROp
mkVecMULLS ( UInt sizeNarrow
) {
765 = { Iop_Mull8Sx8
, Iop_Mull16Sx4
, Iop_Mull32Sx2
, Iop_INVALID
};
766 vassert(sizeNarrow
< 3);
767 return ops
[sizeNarrow
];
770 static IROp
mkVecQDMULLS ( UInt sizeNarrow
) {
772 = { Iop_INVALID
, Iop_QDMull16Sx4
, Iop_QDMull32Sx2
, Iop_INVALID
};
773 vassert(sizeNarrow
< 3);
774 return ops
[sizeNarrow
];
777 static IROp
mkVecCMPEQ ( UInt size
) {
779 = { Iop_CmpEQ8x16
, Iop_CmpEQ16x8
, Iop_CmpEQ32x4
, Iop_CmpEQ64x2
};
784 static IROp
mkVecCMPGTU ( UInt size
) {
786 = { Iop_CmpGT8Ux16
, Iop_CmpGT16Ux8
, Iop_CmpGT32Ux4
, Iop_CmpGT64Ux2
};
791 static IROp
mkVecCMPGTS ( UInt size
) {
793 = { Iop_CmpGT8Sx16
, Iop_CmpGT16Sx8
, Iop_CmpGT32Sx4
, Iop_CmpGT64Sx2
};
798 static IROp
mkVecABS ( UInt size
) {
800 = { Iop_Abs8x16
, Iop_Abs16x8
, Iop_Abs32x4
, Iop_Abs64x2
};
805 static IROp
mkVecZEROHIxxOFV128 ( UInt size
) {
807 = { Iop_ZeroHI120ofV128
, Iop_ZeroHI112ofV128
,
808 Iop_ZeroHI96ofV128
, Iop_ZeroHI64ofV128
};
813 static IRExpr
* mkU ( IRType ty
, ULong imm
) {
815 case Ity_I32
: return mkU32((UInt
)(imm
& 0xFFFFFFFFULL
));
816 case Ity_I64
: return mkU64(imm
);
817 default: vpanic("mkU");
821 static IROp
mkVecQDMULHIS ( UInt size
) {
823 = { Iop_INVALID
, Iop_QDMulHi16Sx8
, Iop_QDMulHi32Sx4
, Iop_INVALID
};
828 static IROp
mkVecQRDMULHIS ( UInt size
) {
830 = { Iop_INVALID
, Iop_QRDMulHi16Sx8
, Iop_QRDMulHi32Sx4
, Iop_INVALID
};
835 static IROp
mkVecQANDUQSH ( UInt size
) {
837 = { Iop_QandUQsh8x16
, Iop_QandUQsh16x8
,
838 Iop_QandUQsh32x4
, Iop_QandUQsh64x2
};
843 static IROp
mkVecQANDSQSH ( UInt size
) {
845 = { Iop_QandSQsh8x16
, Iop_QandSQsh16x8
,
846 Iop_QandSQsh32x4
, Iop_QandSQsh64x2
};
851 static IROp
mkVecQANDUQRSH ( UInt size
) {
853 = { Iop_QandUQRsh8x16
, Iop_QandUQRsh16x8
,
854 Iop_QandUQRsh32x4
, Iop_QandUQRsh64x2
};
859 static IROp
mkVecQANDSQRSH ( UInt size
) {
861 = { Iop_QandSQRsh8x16
, Iop_QandSQRsh16x8
,
862 Iop_QandSQRsh32x4
, Iop_QandSQRsh64x2
};
867 static IROp
mkVecSHU ( UInt size
) {
869 = { Iop_Sh8Ux16
, Iop_Sh16Ux8
, Iop_Sh32Ux4
, Iop_Sh64Ux2
};
874 static IROp
mkVecSHS ( UInt size
) {
876 = { Iop_Sh8Sx16
, Iop_Sh16Sx8
, Iop_Sh32Sx4
, Iop_Sh64Sx2
};
881 static IROp
mkVecRSHU ( UInt size
) {
883 = { Iop_Rsh8Ux16
, Iop_Rsh16Ux8
, Iop_Rsh32Ux4
, Iop_Rsh64Ux2
};
888 static IROp
mkVecRSHS ( UInt size
) {
890 = { Iop_Rsh8Sx16
, Iop_Rsh16Sx8
, Iop_Rsh32Sx4
, Iop_Rsh64Sx2
};
895 static IROp
mkVecNARROWUN ( UInt sizeNarrow
) {
897 = { Iop_NarrowUn16to8x8
, Iop_NarrowUn32to16x4
,
898 Iop_NarrowUn64to32x2
, Iop_INVALID
};
899 vassert(sizeNarrow
< 4);
900 return ops
[sizeNarrow
];
903 static IROp
mkVecQNARROWUNSU ( UInt sizeNarrow
) {
905 = { Iop_QNarrowUn16Sto8Ux8
, Iop_QNarrowUn32Sto16Ux4
,
906 Iop_QNarrowUn64Sto32Ux2
, Iop_INVALID
};
907 vassert(sizeNarrow
< 4);
908 return ops
[sizeNarrow
];
911 static IROp
mkVecQNARROWUNSS ( UInt sizeNarrow
) {
913 = { Iop_QNarrowUn16Sto8Sx8
, Iop_QNarrowUn32Sto16Sx4
,
914 Iop_QNarrowUn64Sto32Sx2
, Iop_INVALID
};
915 vassert(sizeNarrow
< 4);
916 return ops
[sizeNarrow
];
919 static IROp
mkVecQNARROWUNUU ( UInt sizeNarrow
) {
921 = { Iop_QNarrowUn16Uto8Ux8
, Iop_QNarrowUn32Uto16Ux4
,
922 Iop_QNarrowUn64Uto32Ux2
, Iop_INVALID
};
923 vassert(sizeNarrow
< 4);
924 return ops
[sizeNarrow
];
927 static IROp
mkVecQANDqshrNNARROWUU ( UInt sizeNarrow
) {
929 = { Iop_QandQShrNnarrow16Uto8Ux8
, Iop_QandQShrNnarrow32Uto16Ux4
,
930 Iop_QandQShrNnarrow64Uto32Ux2
, Iop_INVALID
};
931 vassert(sizeNarrow
< 4);
932 return ops
[sizeNarrow
];
935 static IROp
mkVecQANDqsarNNARROWSS ( UInt sizeNarrow
) {
937 = { Iop_QandQSarNnarrow16Sto8Sx8
, Iop_QandQSarNnarrow32Sto16Sx4
,
938 Iop_QandQSarNnarrow64Sto32Sx2
, Iop_INVALID
};
939 vassert(sizeNarrow
< 4);
940 return ops
[sizeNarrow
];
943 static IROp
mkVecQANDqsarNNARROWSU ( UInt sizeNarrow
) {
945 = { Iop_QandQSarNnarrow16Sto8Ux8
, Iop_QandQSarNnarrow32Sto16Ux4
,
946 Iop_QandQSarNnarrow64Sto32Ux2
, Iop_INVALID
};
947 vassert(sizeNarrow
< 4);
948 return ops
[sizeNarrow
];
951 static IROp
mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow
) {
953 = { Iop_QandQRShrNnarrow16Uto8Ux8
, Iop_QandQRShrNnarrow32Uto16Ux4
,
954 Iop_QandQRShrNnarrow64Uto32Ux2
, Iop_INVALID
};
955 vassert(sizeNarrow
< 4);
956 return ops
[sizeNarrow
];
959 static IROp
mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow
) {
961 = { Iop_QandQRSarNnarrow16Sto8Sx8
, Iop_QandQRSarNnarrow32Sto16Sx4
,
962 Iop_QandQRSarNnarrow64Sto32Sx2
, Iop_INVALID
};
963 vassert(sizeNarrow
< 4);
964 return ops
[sizeNarrow
];
967 static IROp
mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow
) {
969 = { Iop_QandQRSarNnarrow16Sto8Ux8
, Iop_QandQRSarNnarrow32Sto16Ux4
,
970 Iop_QandQRSarNnarrow64Sto32Ux2
, Iop_INVALID
};
971 vassert(sizeNarrow
< 4);
972 return ops
[sizeNarrow
];
975 static IROp
mkVecQSHLNSATUU ( UInt size
) {
977 = { Iop_QShlNsatUU8x16
, Iop_QShlNsatUU16x8
,
978 Iop_QShlNsatUU32x4
, Iop_QShlNsatUU64x2
};
983 static IROp
mkVecQSHLNSATSS ( UInt size
) {
985 = { Iop_QShlNsatSS8x16
, Iop_QShlNsatSS16x8
,
986 Iop_QShlNsatSS32x4
, Iop_QShlNsatSS64x2
};
991 static IROp
mkVecQSHLNSATSU ( UInt size
) {
993 = { Iop_QShlNsatSU8x16
, Iop_QShlNsatSU16x8
,
994 Iop_QShlNsatSU32x4
, Iop_QShlNsatSU64x2
};
999 static IROp
mkVecADDF ( UInt size
) {
1001 = { Iop_INVALID
, Iop_Add16Fx8
, Iop_Add32Fx4
, Iop_Add64Fx2
};
1006 static IROp
mkVecMAXF ( UInt size
) {
1008 = { Iop_INVALID
, Iop_INVALID
, Iop_Max32Fx4
, Iop_Max64Fx2
};
1013 static IROp
mkVecMINF ( UInt size
) {
1015 = { Iop_INVALID
, Iop_INVALID
, Iop_Min32Fx4
, Iop_Min64Fx2
};
1020 /* Generate IR to create 'arg rotated right by imm', for sane values
1021 of 'ty' and 'imm'. */
1022 static IRTemp
mathROR ( IRType ty
, IRTemp arg
, UInt imm
)
1025 if (ty
== Ity_I64
) {
1028 vassert(ty
== Ity_I32
);
1036 IRTemp res
= newTemp(ty
);
1037 assign(res
, binop(mkOR(ty
),
1038 binop(mkSHL(ty
), mkexpr(arg
), mkU8(w
- imm
)),
1039 binop(mkSHR(ty
), mkexpr(arg
), mkU8(imm
)) ));
1043 /* Generate IR to set the returned temp to either all-zeroes or
1044 all ones, as a copy of arg<imm>. */
1045 static IRTemp
mathREPLICATE ( IRType ty
, IRTemp arg
, UInt imm
)
1048 if (ty
== Ity_I64
) {
1051 vassert(ty
== Ity_I32
);
1056 IRTemp res
= newTemp(ty
);
1057 assign(res
, binop(mkSAR(ty
),
1058 binop(mkSHL(ty
), mkexpr(arg
), mkU8(w
- 1 - imm
)),
1063 /* S-widen 8/16/32/64 bit int expr to 64. */
1064 static IRExpr
* widenSto64 ( IRType srcTy
, IRExpr
* e
)
1067 case Ity_I64
: return e
;
1068 case Ity_I32
: return unop(Iop_32Sto64
, e
);
1069 case Ity_I16
: return unop(Iop_16Sto64
, e
);
1070 case Ity_I8
: return unop(Iop_8Sto64
, e
);
1071 default: vpanic("widenSto64(arm64)");
1075 /* U-widen 8/16/32/64 bit int expr to 64. */
1076 static IRExpr
* widenUto64 ( IRType srcTy
, IRExpr
* e
)
1079 case Ity_I64
: return e
;
1080 case Ity_I32
: return unop(Iop_32Uto64
, e
);
1081 case Ity_I16
: return unop(Iop_16Uto64
, e
);
1082 case Ity_I8
: return unop(Iop_8Uto64
, e
);
1083 default: vpanic("widenUto64(arm64)");
1087 /* Narrow 64 bit int expr to 8/16/32/64. Clearly only some
1088 of these combinations make sense. */
1089 static IRExpr
* narrowFrom64 ( IRType dstTy
, IRExpr
* e
)
1092 case Ity_I64
: return e
;
1093 case Ity_I32
: return unop(Iop_64to32
, e
);
1094 case Ity_I16
: return unop(Iop_64to16
, e
);
1095 case Ity_I8
: return unop(Iop_64to8
, e
);
1096 default: vpanic("narrowFrom64(arm64)");
1101 /*------------------------------------------------------------*/
1102 /*--- Helpers for accessing guest registers. ---*/
1103 /*------------------------------------------------------------*/
1105 #define OFFB_X0 offsetof(VexGuestARM64State,guest_X0)
1106 #define OFFB_X1 offsetof(VexGuestARM64State,guest_X1)
1107 #define OFFB_X2 offsetof(VexGuestARM64State,guest_X2)
1108 #define OFFB_X3 offsetof(VexGuestARM64State,guest_X3)
1109 #define OFFB_X4 offsetof(VexGuestARM64State,guest_X4)
1110 #define OFFB_X5 offsetof(VexGuestARM64State,guest_X5)
1111 #define OFFB_X6 offsetof(VexGuestARM64State,guest_X6)
1112 #define OFFB_X7 offsetof(VexGuestARM64State,guest_X7)
1113 #define OFFB_X8 offsetof(VexGuestARM64State,guest_X8)
1114 #define OFFB_X9 offsetof(VexGuestARM64State,guest_X9)
1115 #define OFFB_X10 offsetof(VexGuestARM64State,guest_X10)
1116 #define OFFB_X11 offsetof(VexGuestARM64State,guest_X11)
1117 #define OFFB_X12 offsetof(VexGuestARM64State,guest_X12)
1118 #define OFFB_X13 offsetof(VexGuestARM64State,guest_X13)
1119 #define OFFB_X14 offsetof(VexGuestARM64State,guest_X14)
1120 #define OFFB_X15 offsetof(VexGuestARM64State,guest_X15)
1121 #define OFFB_X16 offsetof(VexGuestARM64State,guest_X16)
1122 #define OFFB_X17 offsetof(VexGuestARM64State,guest_X17)
1123 #define OFFB_X18 offsetof(VexGuestARM64State,guest_X18)
1124 #define OFFB_X19 offsetof(VexGuestARM64State,guest_X19)
1125 #define OFFB_X20 offsetof(VexGuestARM64State,guest_X20)
1126 #define OFFB_X21 offsetof(VexGuestARM64State,guest_X21)
1127 #define OFFB_X22 offsetof(VexGuestARM64State,guest_X22)
1128 #define OFFB_X23 offsetof(VexGuestARM64State,guest_X23)
1129 #define OFFB_X24 offsetof(VexGuestARM64State,guest_X24)
1130 #define OFFB_X25 offsetof(VexGuestARM64State,guest_X25)
1131 #define OFFB_X26 offsetof(VexGuestARM64State,guest_X26)
1132 #define OFFB_X27 offsetof(VexGuestARM64State,guest_X27)
1133 #define OFFB_X28 offsetof(VexGuestARM64State,guest_X28)
1134 #define OFFB_X29 offsetof(VexGuestARM64State,guest_X29)
1135 #define OFFB_X30 offsetof(VexGuestARM64State,guest_X30)
1137 #define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP)
1138 #define OFFB_PC offsetof(VexGuestARM64State,guest_PC)
1140 #define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP)
1141 #define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1)
1142 #define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2)
1143 #define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP)
1145 #define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
1146 #define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR)
1148 #define OFFB_Q0 offsetof(VexGuestARM64State,guest_Q0)
1149 #define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1)
1150 #define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2)
1151 #define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3)
1152 #define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4)
1153 #define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5)
1154 #define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6)
1155 #define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7)
1156 #define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8)
1157 #define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9)
1158 #define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10)
1159 #define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11)
1160 #define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12)
1161 #define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13)
1162 #define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14)
1163 #define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15)
1164 #define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16)
1165 #define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17)
1166 #define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18)
1167 #define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19)
1168 #define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20)
1169 #define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21)
1170 #define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22)
1171 #define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23)
1172 #define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24)
1173 #define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25)
1174 #define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26)
1175 #define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27)
1176 #define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28)
1177 #define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29)
1178 #define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30)
1179 #define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31)
1181 #define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR)
1182 #define OFFB_QCFLAG offsetof(VexGuestARM64State,guest_QCFLAG)
1184 #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART)
1185 #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN)
1187 #define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
1188 #define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
1189 #define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64)
1190 #define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64)
1193 /* ---------------- Integer registers ---------------- */
1195 static Int
offsetIReg64 ( UInt iregNo
)
1197 /* Do we care about endianness here? We do if sub-parts of integer
1198 registers are accessed. */
1200 case 0: return OFFB_X0
;
1201 case 1: return OFFB_X1
;
1202 case 2: return OFFB_X2
;
1203 case 3: return OFFB_X3
;
1204 case 4: return OFFB_X4
;
1205 case 5: return OFFB_X5
;
1206 case 6: return OFFB_X6
;
1207 case 7: return OFFB_X7
;
1208 case 8: return OFFB_X8
;
1209 case 9: return OFFB_X9
;
1210 case 10: return OFFB_X10
;
1211 case 11: return OFFB_X11
;
1212 case 12: return OFFB_X12
;
1213 case 13: return OFFB_X13
;
1214 case 14: return OFFB_X14
;
1215 case 15: return OFFB_X15
;
1216 case 16: return OFFB_X16
;
1217 case 17: return OFFB_X17
;
1218 case 18: return OFFB_X18
;
1219 case 19: return OFFB_X19
;
1220 case 20: return OFFB_X20
;
1221 case 21: return OFFB_X21
;
1222 case 22: return OFFB_X22
;
1223 case 23: return OFFB_X23
;
1224 case 24: return OFFB_X24
;
1225 case 25: return OFFB_X25
;
1226 case 26: return OFFB_X26
;
1227 case 27: return OFFB_X27
;
1228 case 28: return OFFB_X28
;
1229 case 29: return OFFB_X29
;
1230 case 30: return OFFB_X30
;
1232 default: vassert(0);
1236 static Int
offsetIReg64orSP ( UInt iregNo
)
1238 return iregNo
== 31 ? OFFB_XSP
: offsetIReg64(iregNo
);
1241 static const HChar
* nameIReg64orZR ( UInt iregNo
)
1243 vassert(iregNo
< 32);
1244 static const HChar
* names
[32]
1245 = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
1246 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
1247 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
1248 "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
1249 return names
[iregNo
];
1252 static const HChar
* nameIReg64orSP ( UInt iregNo
)
1257 vassert(iregNo
< 31);
1258 return nameIReg64orZR(iregNo
);
1261 static IRExpr
* getIReg64orSP ( UInt iregNo
)
1263 vassert(iregNo
< 32);
1264 return IRExpr_Get( offsetIReg64orSP(iregNo
), Ity_I64
);
1267 static IRExpr
* getIReg64orZR ( UInt iregNo
)
1272 vassert(iregNo
< 31);
1273 return IRExpr_Get( offsetIReg64orSP(iregNo
), Ity_I64
);
1276 static void putIReg64orSP ( UInt iregNo
, IRExpr
* e
)
1278 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_I64
);
1279 stmt( IRStmt_Put(offsetIReg64orSP(iregNo
), e
) );
1282 static void putIReg64orZR ( UInt iregNo
, IRExpr
* e
)
1284 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_I64
);
1288 vassert(iregNo
< 31);
1289 stmt( IRStmt_Put(offsetIReg64orSP(iregNo
), e
) );
1292 static const HChar
* nameIReg32orZR ( UInt iregNo
)
1294 vassert(iregNo
< 32);
1295 static const HChar
* names
[32]
1296 = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
1297 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
1298 "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
1299 "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
1300 return names
[iregNo
];
1303 static const HChar
* nameIReg32orSP ( UInt iregNo
)
1308 vassert(iregNo
< 31);
1309 return nameIReg32orZR(iregNo
);
1312 static IRExpr
* getIReg32orSP ( UInt iregNo
)
1314 vassert(iregNo
< 32);
1315 return unop(Iop_64to32
,
1316 IRExpr_Get( offsetIReg64orSP(iregNo
), Ity_I64
));
1319 static IRExpr
* getIReg32orZR ( UInt iregNo
)
1324 vassert(iregNo
< 31);
1325 return unop(Iop_64to32
,
1326 IRExpr_Get( offsetIReg64orSP(iregNo
), Ity_I64
));
1329 static void putIReg32orSP ( UInt iregNo
, IRExpr
* e
)
1331 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_I32
);
1332 stmt( IRStmt_Put(offsetIReg64orSP(iregNo
), unop(Iop_32Uto64
, e
)) );
1335 static void putIReg32orZR ( UInt iregNo
, IRExpr
* e
)
1337 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_I32
);
1341 vassert(iregNo
< 31);
1342 stmt( IRStmt_Put(offsetIReg64orSP(iregNo
), unop(Iop_32Uto64
, e
)) );
1345 static const HChar
* nameIRegOrSP ( Bool is64
, UInt iregNo
)
1347 vassert(is64
== True
|| is64
== False
);
1348 return is64
? nameIReg64orSP(iregNo
) : nameIReg32orSP(iregNo
);
1351 static const HChar
* nameIRegOrZR ( Bool is64
, UInt iregNo
)
1353 vassert(is64
== True
|| is64
== False
);
1354 return is64
? nameIReg64orZR(iregNo
) : nameIReg32orZR(iregNo
);
1357 static IRExpr
* getIRegOrZR ( Bool is64
, UInt iregNo
)
1359 vassert(is64
== True
|| is64
== False
);
1360 return is64
? getIReg64orZR(iregNo
) : getIReg32orZR(iregNo
);
1363 static void putIRegOrZR ( Bool is64
, UInt iregNo
, IRExpr
* e
)
1365 vassert(is64
== True
|| is64
== False
);
1366 if (is64
) putIReg64orZR(iregNo
, e
); else putIReg32orZR(iregNo
, e
);
1369 static void putPC ( IRExpr
* e
)
1371 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_I64
);
1372 stmt( IRStmt_Put(OFFB_PC
, e
) );
1376 /* ---------------- Vector (Q) registers ---------------- */
1378 static Int
offsetQReg128 ( UInt qregNo
)
1380 /* We don't care about endianness at this point. It only becomes
1381 relevant when dealing with sections of these registers.*/
1383 case 0: return OFFB_Q0
;
1384 case 1: return OFFB_Q1
;
1385 case 2: return OFFB_Q2
;
1386 case 3: return OFFB_Q3
;
1387 case 4: return OFFB_Q4
;
1388 case 5: return OFFB_Q5
;
1389 case 6: return OFFB_Q6
;
1390 case 7: return OFFB_Q7
;
1391 case 8: return OFFB_Q8
;
1392 case 9: return OFFB_Q9
;
1393 case 10: return OFFB_Q10
;
1394 case 11: return OFFB_Q11
;
1395 case 12: return OFFB_Q12
;
1396 case 13: return OFFB_Q13
;
1397 case 14: return OFFB_Q14
;
1398 case 15: return OFFB_Q15
;
1399 case 16: return OFFB_Q16
;
1400 case 17: return OFFB_Q17
;
1401 case 18: return OFFB_Q18
;
1402 case 19: return OFFB_Q19
;
1403 case 20: return OFFB_Q20
;
1404 case 21: return OFFB_Q21
;
1405 case 22: return OFFB_Q22
;
1406 case 23: return OFFB_Q23
;
1407 case 24: return OFFB_Q24
;
1408 case 25: return OFFB_Q25
;
1409 case 26: return OFFB_Q26
;
1410 case 27: return OFFB_Q27
;
1411 case 28: return OFFB_Q28
;
1412 case 29: return OFFB_Q29
;
1413 case 30: return OFFB_Q30
;
1414 case 31: return OFFB_Q31
;
1415 default: vassert(0);
1419 /* Write to a complete Qreg. */
1420 static void putQReg128 ( UInt qregNo
, IRExpr
* e
)
1422 vassert(qregNo
< 32);
1423 vassert(typeOfIRExpr(irsb
->tyenv
, e
) == Ity_V128
);
1424 stmt( IRStmt_Put(offsetQReg128(qregNo
), e
) );
1427 /* Read a complete Qreg. */
1428 static IRExpr
* getQReg128 ( UInt qregNo
)
1430 vassert(qregNo
< 32);
1431 return IRExpr_Get(offsetQReg128(qregNo
), Ity_V128
);
1434 /* Produce the IR type for some sub-part of a vector. For 32- and 64-
1435 bit sub-parts we can choose either integer or float types, and
1436 choose float on the basis that that is the common use case and so
1437 will give least interference with Put-to-Get forwarding later
1439 static IRType
preferredVectorSubTypeFromSize ( UInt szB
)
1442 case 1: return Ity_I8
;
1443 case 2: return Ity_I16
;
1444 case 4: return Ity_I32
; //Ity_F32;
1445 case 8: return Ity_F64
;
1446 case 16: return Ity_V128
;
1447 default: vassert(0);
1451 /* Find the offset of the laneNo'th lane of type laneTy in the given
1452 Qreg. Since the host is little-endian, the least significant lane
1453 has the lowest offset. */
1454 static Int
offsetQRegLane ( UInt qregNo
, IRType laneTy
, UInt laneNo
)
1456 vassert(host_endness
== VexEndnessLE
);
1457 Int base
= offsetQReg128(qregNo
);
1458 /* Since the host is little-endian, the least significant lane
1459 will be at the lowest address. */
1460 /* Restrict this to known types, so as to avoid silently accepting
1464 case Ity_I8
: laneSzB
= 1; break;
1465 case Ity_F16
: case Ity_I16
: laneSzB
= 2; break;
1466 case Ity_F32
: case Ity_I32
: laneSzB
= 4; break;
1467 case Ity_F64
: case Ity_I64
: laneSzB
= 8; break;
1468 case Ity_V128
: laneSzB
= 16; break;
1471 vassert(laneSzB
> 0);
1472 UInt minOff
= laneNo
* laneSzB
;
1473 UInt maxOff
= minOff
+ laneSzB
- 1;
1474 vassert(maxOff
< 16);
1475 return base
+ minOff
;
1478 /* Put to the least significant lane of a Qreg. */
1479 static void putQRegLO ( UInt qregNo
, IRExpr
* e
)
1481 IRType ty
= typeOfIRExpr(irsb
->tyenv
, e
);
1482 Int off
= offsetQRegLane(qregNo
, ty
, 0);
1484 case Ity_I8
: case Ity_I16
: case Ity_I32
: case Ity_I64
:
1485 case Ity_F16
: case Ity_F32
: case Ity_F64
: case Ity_V128
:
1488 vassert(0); // Other cases are probably invalid
1490 stmt(IRStmt_Put(off
, e
));
1493 /* Get from the least significant lane of a Qreg. */
1494 static IRExpr
* getQRegLO ( UInt qregNo
, IRType ty
)
1496 Int off
= offsetQRegLane(qregNo
, ty
, 0);
1499 case Ity_F16
: case Ity_I16
:
1500 case Ity_I32
: case Ity_I64
:
1501 case Ity_F32
: case Ity_F64
: case Ity_V128
:
1504 vassert(0); // Other cases are ATC
1506 return IRExpr_Get(off
, ty
);
1509 static const HChar
* nameQRegLO ( UInt qregNo
, IRType laneTy
)
1511 static const HChar
* namesQ
[32]
1512 = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1513 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
1514 "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
1515 "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
1516 static const HChar
* namesD
[32]
1517 = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1518 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
1519 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1520 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
1521 static const HChar
* namesS
[32]
1522 = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
1523 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
1524 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
1525 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
1526 static const HChar
* namesH
[32]
1527 = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7",
1528 "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15",
1529 "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
1530 "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
1531 static const HChar
* namesB
[32]
1532 = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
1533 "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15",
1534 "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
1535 "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
1536 vassert(qregNo
< 32);
1537 switch (sizeofIRType(laneTy
)) {
1538 case 1: return namesB
[qregNo
];
1539 case 2: return namesH
[qregNo
];
1540 case 4: return namesS
[qregNo
];
1541 case 8: return namesD
[qregNo
];
1542 case 16: return namesQ
[qregNo
];
1543 default: vassert(0);
1548 static const HChar
* nameQReg128 ( UInt qregNo
)
1550 return nameQRegLO(qregNo
, Ity_V128
);
1553 /* Find the offset of the most significant half (8 bytes) of the given
1554 Qreg. This requires knowing the endianness of the host. */
1555 static Int
offsetQRegHI64 ( UInt qregNo
)
1557 return offsetQRegLane(qregNo
, Ity_I64
, 1);
1560 static IRExpr
* getQRegHI64 ( UInt qregNo
)
1562 return IRExpr_Get(offsetQRegHI64(qregNo
), Ity_I64
);
1565 static void putQRegHI64 ( UInt qregNo
, IRExpr
* e
)
1567 IRType ty
= typeOfIRExpr(irsb
->tyenv
, e
);
1568 Int off
= offsetQRegHI64(qregNo
);
1570 case Ity_I64
: case Ity_F64
:
1573 vassert(0); // Other cases are plain wrong
1575 stmt(IRStmt_Put(off
, e
));
1578 /* Put to a specified lane of a Qreg. */
1579 static void putQRegLane ( UInt qregNo
, UInt laneNo
, IRExpr
* e
)
1581 IRType laneTy
= typeOfIRExpr(irsb
->tyenv
, e
);
1582 Int off
= offsetQRegLane(qregNo
, laneTy
, laneNo
);
1584 case Ity_F64
: case Ity_I64
:
1585 case Ity_I32
: case Ity_F32
:
1586 case Ity_I16
: case Ity_F16
:
1590 vassert(0); // Other cases are ATC
1592 stmt(IRStmt_Put(off
, e
));
1595 /* Get from a specified lane of a Qreg. */
1596 static IRExpr
* getQRegLane ( UInt qregNo
, UInt laneNo
, IRType laneTy
)
1598 Int off
= offsetQRegLane(qregNo
, laneTy
, laneNo
);
1600 case Ity_I64
: case Ity_I32
: case Ity_I16
: case Ity_I8
:
1601 case Ity_F64
: case Ity_F32
: case Ity_F16
:
1604 vassert(0); // Other cases are ATC
1606 return IRExpr_Get(off
, laneTy
);
1610 //ZZ /* ---------------- Misc registers ---------------- */
1612 //ZZ static void putMiscReg32 ( UInt gsoffset,
1613 //ZZ IRExpr* e, /* :: Ity_I32 */
1614 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
1616 //ZZ switch (gsoffset) {
1617 //ZZ case OFFB_FPSCR: break;
1618 //ZZ case OFFB_QFLAG32: break;
1619 //ZZ case OFFB_GEFLAG0: break;
1620 //ZZ case OFFB_GEFLAG1: break;
1621 //ZZ case OFFB_GEFLAG2: break;
1622 //ZZ case OFFB_GEFLAG3: break;
1623 //ZZ default: vassert(0); /* awaiting more cases */
1625 //ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1627 //ZZ if (guardT == IRTemp_INVALID) {
1628 //ZZ /* unconditional write */
1629 //ZZ stmt(IRStmt_Put(gsoffset, e));
1631 //ZZ stmt(IRStmt_Put(
1633 //ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
1634 //ZZ e, IRExpr_Get(gsoffset, Ity_I32) )
1639 //ZZ static IRTemp get_ITSTATE ( void )
1641 //ZZ ASSERT_IS_THUMB;
1642 //ZZ IRTemp t = newTemp(Ity_I32);
1643 //ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
1647 //ZZ static void put_ITSTATE ( IRTemp t )
1649 //ZZ ASSERT_IS_THUMB;
1650 //ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
1653 //ZZ static IRTemp get_QFLAG32 ( void )
1655 //ZZ IRTemp t = newTemp(Ity_I32);
1656 //ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
1660 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
1662 //ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
1665 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
1666 //ZZ Status Register) to indicate that overflow or saturation occurred.
1667 //ZZ Nb: t must be zero to denote no saturation, and any nonzero
1668 //ZZ value to indicate saturation. */
1669 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
1671 //ZZ IRTemp old = get_QFLAG32();
1672 //ZZ IRTemp nyu = newTemp(Ity_I32);
1673 //ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
1674 //ZZ put_QFLAG32(nyu, condT);
1678 /* ---------------- FPCR stuff ---------------- */
1680 /* Generate IR to get hold of the rounding mode bits in FPCR, and
1681 convert them to IR format. Bind the final result to the
1683 static IRTemp
/* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
1685 /* The ARMvfp encoding for rounding mode bits is:
1690 We need to convert that to the IR encoding:
1691 00 to nearest (the default)
1695 Which can be done by swapping bits 0 and 1.
1696 The rmode bits are at 23:22 in FPSCR.
1698 IRTemp armEncd
= newTemp(Ity_I32
);
1699 IRTemp swapped
= newTemp(Ity_I32
);
1700 /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that
1701 we don't zero out bits 24 and above, since the assignment to
1702 'swapped' will mask them out anyway. */
1704 binop(Iop_Shr32
, IRExpr_Get(OFFB_FPCR
, Ity_I32
), mkU8(22)));
1705 /* Now swap them. */
1709 binop(Iop_Shl32
, mkexpr(armEncd
), mkU8(1)),
1712 binop(Iop_Shr32
, mkexpr(armEncd
), mkU8(1)),
1719 /*------------------------------------------------------------*/
1720 /*--- Helpers for flag handling and conditional insns ---*/
1721 /*------------------------------------------------------------*/
1723 static const HChar
* nameARM64Condcode ( ARM64Condcode cond
)
1726 case ARM64CondEQ
: return "eq";
1727 case ARM64CondNE
: return "ne";
1728 case ARM64CondCS
: return "cs"; // or 'hs'
1729 case ARM64CondCC
: return "cc"; // or 'lo'
1730 case ARM64CondMI
: return "mi";
1731 case ARM64CondPL
: return "pl";
1732 case ARM64CondVS
: return "vs";
1733 case ARM64CondVC
: return "vc";
1734 case ARM64CondHI
: return "hi";
1735 case ARM64CondLS
: return "ls";
1736 case ARM64CondGE
: return "ge";
1737 case ARM64CondLT
: return "lt";
1738 case ARM64CondGT
: return "gt";
1739 case ARM64CondLE
: return "le";
1740 case ARM64CondAL
: return "al";
1741 case ARM64CondNV
: return "nv";
1742 default: vpanic("name_ARM64Condcode");
1746 /* and a handy shorthand for it */
1747 static const HChar
* nameCC ( ARM64Condcode cond
) {
1748 return nameARM64Condcode(cond
);
1752 /* Build IR to calculate some particular condition from stored
1753 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1754 Ity_I64, suitable for narrowing. Although the return type is
1755 Ity_I64, the returned value is either 0 or 1. 'cond' must be
1756 :: Ity_I64 and must denote the condition to compute in
1757 bits 7:4, and be zero everywhere else.
1759 static IRExpr
* mk_arm64g_calculate_condition_dyn ( IRExpr
* cond
)
1761 vassert(typeOfIRExpr(irsb
->tyenv
, cond
) == Ity_I64
);
1762 /* And 'cond' had better produce a value in which only bits 7:4 are
1763 nonzero. However, obviously we can't assert for that. */
1765 /* So what we're constructing for the first argument is
1766 "(cond << 4) | stored-operation".
1767 However, as per comments above, 'cond' must be supplied
1768 pre-shifted to this function.
1770 This pairing scheme requires that the ARM64_CC_OP_ values all fit
1771 in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
1772 8 bits of the first argument. */
1775 binop(Iop_Or64
, IRExpr_Get(OFFB_CC_OP
, Ity_I64
), cond
),
1776 IRExpr_Get(OFFB_CC_DEP1
, Ity_I64
),
1777 IRExpr_Get(OFFB_CC_DEP2
, Ity_I64
),
1778 IRExpr_Get(OFFB_CC_NDEP
, Ity_I64
)
1784 "arm64g_calculate_condition", &arm64g_calculate_condition
,
1788 /* Exclude the requested condition, OP and NDEP from definedness
1789 checking. We're only interested in DEP1 and DEP2. */
1790 call
->Iex
.CCall
.cee
->mcx_mask
= (1<<0) | (1<<3);
1795 /* Build IR to calculate some particular condition from stored
1796 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1797 Ity_I64, suitable for narrowing. Although the return type is
1798 Ity_I64, the returned value is either 0 or 1.
1800 static IRExpr
* mk_arm64g_calculate_condition ( ARM64Condcode cond
)
1802 /* First arg is "(cond << 4) | condition". This requires that the
1803 ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a
1804 (COND, OP) pair in the lowest 8 bits of the first argument. */
1805 vassert(cond
>= 0 && cond
<= 15);
1806 return mk_arm64g_calculate_condition_dyn( mkU64(cond
<< 4) );
1810 /* Build IR to calculate just the carry flag from stored
1811 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1813 static IRExpr
* mk_arm64g_calculate_flag_c ( void )
1816 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP
, Ity_I64
),
1817 IRExpr_Get(OFFB_CC_DEP1
, Ity_I64
),
1818 IRExpr_Get(OFFB_CC_DEP2
, Ity_I64
),
1819 IRExpr_Get(OFFB_CC_NDEP
, Ity_I64
) );
1824 "arm64g_calculate_flag_c", &arm64g_calculate_flag_c
,
1827 /* Exclude OP and NDEP from definedness checking. We're only
1828 interested in DEP1 and DEP2. */
1829 call
->Iex
.CCall
.cee
->mcx_mask
= (1<<0) | (1<<3);
1834 //ZZ /* Build IR to calculate just the overflow flag from stored
1835 //ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1837 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1840 //ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
1841 //ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1842 //ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1843 //ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1845 //ZZ = mkIRExprCCall(
1848 //ZZ "armg_calculate_flag_v", &armg_calculate_flag_v,
1851 //ZZ /* Exclude OP and NDEP from definedness checking. We're only
1852 //ZZ interested in DEP1 and DEP2. */
1853 //ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1858 /* Build IR to calculate N Z C V in bits 31:28 of the
1860 static IRExpr
* mk_arm64g_calculate_flags_nzcv ( void )
1863 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP
, Ity_I64
),
1864 IRExpr_Get(OFFB_CC_DEP1
, Ity_I64
),
1865 IRExpr_Get(OFFB_CC_DEP2
, Ity_I64
),
1866 IRExpr_Get(OFFB_CC_NDEP
, Ity_I64
) );
1871 "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv
,
1874 /* Exclude OP and NDEP from definedness checking. We're only
1875 interested in DEP1 and DEP2. */
1876 call
->Iex
.CCall
.cee
->mcx_mask
= (1<<0) | (1<<3);
1881 /* Build IR to set the flags thunk, in the most general case. */
1883 void setFlags_D1_D2_ND ( UInt cc_op
,
1884 IRTemp t_dep1
, IRTemp t_dep2
, IRTemp t_ndep
)
1886 vassert(typeOfIRTemp(irsb
->tyenv
, t_dep1
== Ity_I64
));
1887 vassert(typeOfIRTemp(irsb
->tyenv
, t_dep2
== Ity_I64
));
1888 vassert(typeOfIRTemp(irsb
->tyenv
, t_ndep
== Ity_I64
));
1889 vassert(cc_op
>= ARM64G_CC_OP_COPY
&& cc_op
< ARM64G_CC_OP_NUMBER
);
1890 stmt( IRStmt_Put( OFFB_CC_OP
, mkU64(cc_op
) ));
1891 stmt( IRStmt_Put( OFFB_CC_DEP1
, mkexpr(t_dep1
) ));
1892 stmt( IRStmt_Put( OFFB_CC_DEP2
, mkexpr(t_dep2
) ));
1893 stmt( IRStmt_Put( OFFB_CC_NDEP
, mkexpr(t_ndep
) ));
1896 /* Build IR to set the flags thunk after ADD or SUB. */
1898 void setFlags_ADD_SUB ( Bool is64
, Bool isSUB
, IRTemp argL
, IRTemp argR
)
1900 IRTemp argL64
= IRTemp_INVALID
;
1901 IRTemp argR64
= IRTemp_INVALID
;
1902 IRTemp z64
= newTemp(Ity_I64
);
1907 argL64
= newTemp(Ity_I64
);
1908 argR64
= newTemp(Ity_I64
);
1909 assign(argL64
, unop(Iop_32Uto64
, mkexpr(argL
)));
1910 assign(argR64
, unop(Iop_32Uto64
, mkexpr(argR
)));
1912 assign(z64
, mkU64(0));
1913 UInt cc_op
= ARM64G_CC_OP_NUMBER
;
1914 /**/ if ( isSUB
&& is64
) { cc_op
= ARM64G_CC_OP_SUB64
; }
1915 else if ( isSUB
&& !is64
) { cc_op
= ARM64G_CC_OP_SUB32
; }
1916 else if (!isSUB
&& is64
) { cc_op
= ARM64G_CC_OP_ADD64
; }
1917 else if (!isSUB
&& !is64
) { cc_op
= ARM64G_CC_OP_ADD32
; }
1918 else { vassert(0); }
1919 setFlags_D1_D2_ND(cc_op
, argL64
, argR64
, z64
);
1922 /* Build IR to set the flags thunk after ADC or SBC. */
1924 void setFlags_ADC_SBC ( Bool is64
, Bool isSBC
,
1925 IRTemp argL
, IRTemp argR
, IRTemp oldC
)
1927 IRTemp argL64
= IRTemp_INVALID
;
1928 IRTemp argR64
= IRTemp_INVALID
;
1929 IRTemp oldC64
= IRTemp_INVALID
;
1935 argL64
= newTemp(Ity_I64
);
1936 argR64
= newTemp(Ity_I64
);
1937 oldC64
= newTemp(Ity_I64
);
1938 assign(argL64
, unop(Iop_32Uto64
, mkexpr(argL
)));
1939 assign(argR64
, unop(Iop_32Uto64
, mkexpr(argR
)));
1940 assign(oldC64
, unop(Iop_32Uto64
, mkexpr(oldC
)));
1942 UInt cc_op
= ARM64G_CC_OP_NUMBER
;
1943 /**/ if ( isSBC
&& is64
) { cc_op
= ARM64G_CC_OP_SBC64
; }
1944 else if ( isSBC
&& !is64
) { cc_op
= ARM64G_CC_OP_SBC32
; }
1945 else if (!isSBC
&& is64
) { cc_op
= ARM64G_CC_OP_ADC64
; }
1946 else if (!isSBC
&& !is64
) { cc_op
= ARM64G_CC_OP_ADC32
; }
1947 else { vassert(0); }
1948 setFlags_D1_D2_ND(cc_op
, argL64
, argR64
, oldC64
);
1951 /* Build IR to set the flags thunk after ADD or SUB, if the given
1952 condition evaluates to True at run time. If not, the flags are set
1953 to the specified NZCV value. */
1955 void setFlags_ADD_SUB_conditionally (
1956 Bool is64
, Bool isSUB
,
1957 IRTemp cond
, IRTemp argL
, IRTemp argR
, UInt nzcv
1960 /* Generate IR as follows:
1961 CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1962 CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1963 CC_DEP2 = ITE(cond, argR64, 0)
1967 IRTemp z64
= newTemp(Ity_I64
);
1968 assign(z64
, mkU64(0));
1970 /* Establish the operation and operands for the True case. */
1971 IRTemp t_dep1
= IRTemp_INVALID
;
1972 IRTemp t_dep2
= IRTemp_INVALID
;
1973 UInt t_op
= ARM64G_CC_OP_NUMBER
;
1974 /**/ if ( isSUB
&& is64
) { t_op
= ARM64G_CC_OP_SUB64
; }
1975 else if ( isSUB
&& !is64
) { t_op
= ARM64G_CC_OP_SUB32
; }
1976 else if (!isSUB
&& is64
) { t_op
= ARM64G_CC_OP_ADD64
; }
1977 else if (!isSUB
&& !is64
) { t_op
= ARM64G_CC_OP_ADD32
; }
1978 else { vassert(0); }
1984 t_dep1
= newTemp(Ity_I64
);
1985 t_dep2
= newTemp(Ity_I64
);
1986 assign(t_dep1
, unop(Iop_32Uto64
, mkexpr(argL
)));
1987 assign(t_dep2
, unop(Iop_32Uto64
, mkexpr(argR
)));
1990 /* Establish the operation and operands for the False case. */
1991 IRTemp f_dep1
= newTemp(Ity_I64
);
1992 IRTemp f_dep2
= z64
;
1993 UInt f_op
= ARM64G_CC_OP_COPY
;
1994 assign(f_dep1
, mkU64(nzcv
<< 28));
1996 /* Final thunk values */
1997 IRTemp dep1
= newTemp(Ity_I64
);
1998 IRTemp dep2
= newTemp(Ity_I64
);
1999 IRTemp op
= newTemp(Ity_I64
);
2001 assign(op
, IRExpr_ITE(mkexpr(cond
), mkU64(t_op
), mkU64(f_op
)));
2002 assign(dep1
, IRExpr_ITE(mkexpr(cond
), mkexpr(t_dep1
), mkexpr(f_dep1
)));
2003 assign(dep2
, IRExpr_ITE(mkexpr(cond
), mkexpr(t_dep2
), mkexpr(f_dep2
)));
2006 stmt( IRStmt_Put( OFFB_CC_OP
, mkexpr(op
) ));
2007 stmt( IRStmt_Put( OFFB_CC_DEP1
, mkexpr(dep1
) ));
2008 stmt( IRStmt_Put( OFFB_CC_DEP2
, mkexpr(dep2
) ));
2009 stmt( IRStmt_Put( OFFB_CC_NDEP
, mkexpr(z64
) ));
2012 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
2014 void setFlags_LOGIC ( Bool is64
, IRTemp res
)
2016 IRTemp res64
= IRTemp_INVALID
;
2017 IRTemp z64
= newTemp(Ity_I64
);
2018 UInt cc_op
= ARM64G_CC_OP_NUMBER
;
2021 cc_op
= ARM64G_CC_OP_LOGIC64
;
2023 res64
= newTemp(Ity_I64
);
2024 assign(res64
, unop(Iop_32Uto64
, mkexpr(res
)));
2025 cc_op
= ARM64G_CC_OP_LOGIC32
;
2027 assign(z64
, mkU64(0));
2028 setFlags_D1_D2_ND(cc_op
, res64
, z64
, z64
);
2031 /* Build IR to set the flags thunk to a given NZCV value. NZCV is
2032 located in bits 31:28 of the supplied value. */
2034 void setFlags_COPY ( IRTemp nzcv_28x0
)
2036 IRTemp z64
= newTemp(Ity_I64
);
2037 assign(z64
, mkU64(0));
2038 setFlags_D1_D2_ND(ARM64G_CC_OP_COPY
, nzcv_28x0
, z64
, z64
);
2042 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
2043 //ZZ sets it at all) */
2044 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2046 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2048 //ZZ IRTemp z32 = newTemp(Ity_I32);
2049 //ZZ assign( z32, mkU32(0) );
2050 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2054 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2055 //ZZ sets it at all) */
2056 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2058 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2060 //ZZ IRTemp z32 = newTemp(Ity_I32);
2061 //ZZ assign( z32, mkU32(0) );
2062 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2066 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2067 //ZZ sets them at all) */
2068 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2069 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2071 //ZZ IRTemp z32 = newTemp(Ity_I32);
2072 //ZZ assign( z32, mkU32(0) );
2073 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2077 /*------------------------------------------------------------*/
2078 /*--- Misc math helpers ---*/
2079 /*------------------------------------------------------------*/
2081 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
2082 static IRTemp
math_SWAPHELPER ( IRTemp x
, ULong mask
, Int sh
)
2084 IRTemp maskT
= newTemp(Ity_I64
);
2085 IRTemp res
= newTemp(Ity_I64
);
2086 vassert(sh
>= 1 && sh
<= 63);
2087 assign(maskT
, mkU64(mask
));
2091 binop(Iop_And64
,mkexpr(x
),mkexpr(maskT
)),
2094 binop(Iop_Shl64
,mkexpr(x
),mkU8(sh
)),
2101 /* Generates byte swaps within 32-bit lanes. */
2102 static IRTemp
math_UINTSWAP64 ( IRTemp src
)
2105 res
= math_SWAPHELPER(src
, 0xFF00FF00FF00FF00ULL
, 8);
2106 res
= math_SWAPHELPER(res
, 0xFFFF0000FFFF0000ULL
, 16);
2110 /* Generates byte swaps within 16-bit lanes. */
2111 static IRTemp
math_USHORTSWAP64 ( IRTemp src
)
2114 res
= math_SWAPHELPER(src
, 0xFF00FF00FF00FF00ULL
, 8);
/* Generates a 64-bit byte swap. */
static IRTemp math_BYTESWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
   return res;
}
/* Generates a 64-bit bit swap. */
static IRTemp math_BITSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
   res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
   res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
   return math_BYTESWAP64(res);
}
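/* The swap helpers above are all chains of math_SWAPHELPER with
   different masks and shifts.  The following disabled sketch restates
   the helper on a plain ULong and checks a couple of the chains
   against expected values.  It is illustrative only and is not used
   by the decoder. */
#if 0
static ULong swapHelperScalar ( ULong x, ULong mask, Int sh )
{
   /* Scalar model of ((x & mask) >>u sh) | ((x << sh) & mask). */
   return ((x & mask) >> sh) | ((x << sh) & mask);
}

static void swapHelperScalar_examples ( void )
{
   ULong x = 0x0123456789ABCDEFULL;
   /* One step: byte swap within each 16-bit lane (the USHORTSWAP chain). */
   ULong r = swapHelperScalar(x, 0xFF00FF00FF00FF00ULL, 8);
   vassert(r == 0x23016745AB89EFCDULL);
   /* Two more steps give the full 64-bit byte swap (the BYTESWAP chain). */
   r = swapHelperScalar(r, 0xFFFF0000FFFF0000ULL, 16);
   r = swapHelperScalar(r, 0xFFFFFFFF00000000ULL, 32);
   vassert(r == 0xEFCDAB8967452301ULL);
   /* The BITSWAP chain reverses bits: bit 0 ends up at bit 63. */
   ULong b = swapHelperScalar(1, 0xAAAAAAAAAAAAAAAAULL, 1);
   b = swapHelperScalar(b, 0xCCCCCCCCCCCCCCCCULL, 2);
   b = swapHelperScalar(b, 0xF0F0F0F0F0F0F0F0ULL, 4);
   b = swapHelperScalar(b, 0xFF00FF00FF00FF00ULL, 8);
   b = swapHelperScalar(b, 0xFFFF0000FFFF0000ULL, 16);
   b = swapHelperScalar(b, 0xFFFFFFFF00000000ULL, 32);
   vassert(b == 0x8000000000000000ULL);
}
#endif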
/* Duplicates the bits at the bottom of the given word to fill the
   whole word.  src :: Ity_I64 is assumed to have zeroes everywhere
   except for the bottom bits. */
static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
{
   if (srcTy == Ity_I8) {
      IRTemp t16 = newTemp(Ity_I64);
      assign(t16, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(8))));
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(t16),
                                  binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I16) {
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I32) {
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I64) {
      return src;
   }
   vassert(0);
}
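/* math_DUP_TO_64 uses a shift-and-OR doubling trick: each step doubles
   the width of the replicated pattern.  A disabled scalar sketch of the
   8-bit case, for illustration only: */
#if 0
static ULong dupTo64Scalar8 ( ULong src /* zero except bits 7:0 */ )
{
   ULong t16 = src | (src << 8);    /* AB       -> ABAB             */
   ULong t32 = t16 | (t16 << 16);   /* ABAB     -> ABABABAB         */
   ULong t64 = t32 | (t32 << 32);   /* ABABABAB -> ABABABABABABABAB */
   return t64;
}
/* Example: dupTo64Scalar8(0xCD) == 0xCDCDCDCDCDCDCDCDULL. */
#endif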
/* Duplicates the src element exactly so as to fill a V128 value. */
static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
{
   IRTemp res = newTempV128();
   if (srcTy == Ity_F64) {
      IRTemp i64 = newTemp(Ity_I64);
      assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
      return res;
   }
   if (srcTy == Ity_F32) {
      IRTemp i64a = newTemp(Ity_I64);
      assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
      IRTemp i64b = newTemp(Ity_I64);
      assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
                                   mkexpr(i64a)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
      return res;
   }
   if (srcTy == Ity_I64) {
      assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
      return res;
   }
   if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
      IRTemp t1 = newTemp(Ity_I64);
      assign(t1, widenUto64(srcTy, mkexpr(src)));
      IRTemp t2 = math_DUP_TO_64(t1, srcTy);
      assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
      return res;
   }
   vassert(0);
}
/* |fullWidth| is a full V128 width result.  Depending on bitQ,
   zero out the upper half. */
static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
{
   if (bitQ == 1) return mkexpr(fullWidth);
   if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
   vassert(0);
}

/* The same, but from an expression instead. */
static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
{
   IRTemp fullWidthT = newTempV128();
   assign(fullWidthT, fullWidth);
   return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
}
/*------------------------------------------------------------*/
/*--- FP comparison helpers                                 ---*/
/*------------------------------------------------------------*/

/* irRes :: Ity_I32 holds a floating point comparison result encoded
   as an IRCmpF64Result.  Generate code to convert it to an
   ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
   Assign a new temp to hold that value, and return the temp. */
static
IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
{
   IRTemp ix    = newTemp(Ity_I64);
   IRTemp termL = newTemp(Ity_I64);
   IRTemp termR = newTemp(Ity_I64);
   IRTemp nzcv  = newTemp(Ity_I64);
   IRTemp irRes = newTemp(Ity_I64);

   /* This is where the fun starts.  We have to convert 'irRes' from
      an IR-convention return result (IRCmpF64Result) to an
      ARM-encoded (N,Z,C,V) group.  The final result is in the bottom
      4 bits of 'nzcv'. */
   /* Map compare result from IR to ARM(nzcv) */
   /*
      FP cmp result | IR   | ARM(nzcv)
      --------------------------------
      UN              0x45   0011
      LT              0x01   1000
      GT              0x00   0010
      EQ              0x40   0110
   */
   /* Now since you're probably wondering WTF ..

      ix fishes the useful bits out of the IR value, bits 6 and 0, and
      places them side by side, giving a number which is 0, 1, 2 or 3.

      termL is a sequence cooked up by GNU superopt.  It converts ix
      into an almost correct NZCV value (incredibly), except for the
      case of UN, where it produces 0100 instead of the required 0011.

      termR is therefore a correction term, also computed from ix.  It
      is 1 in the UN case and 0 for LT, GT and EQ.  Hence, to get the
      final correct value, we subtract termR from termL.

      Don't take my word for it.  There's a test program at the bottom
      of guest_arm_toIR.c, to try this out with.
   */
   assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));

   assign(
      ix,
      binop(Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
                  mkU64(3)),
            binop(Iop_And64, mkexpr(irRes), mkU64(1))));

   assign(
      termL,
      binop(Iop_Add64,
            binop(Iop_Shr64,
                  binop(Iop_Sub64,
                        binop(Iop_Shl64,
                              binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
                              mkU8(62)),
                        mkU64(1)),
                  mkU8(61)),
            mkU64(1)));

   assign(
      termR,
      binop(Iop_And64,
            binop(Iop_And64,
                  mkexpr(ix),
                  binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
            mkU64(1)));

   assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
   return nzcv;
}
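/* The bit-twiddling above is hard to read, so here is a disabled
   scalar model of the same ix/termL/termR transformation, checked
   against the mapping table in the comment.  Illustrative only; the
   IRCmpF64Result constants (0x45/0x01/0x00/0x40) are the ones quoted
   in that table. */
#if 0
static ULong cmpF64ResultToNZCV_scalar ( ULong irRes )
{
   /* ix places irRes bit 6 and bit 0 side by side: 0=GT, 1=LT, 2=EQ, 3=UN. */
   ULong ix    = ((irRes >> 5) & 3) | (irRes & 1);
   ULong termL = ((((ix ^ 1) << 62) - 1) >> 61) + 1;
   ULong termR = ix & (ix >> 1) & 1;
   return termL - termR;
}

static void cmpF64ResultToNZCV_check ( void )
{
   vassert(cmpF64ResultToNZCV_scalar(0x00) == 0x2); /* GT -> 0010 */
   vassert(cmpF64ResultToNZCV_scalar(0x01) == 0x8); /* LT -> 1000 */
   vassert(cmpF64ResultToNZCV_scalar(0x40) == 0x6); /* EQ -> 0110 */
   vassert(cmpF64ResultToNZCV_scalar(0x45) == 0x3); /* UN -> 0011 */
}
#endif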
/*------------------------------------------------------------*/
/*--- Data processing (immediate)                           ---*/
/*------------------------------------------------------------*/

/* Helper functions for supporting "DecodeBitMasks" */

static ULong dbm_ROR ( Int width, ULong x, Int rot )
{
   vassert(width > 0 && width <= 64);
   vassert(rot >= 0 && rot < width);
   if (rot == 0) return x;
   ULong res = x >> rot;
   res |= (x << (width - rot));
   if (width < 64)
      res &= ((1ULL << width) - 1);
   return res;
}
static ULong dbm_RepTo64( Int esize, ULong x )
{
   switch (esize) {
      case 64:
         return x;
      case 32:
         x &= 0xFFFFFFFF; x |= (x << 32);
         return x;
      case 16:
         x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
         return x;
      case 8:
         x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
         return x;
      case 4:
         x &= 0xF; x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      case 2:
         x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      default:
         break;
   }
   vpanic("dbm_RepTo64");
   /*NOTREACHED*/
   return 0;
}
static Int dbm_highestSetBit ( ULong x )
{
   Int i;
   for (i = 63; i >= 0; i--) {
      if (x & (1ULL << i))
         return i;
   }
   vassert(x == 0);
   return -1;
}
static
Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
                          ULong immN, ULong imms, ULong immr, Bool immediate,
                          UInt M /*32 or 64*/)
{
   vassert(immN < (1ULL << 1));
   vassert(imms < (1ULL << 6));
   vassert(immr < (1ULL << 6));
   vassert(immediate == False || immediate == True);
   vassert(M == 32 || M == 64);

   Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
   if (len < 1) { /* printf("fail1\n"); */ return False; }
   vassert(len <= 6);
   vassert(M >= (1 << len));

   vassert(len >= 1 && len <= 6);
   ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
                  (1 << len) - 1;
   vassert(levels >= 1 && levels <= 63);

   if (immediate && ((imms & levels) == levels)) {
      /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
      return False;
   }

   ULong S = imms & levels;
   ULong R = immr & levels;
   Int   diff = S - R;

   Int esize = 1 << len;
   vassert(2 <= esize && esize <= 64);

   /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
      same below with d.  S can be 63 in which case we have an out of
      range and hence undefined shift. */
   vassert(S >= 0 && S <= 63);
   vassert(esize >= (S+1));
   ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
                  //(1ULL << (S+1)) - 1;
                  ((1ULL << S) - 1) + (1ULL << S);

   Int d = // diff<len-1:0>
           diff & ((1 << len)-1);
   vassert(esize >= (d+1));
   vassert(d >= 0 && d <= 63);

   ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
                  //(1ULL << (d+1)) - 1;
                  ((1ULL << d) - 1) + (1ULL << d);

   if (esize != 64) vassert(elem_s < (1ULL << esize));
   if (esize != 64) vassert(elem_d < (1ULL << esize));

   if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
   if (tmask) *tmask = dbm_RepTo64(esize, elem_d);

   return True;
}
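/* A worked example may help when reading the decoders below.  The
   64-bit logical immediate 0xFF is encoded as N=1, immr=0,
   imms=0b000111; the disabled sketch below shows what the helper
   produces for it.  The encoding values are an example chosen for
   illustration, not taken from any particular guest instruction. */
#if 0
static void dbm_DecodeBitMasks_example ( void )
{
   ULong wmask = 0, tmask = 0;
   Bool  ok = dbm_DecodeBitMasks(&wmask, &tmask,
                                 /*immN=*/1, /*imms=*/0x07, /*immr=*/0,
                                 /*immediate=*/True, /*M=*/64);
   /* len = 6, so esize = 64 and levels = 0b111111.  S = 7 and R = 0,
      hence elem_s = 0xFF (eight ones), rotated by 0 and replicated
      across a single 64-bit element. */
   vassert(ok);
   vassert(wmask == 0xFFULL);
   vassert(tmask == 0xFFULL);
}
#endif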
2432 Bool
dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult
* dres
,
2433 UInt insn
, Bool sigill_diag
)
2435 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
   /* insn[28:23]
      10000x PC-rel addressing
      10001x Add/subtract (immediate)
      100100 Logical (immediate)
      100101 Move Wide (immediate)
      100110 Bitfield
      100111 Extract
   */
2446 /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2447 if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2448 Bool is64
= INSN(31,31) == 1;
2449 Bool isSub
= INSN(30,30) == 1;
2450 Bool setCC
= INSN(29,29) == 1;
2451 UInt sh
= INSN(23,22);
2452 UInt uimm12
= INSN(21,10);
2453 UInt nn
= INSN(9,5);
2454 UInt dd
= INSN(4,0);
2455 const HChar
* nm
= isSub
? "sub" : "add";
2457 /* Invalid; fall through */
2460 uimm12
<<= (12 * sh
);
2462 IRTemp argL
= newTemp(Ity_I64
);
2463 IRTemp argR
= newTemp(Ity_I64
);
2464 IRTemp res
= newTemp(Ity_I64
);
2465 assign(argL
, getIReg64orSP(nn
));
2466 assign(argR
, mkU64(uimm12
));
2467 assign(res
, binop(isSub
? Iop_Sub64
: Iop_Add64
,
2468 mkexpr(argL
), mkexpr(argR
)));
2470 putIReg64orZR(dd
, mkexpr(res
));
2471 setFlags_ADD_SUB(True
/*is64*/, isSub
, argL
, argR
);
2472 DIP("%ss %s, %s, 0x%x\n",
2473 nm
, nameIReg64orZR(dd
), nameIReg64orSP(nn
), uimm12
);
2475 putIReg64orSP(dd
, mkexpr(res
));
2476 DIP("%s %s, %s, 0x%x\n",
2477 nm
, nameIReg64orSP(dd
), nameIReg64orSP(nn
), uimm12
);
2480 IRTemp argL
= newTemp(Ity_I32
);
2481 IRTemp argR
= newTemp(Ity_I32
);
2482 IRTemp res
= newTemp(Ity_I32
);
2483 assign(argL
, getIReg32orSP(nn
));
2484 assign(argR
, mkU32(uimm12
));
2485 assign(res
, binop(isSub
? Iop_Sub32
: Iop_Add32
,
2486 mkexpr(argL
), mkexpr(argR
)));
2488 putIReg32orZR(dd
, mkexpr(res
));
2489 setFlags_ADD_SUB(False
/*!is64*/, isSub
, argL
, argR
);
2490 DIP("%ss %s, %s, 0x%x\n",
2491 nm
, nameIReg32orZR(dd
), nameIReg32orSP(nn
), uimm12
);
2493 putIReg32orSP(dd
, mkexpr(res
));
2494 DIP("%s %s, %s, 0x%x\n",
2495 nm
, nameIReg32orSP(dd
), nameIReg32orSP(nn
), uimm12
);
2502 /* -------------------- ADR/ADRP -------------------- */
2503 if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2504 UInt bP
= INSN(31,31);
2505 UInt immLo
= INSN(30,29);
2506 UInt immHi
= INSN(23,5);
2507 UInt rD
= INSN(4,0);
2508 ULong uimm
= (immHi
<< 2) | immLo
;
2509 ULong simm
= sx_to_64(uimm
, 21);
2512 val
= (guest_PC_curr_instr
& 0xFFFFFFFFFFFFF000ULL
) + (simm
<< 12);
2514 val
= guest_PC_curr_instr
+ simm
;
2516 putIReg64orZR(rD
, mkU64(val
));
2517 DIP("adr%s %s, 0x%llx\n", bP
? "p" : "", nameIReg64orZR(rD
), val
);
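      /* To make the ADRP computation above concrete: the page-aligned
         PC is combined with the sign-extended immediate shifted left
         by 12.  Disabled sketch with made-up example values (the
         addresses are hypothetical): */
#if 0
      {
         ULong pc   = 0x400123ULL;   /* hypothetical guest PC */
         ULong simm = 0x1ULL;        /* hypothetical page offset */
         ULong v    = (pc & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
         vassert(v == 0x401000ULL);
      }
#endif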
2521 /* -------------------- LOGIC(imm) -------------------- */
2522 if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2523 /* 31 30 28 22 21 15 9 4
2524 sf op 100100 N immr imms Rn Rd
2525 op=00: AND Rd|SP, Rn, #imm
2526 op=01: ORR Rd|SP, Rn, #imm
2527 op=10: EOR Rd|SP, Rn, #imm
2528 op=11: ANDS Rd|ZR, Rn, #imm
2530 Bool is64
= INSN(31,31) == 1;
2531 UInt op
= INSN(30,29);
2532 UInt N
= INSN(22,22);
2533 UInt immR
= INSN(21,16);
2534 UInt immS
= INSN(15,10);
2535 UInt nn
= INSN(9,5);
2536 UInt dd
= INSN(4,0);
2539 if (N
== 1 && !is64
)
2540 goto after_logic_imm
; /* not allowed; fall through */
2541 ok
= dbm_DecodeBitMasks(&imm
, NULL
,
2542 N
, immS
, immR
, True
, is64
? 64 : 32);
2544 goto after_logic_imm
;
2546 const HChar
* names
[4] = { "and", "orr", "eor", "ands" };
2547 const IROp ops64
[4] = { Iop_And64
, Iop_Or64
, Iop_Xor64
, Iop_And64
};
2548 const IROp ops32
[4] = { Iop_And32
, Iop_Or32
, Iop_Xor32
, Iop_And32
};
2552 IRExpr
* argL
= getIReg64orZR(nn
);
2553 IRExpr
* argR
= mkU64(imm
);
2554 IRTemp res
= newTemp(Ity_I64
);
2555 assign(res
, binop(ops64
[op
], argL
, argR
));
2557 putIReg64orSP(dd
, mkexpr(res
));
2558 DIP("%s %s, %s, 0x%llx\n", names
[op
],
2559 nameIReg64orSP(dd
), nameIReg64orZR(nn
), imm
);
2561 putIReg64orZR(dd
, mkexpr(res
));
2562 setFlags_LOGIC(True
/*is64*/, res
);
2563 DIP("%s %s, %s, 0x%llx\n", names
[op
],
2564 nameIReg64orZR(dd
), nameIReg64orZR(nn
), imm
);
2567 IRExpr
* argL
= getIReg32orZR(nn
);
2568 IRExpr
* argR
= mkU32((UInt
)imm
);
2569 IRTemp res
= newTemp(Ity_I32
);
2570 assign(res
, binop(ops32
[op
], argL
, argR
));
2572 putIReg32orSP(dd
, mkexpr(res
));
2573 DIP("%s %s, %s, 0x%x\n", names
[op
],
2574 nameIReg32orSP(dd
), nameIReg32orZR(nn
), (UInt
)imm
);
2576 putIReg32orZR(dd
, mkexpr(res
));
2577 setFlags_LOGIC(False
/*!is64*/, res
);
2578 DIP("%s %s, %s, 0x%x\n", names
[op
],
2579 nameIReg32orZR(dd
), nameIReg32orZR(nn
), (UInt
)imm
);
2586 /* -------------------- MOV{Z,N,K} -------------------- */
2587 if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2590 sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw))
2591 sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw))
2592 sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw))
2594 Bool is64
= INSN(31,31) == 1;
2595 UInt subopc
= INSN(30,29);
2596 UInt hw
= INSN(22,21);
2597 UInt imm16
= INSN(20,5);
2598 UInt dd
= INSN(4,0);
2599 if (subopc
== BITS2(0,1) || (!is64
&& hw
>= 2)) {
2600 /* invalid; fall through */
2602 ULong imm64
= ((ULong
)imm16
) << (16 * hw
);
2604 vassert(imm64
< 0x100000000ULL
);
2606 case BITS2(1,0): // MOVZ
2607 putIRegOrZR(is64
, dd
, is64
? mkU64(imm64
) : mkU32((UInt
)imm64
));
2608 DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64
, dd
), imm64
);
2610 case BITS2(0,0): // MOVN
2613 imm64
&= 0xFFFFFFFFULL
;
2614 putIRegOrZR(is64
, dd
, is64
? mkU64(imm64
) : mkU32((UInt
)imm64
));
2615 DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64
, dd
), imm64
);
2617 case BITS2(1,1): // MOVK
2618 /* This is more complex. We are inserting a slice into
2619 the destination register, so we need to have the old
2622 IRTemp old
= newTemp(Ity_I64
);
2623 assign(old
, getIReg64orZR(dd
));
2624 ULong mask
= 0xFFFFULL
<< (16 * hw
);
2627 binop(Iop_And64
, mkexpr(old
), mkU64(~mask
)),
2629 putIReg64orZR(dd
, res
);
2630 DIP("movk %s, 0x%x, lsl %u\n",
2631 nameIReg64orZR(dd
), imm16
, 16*hw
);
2633 IRTemp old
= newTemp(Ity_I32
);
2634 assign(old
, getIReg32orZR(dd
));
2636 UInt mask
= ((UInt
)0xFFFF) << (16 * hw
);
2639 binop(Iop_And32
, mkexpr(old
), mkU32(~mask
)),
2640 mkU32((UInt
)imm64
));
2641 putIReg32orZR(dd
, res
);
2642 DIP("movk %s, 0x%x, lsl %u\n",
2643 nameIReg32orZR(dd
), imm16
, 16*hw
);
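            /* The MOVK case above keeps everything in the destination
               except the 16-bit slice being written.  Disabled scalar
               sketch of the update, with hypothetical register
               contents: */
#if 0
            {
               /* MOVK Xd, #0xBEEF, LSL #16 applied to an old value. */
               ULong oldv  = 0x1111222233334444ULL;   /* hypothetical Xd */
               UInt  k16   = 0xBEEF;
               UInt  hwSh  = 1;                       /* shift = 16 * hw */
               ULong kmask = 0xFFFFULL << (16 * hwSh);
               ULong newv  = (oldv & ~kmask) | (((ULong)k16) << (16 * hwSh));
               vassert(newv == 0x11112222BEEF4444ULL);
            }
#endif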
2653 /* -------------------- {U,S,}BFM -------------------- */
2654 /* 30 28 22 21 15 9 4
2656 sf 10 100110 N immr imms nn dd
2657 UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2658 UBFM Xd, Xn, #immr, #imms when sf=1, N=1
2660 sf 00 100110 N immr imms nn dd
2661 SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2662 SBFM Xd, Xn, #immr, #imms when sf=1, N=1
2664 sf 01 100110 N immr imms nn dd
2665 BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2666 BFM Xd, Xn, #immr, #imms when sf=1, N=1
2668 if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2669 UInt sf
= INSN(31,31);
2670 UInt opc
= INSN(30,29);
2671 UInt N
= INSN(22,22);
2672 UInt immR
= INSN(21,16);
2673 UInt immS
= INSN(15,10);
2674 UInt nn
= INSN(9,5);
2675 UInt dd
= INSN(4,0);
2676 Bool inZero
= False
;
2677 Bool extend
= False
;
2678 const HChar
* nm
= "???";
2679 /* skip invalid combinations */
2682 inZero
= True
; extend
= True
; nm
= "sbfm"; break;
2684 inZero
= False
; extend
= False
; nm
= "bfm"; break;
2686 inZero
= True
; extend
= False
; nm
= "ubfm"; break;
2688 goto after_bfm
; /* invalid */
2692 if (sf
== 1 && N
!= 1) goto after_bfm
;
2693 if (sf
== 0 && (N
!= 0 || ((immR
>> 5) & 1) != 0
2694 || ((immS
>> 5) & 1) != 0)) goto after_bfm
;
2695 ULong wmask
= 0, tmask
= 0;
2696 Bool ok
= dbm_DecodeBitMasks(&wmask
, &tmask
,
2697 N
, immS
, immR
, False
, sf
== 1 ? 64 : 32);
2698 if (!ok
) goto after_bfm
; /* hmmm */
2700 Bool is64
= sf
== 1;
2701 IRType ty
= is64
? Ity_I64
: Ity_I32
;
2703 // Handle plain shifts explicitly. These are functionally identical to
2704 // the general case below, but iropt isn't clever enough to reduce those
2705 // sequences to plain shifts. So give it a hand.
2706 if (is64
&& immS
== 63 && immR
>= 1 && immR
<= 63) {
2707 if (opc
== BITS2(0,0)) {
2708 // 64-bit signed shift right
2709 putIReg64orZR(dd
, binop(Iop_Sar64
, getIReg64orZR(nn
), mkU8(immR
)));
2710 DIP("asr %s, %s, #%u\n",
2711 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), immR
);
2714 if (opc
== BITS2(1,0)) {
2715 // 64-bit unsigned shift right
2716 putIReg64orZR(dd
, binop(Iop_Shr64
, getIReg64orZR(nn
), mkU8(immR
)));
2717 DIP("lsr %s, %s, #%u\n",
2718 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), immR
);
2723 if (!is64
&& immS
== 31 && immR
>= 1 && immR
<= 31) {
2724 if (opc
== BITS2(0,0)) {
2725 // 32-bit signed shift right
2726 putIReg32orZR(dd
, binop(Iop_Sar32
, getIReg32orZR(nn
), mkU8(immR
)));
2727 DIP("asr %s, %s, #%u\n",
2728 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), immR
);
2731 if (opc
== BITS2(1,0)) {
2732 // 32-bit unsigned shift right
2733 putIReg32orZR(dd
, binop(Iop_Shr32
, getIReg32orZR(nn
), mkU8(immR
)));
2734 DIP("lsr %s, %s, #%u\n",
2735 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), immR
);
2740 if (is64
&& immS
>= 0 && immS
<= 62
2741 && immR
== immS
+ 1 && opc
== BITS2(1,0)) {
2742 // 64-bit shift left
2743 UInt shift
= 64 - immR
;
2744 vassert(shift
>= 1 && shift
<= 63);
2745 putIReg64orZR(dd
, binop(Iop_Shl64
, getIReg64orZR(nn
), mkU8(shift
)));
2746 DIP("lsl %s, %s, #%u\n",
2747 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), shift
);
2750 if (!is64
&& immS
>= 0 && immS
<= 30
2751 && immR
== immS
+ 1 && opc
== BITS2(1,0)) {
2752 // 32-bit shift left
2753 UInt shift
= 32 - immR
;
2754 vassert(shift
>= 1 && shift
<= 31);
2755 putIReg32orZR(dd
, binop(Iop_Shl32
, getIReg32orZR(nn
), mkU8(shift
)));
2756 DIP("lsl %s, %s, #%u\n",
2757 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), shift
);
2761 // Also special-case sxtw.
2762 if (opc
== BITS2(0,0) && immR
== 0) {
2764 // The destination size is 64 bits.
2766 putIReg64orZR(dd
, unop(Iop_32Sto64
, getIReg32orZR(nn
)));
2767 DIP("sxtw %s, %s\n", nameIReg64orZR(dd
), nameIReg32orZR(nn
));
2771 putIReg64orZR(dd
, unop(Iop_16Sto64
,
2772 unop(Iop_64to16
, getIReg64orZR(nn
))));
2773 DIP("sxth %s, %s\n", nameIReg64orZR(dd
), nameIReg32orZR(nn
));
2777 putIReg64orZR(dd
, unop(Iop_8Sto64
,
2778 unop(Iop_64to8
, getIReg64orZR(nn
))));
2779 DIP("sxtb %s, %s\n", nameIReg64orZR(dd
), nameIReg32orZR(nn
));
2783 // The destination size is 32 bits.
2785 putIReg32orZR(dd
, unop(Iop_16Sto32
,
2786 unop(Iop_64to16
, getIReg64orZR(nn
))));
2787 DIP("sxth %s, %s\n", nameIReg32orZR(dd
), nameIReg32orZR(nn
));
2791 putIReg32orZR(dd
, unop(Iop_8Sto32
,
2792 unop(Iop_64to8
, getIReg64orZR(nn
))));
2793 DIP("sxtb %s, %s\n", nameIReg32orZR(dd
), nameIReg32orZR(nn
));
2799 // None of the special cases apply. We have to use the (slow) general
2801 IRTemp dst
= newTemp(ty
);
2802 IRTemp src
= newTemp(ty
);
2803 IRTemp bot
= newTemp(ty
);
2804 IRTemp top
= newTemp(ty
);
2805 IRTemp res
= newTemp(ty
);
2806 assign(dst
, inZero
? mkU(ty
,0) : getIRegOrZR(is64
, dd
));
2807 assign(src
, getIRegOrZR(is64
, nn
));
2808 /* perform bitfield move on low bits */
2809 assign(bot
, binop(mkOR(ty
),
2810 binop(mkAND(ty
), mkexpr(dst
), mkU(ty
, ~wmask
)),
2811 binop(mkAND(ty
), mkexpr(mathROR(ty
, src
, immR
)),
2813 /* determine extension bits (sign, zero or dest register) */
2814 assign(top
, mkexpr(extend
? mathREPLICATE(ty
, src
, immS
) : dst
));
2815 /* combine extension bits and result bits */
2816 assign(res
, binop(mkOR(ty
),
2817 binop(mkAND(ty
), mkexpr(top
), mkU(ty
, ~tmask
)),
2818 binop(mkAND(ty
), mkexpr(bot
), mkU(ty
, tmask
))));
2819 putIRegOrZR(is64
, dd
, mkexpr(res
));
2820 DIP("%s %s, %s, immR=%u, immS=%u\n",
2821 nm
, nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
), immR
, immS
);
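      /* The general bitfield move above follows the architectural
         pseudocode: 'bot' merges the rotated source into the
         destination under wmask, and the result merges the extension
         bits with 'bot' under tmask.  Disabled scalar model of the
         same two steps, specialised to one UBFX-style case; the input
         value is hypothetical: */
#if 0
      {
         /* UBFM Xd, Xn, #8, #15 (== UBFX Xd, Xn, #8, #8).  For that
            encoding DecodeBitMasks gives wmask = 0xFF000000000000FF
            and tmask = 0xFF. */
         ULong srcv  = 0x0000000000ABCD12ULL;  /* hypothetical Xn */
         ULong dstv  = 0;                      /* inZero for UBFM */
         ULong wm    = 0xFF000000000000FFULL;
         ULong tm    = 0xFFULL;
         ULong rot   = (srcv >> 8) | (srcv << 56);     /* ROR(src, immR) */
         ULong botv  = (dstv & ~wm) | (rot & wm);
         ULong topv  = dstv;                   /* no sign extension here */
         ULong resv  = (topv & ~tm) | (botv & tm);
         vassert(resv == 0xCDULL);             /* bits 15:8 of srcv */
      }
#endif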
2826 /* ---------------------- EXTR ---------------------- */
2827 /* 30 28 22 20 15 9 4
2828 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6
2829 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2831 if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2832 Bool is64
= INSN(31,31) == 1;
2833 UInt mm
= INSN(20,16);
2834 UInt imm6
= INSN(15,10);
2835 UInt nn
= INSN(9,5);
2836 UInt dd
= INSN(4,0);
2838 if (INSN(31,31) != INSN(22,22))
2840 if (!is64
&& imm6
>= 32)
2842 if (!valid
) goto after_extr
;
2843 IRType ty
= is64
? Ity_I64
: Ity_I32
;
2844 IRTemp srcHi
= newTemp(ty
);
2845 IRTemp srcLo
= newTemp(ty
);
2846 IRTemp res
= newTemp(ty
);
2847 assign(srcHi
, getIRegOrZR(is64
, nn
));
2848 assign(srcLo
, getIRegOrZR(is64
, mm
));
2850 assign(res
, mkexpr(srcLo
));
2852 UInt szBits
= 8 * sizeofIRType(ty
);
2853 vassert(imm6
> 0 && imm6
< szBits
);
2854 assign(res
, binop(mkOR(ty
),
2855 binop(mkSHL(ty
), mkexpr(srcHi
), mkU8(szBits
-imm6
)),
2856 binop(mkSHR(ty
), mkexpr(srcLo
), mkU8(imm6
))));
2858 putIRegOrZR(is64
, dd
, mkexpr(res
));
2859 DIP("extr %s, %s, %s, #%u\n",
2860 nameIRegOrZR(is64
,dd
),
2861 nameIRegOrZR(is64
,nn
), nameIRegOrZR(is64
,mm
), imm6
);
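      /* EXTR takes a 64- (or 32-) bit window out of the concatenation
         Xn:Xm, starting at bit #imm6 of the low register, i.e.
         (hi << (size-imm6)) | (lo >> imm6) when imm6 != 0.  Disabled
         sketch with hypothetical register values: */
#if 0
      {
         ULong hi  = 0x1122334455667788ULL;   /* hypothetical Xn */
         ULong lo  = 0x99AABBCCDDEEFF00ULL;   /* hypothetical Xm */
         UInt  amt = 8;
         ULong r   = (hi << (64 - amt)) | (lo >> amt);
         vassert(r == 0x8899AABBCCDDEEFFULL);
      }
#endif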
2867 vex_printf("ARM64 front end: data_processing_immediate\n");
/*------------------------------------------------------------*/
/*--- Data processing (register) instructions               ---*/
/*------------------------------------------------------------*/

static const HChar* nameSH ( UInt sh ) {
   switch (sh) {
      case 0: return "lsl";
      case 1: return "lsr";
      case 2: return "asr";
      case 3: return "ror";
      default: vassert(0);
   }
}

/* Generate IR to get a register value, possibly shifted by an
   immediate.  Returns either a 32- or 64-bit temporary holding the
   result.  After the shift, the value can optionally be NOT-ed
   too.

   sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR.  sh_amt may only be
   in the range 0 to (is64 ? 64 : 32)-1.  For some instructions, ROR
   isn't allowed, but it's the job of the caller to check that.
*/
2897 static IRTemp
getShiftedIRegOrZR ( Bool is64
,
2898 UInt sh_how
, UInt sh_amt
, UInt regNo
,
2901 vassert(sh_how
< 4);
2902 vassert(sh_amt
< (is64
? 64 : 32));
2903 IRType ty
= is64
? Ity_I64
: Ity_I32
;
2904 IRTemp t0
= newTemp(ty
);
2905 assign(t0
, getIRegOrZR(is64
, regNo
));
2906 IRTemp t1
= newTemp(ty
);
2909 assign(t1
, binop(mkSHL(ty
), mkexpr(t0
), mkU8(sh_amt
)));
2912 assign(t1
, binop(mkSHR(ty
), mkexpr(t0
), mkU8(sh_amt
)));
2915 assign(t1
, binop(mkSAR(ty
), mkexpr(t0
), mkU8(sh_amt
)));
2918 assign(t1
, mkexpr(mathROR(ty
, t0
, sh_amt
)));
2924 IRTemp t2
= newTemp(ty
);
2925 assign(t2
, unop(mkNOT(ty
), mkexpr(t1
)));
2934 Bool
dis_ARM64_data_processing_register(/*MB_OUT*/DisResult
* dres
,
2935 UInt insn
, Bool sigill_diag
)
2937 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2939 /* ------------------- ADD/SUB(reg) ------------------- */
2940 /* x==0 => 32 bit op x==1 => 64 bit op
2941 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2943 31 30 29 28 23 21 20 15 9 4
2945 x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6)
2946 x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6)
2947 x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6)
2948 x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6)
2950 if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2951 UInt bX
= INSN(31,31);
2952 UInt bOP
= INSN(30,30); /* 0: ADD, 1: SUB */
2953 UInt bS
= INSN(29, 29); /* set flags? */
2954 UInt sh
= INSN(23,22);
2955 UInt rM
= INSN(20,16);
2956 UInt imm6
= INSN(15,10);
2957 UInt rN
= INSN(9,5);
2958 UInt rD
= INSN(4,0);
2959 Bool isSUB
= bOP
== 1;
2960 Bool is64
= bX
== 1;
2961 IRType ty
= is64
? Ity_I64
: Ity_I32
;
2962 if ((!is64
&& imm6
> 31) || sh
== BITS2(1,1)) {
2963 /* invalid; fall through */
2965 IRTemp argL
= newTemp(ty
);
2966 assign(argL
, getIRegOrZR(is64
, rN
));
2967 IRTemp argR
= getShiftedIRegOrZR(is64
, sh
, imm6
, rM
, False
);
2968 IROp op
= isSUB
? mkSUB(ty
) : mkADD(ty
);
2969 IRTemp res
= newTemp(ty
);
2970 assign(res
, binop(op
, mkexpr(argL
), mkexpr(argR
)));
2971 if (rD
!= 31) putIRegOrZR(is64
, rD
, mkexpr(res
));
2973 setFlags_ADD_SUB(is64
, isSUB
, argL
, argR
);
2975 DIP("%s%s %s, %s, %s, %s #%u\n",
2976 bOP
? "sub" : "add", bS
? "s" : "",
2977 nameIRegOrZR(is64
, rD
), nameIRegOrZR(is64
, rN
),
2978 nameIRegOrZR(is64
, rM
), nameSH(sh
), imm6
);
2983 /* ------------------- ADC/SBC(reg) ------------------- */
2984 /* x==0 => 32 bit op x==1 => 64 bit op
2986 31 30 29 28 23 21 20 15 9 4
2988 x 0 0 11010 00 0 Rm 000000 Rn Rd ADC Rd,Rn,Rm
2989 x 0 1 11010 00 0 Rm 000000 Rn Rd ADCS Rd,Rn,Rm
2990 x 1 0 11010 00 0 Rm 000000 Rn Rd SBC Rd,Rn,Rm
2991 x 1 1 11010 00 0 Rm 000000 Rn Rd SBCS Rd,Rn,Rm
2994 if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2995 UInt bX
= INSN(31,31);
2996 UInt bOP
= INSN(30,30); /* 0: ADC, 1: SBC */
2997 UInt bS
= INSN(29,29); /* set flags */
2998 UInt rM
= INSN(20,16);
2999 UInt rN
= INSN(9,5);
3000 UInt rD
= INSN(4,0);
3002 Bool isSUB
= bOP
== 1;
3003 Bool is64
= bX
== 1;
3004 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3006 IRTemp oldC
= newTemp(ty
);
3008 is64
? mk_arm64g_calculate_flag_c()
3009 : unop(Iop_64to32
, mk_arm64g_calculate_flag_c()) );
3011 IRTemp argL
= newTemp(ty
);
3012 assign(argL
, getIRegOrZR(is64
, rN
));
3013 IRTemp argR
= newTemp(ty
);
3014 assign(argR
, getIRegOrZR(is64
, rM
));
3016 IROp op
= isSUB
? mkSUB(ty
) : mkADD(ty
);
3017 IRTemp res
= newTemp(ty
);
3019 IRExpr
* one
= is64
? mkU64(1) : mkU32(1);
3020 IROp xorOp
= is64
? Iop_Xor64
: Iop_Xor32
;
3023 binop(op
, mkexpr(argL
), mkexpr(argR
)),
3024 binop(xorOp
, mkexpr(oldC
), one
)));
3028 binop(op
, mkexpr(argL
), mkexpr(argR
)),
3032 if (rD
!= 31) putIRegOrZR(is64
, rD
, mkexpr(res
));
3035 setFlags_ADC_SBC(is64
, isSUB
, argL
, argR
, oldC
);
3038 DIP("%s%s %s, %s, %s\n",
3039 bOP
? "sbc" : "adc", bS
? "s" : "",
3040 nameIRegOrZR(is64
, rD
), nameIRegOrZR(is64
, rN
),
3041 nameIRegOrZR(is64
, rM
));
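      /* Architecturally, ADC computes (Rn + Rm) + C and SBC computes
         (Rn - Rm) - (C ^ 1), which is what the binop/xor structure
         above builds.  Disabled scalar check with small example
         values: */
#if 0
      {
         ULong rn = 10, rm = 3;
         ULong c  = 0;                        /* incoming carry flag */
         ULong adc = (rn + rm) + c;           /* ADC */
         ULong sbc = (rn - rm) - (c ^ 1);     /* SBC subtracts the borrow */
         vassert(adc == 13);
         vassert(sbc == 6);                   /* 10 - 3 - 1, since C == 0 */
      }
#endif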
3045 /* -------------------- LOGIC(reg) -------------------- */
3046 /* x==0 => 32 bit op x==1 => 64 bit op
3047 N==0 => inv? is no-op (no inversion)
3049 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
3051 31 30 28 23 21 20 15 9 4
3053 x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6))
3054 x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6))
3055 x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6))
3056 x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6))
3057 With N=1, the names are: BIC ORN EON BICS
3059 if (INSN(28,24) == BITS5(0,1,0,1,0)) {
3060 UInt bX
= INSN(31,31);
3061 UInt sh
= INSN(23,22);
3062 UInt bN
= INSN(21,21);
3063 UInt rM
= INSN(20,16);
3064 UInt imm6
= INSN(15,10);
3065 UInt rN
= INSN(9,5);
3066 UInt rD
= INSN(4,0);
3067 Bool is64
= bX
== 1;
3068 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3069 if (!is64
&& imm6
> 31) {
3070 /* invalid; fall though */
3072 IRTemp argL
= newTemp(ty
);
3073 assign(argL
, getIRegOrZR(is64
, rN
));
3074 IRTemp argR
= getShiftedIRegOrZR(is64
, sh
, imm6
, rM
, bN
== 1);
3075 IROp op
= Iop_INVALID
;
3076 switch (INSN(30,29)) {
3077 case BITS2(0,0): case BITS2(1,1): op
= mkAND(ty
); break;
3078 case BITS2(0,1): op
= mkOR(ty
); break;
3079 case BITS2(1,0): op
= mkXOR(ty
); break;
3080 default: vassert(0);
3082 IRTemp res
= newTemp(ty
);
3083 assign(res
, binop(op
, mkexpr(argL
), mkexpr(argR
)));
3084 if (INSN(30,29) == BITS2(1,1)) {
3085 setFlags_LOGIC(is64
, res
);
3087 putIRegOrZR(is64
, rD
, mkexpr(res
));
3089 static const HChar
* names_op
[8]
3090 = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
3091 vassert(((bN
<< 2) | INSN(30,29)) < 8);
3092 const HChar
* nm_op
= names_op
[(bN
<< 2) | INSN(30,29)];
3093 /* Special-case the printing of "MOV" */
3094 if (rN
== 31/*zr*/ && sh
== 0/*LSL*/ && imm6
== 0 && bN
== 0) {
3095 DIP("mov %s, %s\n", nameIRegOrZR(is64
, rD
),
3096 nameIRegOrZR(is64
, rM
));
3098 DIP("%s %s, %s, %s, %s #%u\n", nm_op
,
3099 nameIRegOrZR(is64
, rD
), nameIRegOrZR(is64
, rN
),
3100 nameIRegOrZR(is64
, rM
), nameSH(sh
), imm6
);
3106 /* -------------------- {U,S}MULH -------------------- */
3107 /* 31 23 22 20 15 9 4
3108 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm
3109 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm
3111 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
3112 && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
3113 Bool isU
= INSN(23,23) == 1;
3114 UInt mm
= INSN(20,16);
3115 UInt nn
= INSN(9,5);
3116 UInt dd
= INSN(4,0);
3117 putIReg64orZR(dd
, unop(Iop_128HIto64
,
3118 binop(isU
? Iop_MullU64
: Iop_MullS64
,
3119 getIReg64orZR(nn
), getIReg64orZR(mm
))));
3120 DIP("%cmulh %s, %s, %s\n",
3122 nameIReg64orZR(dd
), nameIReg64orZR(nn
), nameIReg64orZR(mm
));
3126 /* -------------------- M{ADD,SUB} -------------------- */
3127 /* 31 30 20 15 14 9 4
3128 sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n
3129 sf 00 11011 000 m 1 a n r MADD Rd,Rn,Rm,Ra d = a-m*n
3131 if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
3132 Bool is64
= INSN(31,31) == 1;
3133 UInt mm
= INSN(20,16);
3134 Bool isAdd
= INSN(15,15) == 0;
3135 UInt aa
= INSN(14,10);
3136 UInt nn
= INSN(9,5);
3137 UInt dd
= INSN(4,0);
3141 binop(isAdd
? Iop_Add64
: Iop_Sub64
,
3143 binop(Iop_Mul64
, getIReg64orZR(mm
), getIReg64orZR(nn
))));
3147 binop(isAdd
? Iop_Add32
: Iop_Sub32
,
3149 binop(Iop_Mul32
, getIReg32orZR(mm
), getIReg32orZR(nn
))));
3151 DIP("%s %s, %s, %s, %s\n",
3152 isAdd
? "madd" : "msub",
3153 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
),
3154 nameIRegOrZR(is64
, mm
), nameIRegOrZR(is64
, aa
));
3158 /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3159 /* 31 30 28 20 15 11 9 4
3160 sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm
3161 sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm
3162 sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm
3163 sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm
3164 In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
3166 if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3167 Bool is64
= INSN(31,31) == 1;
3168 UInt b30
= INSN(30,30);
3169 UInt mm
= INSN(20,16);
3170 UInt cond
= INSN(15,12);
3171 UInt b10
= INSN(10,10);
3172 UInt nn
= INSN(9,5);
3173 UInt dd
= INSN(4,0);
3174 UInt op
= (b30
<< 1) | b10
; /* 00=id 01=inc 10=inv 11=neg */
3175 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3176 IRExpr
* argL
= getIRegOrZR(is64
, nn
);
3177 IRExpr
* argR
= getIRegOrZR(is64
, mm
);
3182 argR
= binop(mkADD(ty
), argR
, mkU(ty
,1));
3185 argR
= unop(mkNOT(ty
), argR
);
3188 argR
= binop(mkSUB(ty
), mkU(ty
,0), argR
);
3195 IRExpr_ITE(unop(Iop_64to1
, mk_arm64g_calculate_condition(cond
)),
3198 const HChar
* op_nm
[4] = { "csel", "csinc", "csinv", "csneg" };
3199 DIP("%s %s, %s, %s, %s\n", op_nm
[op
],
3200 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
),
3201 nameIRegOrZR(is64
, mm
), nameCC(cond
));
3205 /* -------------- ADD/SUB(extended reg) -------------- */
3207 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld
3208 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld
3210 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld
3211 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld
3213 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld
3214 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld
3216 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld
3217 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld
3219 The 'm' operand is extended per opt, thusly:
3222 001 Xm & 0xFFFF UXTH
3223 010 Xm & (2^32)-1 UXTW
3226 100 Xm sx from bit 7 SXTB
3227 101 Xm sx from bit 15 SXTH
3228 110 Xm sx from bit 31 SXTW
3231 In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3232 operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3233 are the identity operation on Wm.
3235 After extension, the value is shifted left by imm3 bits, which
3236 may only be in the range 0 .. 4 inclusive.
3238 if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3239 Bool is64
= INSN(31,31) == 1;
3240 Bool isSub
= INSN(30,30) == 1;
3241 Bool setCC
= INSN(29,29) == 1;
3242 UInt mm
= INSN(20,16);
3243 UInt opt
= INSN(15,13);
3244 UInt imm3
= INSN(12,10);
3245 UInt nn
= INSN(9,5);
3246 UInt dd
= INSN(4,0);
3247 const HChar
* nameExt
[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3248 "sxtb", "sxth", "sxtw", "sxtx" };
3249 /* Do almost the same thing in the 32- and 64-bit cases. */
3250 IRTemp xN
= newTemp(Ity_I64
);
3251 IRTemp xM
= newTemp(Ity_I64
);
3252 assign(xN
, getIReg64orSP(nn
));
3253 assign(xM
, getIReg64orZR(mm
));
3254 IRExpr
* xMw
= mkexpr(xM
); /* "xM widened" */
3258 case BITS3(0,0,0): // UXTB
3259 xMw
= binop(Iop_And64
, xMw
, mkU64(0xFF)); break;
3260 case BITS3(0,0,1): // UXTH
3261 xMw
= binop(Iop_And64
, xMw
, mkU64(0xFFFF)); break;
3262 case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3264 xMw
= unop(Iop_32Uto64
, unop(Iop_64to32
, xMw
));
3267 case BITS3(0,1,1): // UXTX -- always a noop
3269 case BITS3(1,0,0): // SXTB
3270 shSX
= 56; goto sxTo64
;
3271 case BITS3(1,0,1): // SXTH
3272 shSX
= 48; goto sxTo64
;
3273 case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3275 shSX
= 32; goto sxTo64
;
3278 case BITS3(1,1,1): // SXTX -- always a noop
3281 vassert(shSX
>= 32);
3282 xMw
= binop(Iop_Sar64
, binop(Iop_Shl64
, xMw
, mkU8(shSX
)),
3290 IRTemp argR
= newTemp(Ity_I64
);
3291 assign(argR
, binop(Iop_Shl64
, xMw
, mkU8(imm3
)));
3292 IRTemp res
= newTemp(Ity_I64
);
3293 assign(res
, binop(isSub
? Iop_Sub64
: Iop_Add64
,
3294 mkexpr(argL
), mkexpr(argR
)));
3297 putIReg64orZR(dd
, mkexpr(res
));
3298 setFlags_ADD_SUB(True
/*is64*/, isSub
, argL
, argR
);
3300 putIReg64orSP(dd
, mkexpr(res
));
3304 IRTemp argL32
= newTemp(Ity_I32
);
3305 IRTemp argR32
= newTemp(Ity_I32
);
3306 putIReg32orZR(dd
, unop(Iop_64to32
, mkexpr(res
)));
3307 assign(argL32
, unop(Iop_64to32
, mkexpr(argL
)));
3308 assign(argR32
, unop(Iop_64to32
, mkexpr(argR
)));
3309 setFlags_ADD_SUB(False
/*!is64*/, isSub
, argL32
, argR32
);
3311 putIReg32orSP(dd
, unop(Iop_64to32
, mkexpr(res
)));
3314 DIP("%s%s %s, %s, %s %s lsl %u\n",
3315 isSub
? "sub" : "add", setCC
? "s" : "",
3316 setCC
? nameIRegOrZR(is64
, dd
) : nameIRegOrSP(is64
, dd
),
3317 nameIRegOrSP(is64
, nn
), nameIRegOrSP(is64
, mm
),
3318 nameExt
[opt
], imm3
);
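      /* For the extended-register forms above, the second operand is
         first extended per 'opt' and then shifted left by imm3
         (0..4).  Disabled scalar sketch of the SXTB case; the Xm
         value is hypothetical: */
#if 0
      {
         /* SXTB then LSL #2, i.e. opt == 100, imm3 == 2. */
         ULong xm   = 0x00000000000000F0ULL;            /* bit 7 is set */
         ULong ext  = ((xm & 0xFF) ^ 0x80ULL) - 0x80ULL; /* sign-extend b7 */
         ULong opnd = ext << 2;
         vassert(ext  == 0xFFFFFFFFFFFFFFF0ULL);
         vassert(opnd == 0xFFFFFFFFFFFFFFC0ULL);
      }
#endif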
3322 /* ---------------- CCMP/CCMN(imm) ---------------- */
3323 /* Bizarrely, these appear in the "data processing register"
      category, even though they are operations against an
      immediate. */
3326 /* 31 29 20 15 11 9 3
3327 sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond
3328 sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond
3331 (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
3332 (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
3334 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3335 && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
3336 Bool is64
= INSN(31,31) == 1;
3337 Bool isSUB
= INSN(30,30) == 1;
3338 UInt imm5
= INSN(20,16);
3339 UInt cond
= INSN(15,12);
3340 UInt nn
= INSN(9,5);
3341 UInt nzcv
= INSN(3,0);
3343 IRTemp condT
= newTemp(Ity_I1
);
3344 assign(condT
, unop(Iop_64to1
, mk_arm64g_calculate_condition(cond
)));
3346 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3347 IRTemp argL
= newTemp(ty
);
3348 IRTemp argR
= newTemp(ty
);
3351 assign(argL
, getIReg64orZR(nn
));
3352 assign(argR
, mkU64(imm5
));
3354 assign(argL
, getIReg32orZR(nn
));
3355 assign(argR
, mkU32(imm5
));
3357 setFlags_ADD_SUB_conditionally(is64
, isSUB
, condT
, argL
, argR
, nzcv
);
3359 DIP("ccm%c %s, #%u, #%u, %s\n",
3360 isSUB
? 'p' : 'n', nameIRegOrZR(is64
, nn
),
3361 imm5
, nzcv
, nameCC(cond
));
3365 /* ---------------- CCMP/CCMN(reg) ---------------- */
3366 /* 31 29 20 15 11 9 3
3367 sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond
3368 sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond
3370 (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
3371 (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
3373 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3374 && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
3375 Bool is64
= INSN(31,31) == 1;
3376 Bool isSUB
= INSN(30,30) == 1;
3377 UInt mm
= INSN(20,16);
3378 UInt cond
= INSN(15,12);
3379 UInt nn
= INSN(9,5);
3380 UInt nzcv
= INSN(3,0);
3382 IRTemp condT
= newTemp(Ity_I1
);
3383 assign(condT
, unop(Iop_64to1
, mk_arm64g_calculate_condition(cond
)));
3385 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3386 IRTemp argL
= newTemp(ty
);
3387 IRTemp argR
= newTemp(ty
);
3390 assign(argL
, getIReg64orZR(nn
));
3391 assign(argR
, getIReg64orZR(mm
));
3393 assign(argL
, getIReg32orZR(nn
));
3394 assign(argR
, getIReg32orZR(mm
));
3396 setFlags_ADD_SUB_conditionally(is64
, isSUB
, condT
, argL
, argR
, nzcv
);
3398 DIP("ccm%c %s, %s, #%u, %s\n",
3399 isSUB
? 'p' : 'n', nameIRegOrZR(is64
, nn
),
3400 nameIRegOrZR(is64
, mm
), nzcv
, nameCC(cond
));
3405 /* -------------- REV/REV16/REV32/RBIT -------------- */
3406 /* 31 30 28 20 15 11 9 4
3408 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn
3409 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn
3411 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn
3412 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn
3414 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn
3415 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn
3417 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn
3419 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3420 && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
3421 UInt b31
= INSN(31,31);
3422 UInt opc
= INSN(11,10);
3425 /**/ if (b31
== 1 && opc
== BITS2(1,1)) ix
= 1;
3426 else if (b31
== 0 && opc
== BITS2(1,0)) ix
= 2;
3427 else if (b31
== 1 && opc
== BITS2(0,0)) ix
= 3;
3428 else if (b31
== 0 && opc
== BITS2(0,0)) ix
= 4;
3429 else if (b31
== 1 && opc
== BITS2(0,1)) ix
= 5;
3430 else if (b31
== 0 && opc
== BITS2(0,1)) ix
= 6;
3431 else if (b31
== 1 && opc
== BITS2(1,0)) ix
= 7;
3432 if (ix
>= 1 && ix
<= 7) {
3433 Bool is64
= ix
== 1 || ix
== 3 || ix
== 5 || ix
== 7;
3434 UInt nn
= INSN(9,5);
3435 UInt dd
= INSN(4,0);
3436 IRTemp src
= newTemp(Ity_I64
);
3437 IRTemp dst
= IRTemp_INVALID
;
3438 IRTemp (*math
)(IRTemp
) = NULL
;
3440 case 1: case 2: math
= math_BYTESWAP64
; break;
3441 case 3: case 4: math
= math_BITSWAP64
; break;
3442 case 5: case 6: math
= math_USHORTSWAP64
; break;
3443 case 7: math
= math_UINTSWAP64
; break;
3444 default: vassert(0);
3446 const HChar
* names
[7]
3447 = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
3448 const HChar
* nm
= names
[ix
-1];
3451 /* This has to be special cased, since the logic below doesn't
3452 handle it correctly. */
3453 assign(src
, getIReg64orZR(nn
));
3456 unop(Iop_32Uto64
, unop(Iop_64to32
, mkexpr(dst
))));
3458 assign(src
, getIReg64orZR(nn
));
3460 putIReg64orZR(dd
, mkexpr(dst
));
3462 assign(src
, binop(Iop_Shl64
, getIReg64orZR(nn
), mkU8(32)));
3464 putIReg32orZR(dd
, unop(Iop_64to32
, mkexpr(dst
)));
3466 DIP("%s %s, %s\n", nm
,
3467 nameIRegOrZR(is64
,dd
), nameIRegOrZR(is64
,nn
));
3470 /* else fall through */
3473 /* -------------------- CLZ/CLS -------------------- */
3474 /* 30 28 24 20 15 9 4
3475 sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn
3476 sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn
3478 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3479 && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
3480 Bool is64
= INSN(31,31) == 1;
3481 Bool isCLS
= INSN(10,10) == 1;
3482 UInt nn
= INSN(9,5);
3483 UInt dd
= INSN(4,0);
3484 IRTemp src
= newTemp(Ity_I64
);
3485 IRTemp srcZ
= newTemp(Ity_I64
);
3486 IRTemp dst
= newTemp(Ity_I64
);
3487 /* Get the argument, widened out to 64 bit */
3489 assign(src
, getIReg64orZR(nn
));
3491 assign(src
, binop(Iop_Shl64
,
3492 unop(Iop_32Uto64
, getIReg32orZR(nn
)), mkU8(32)));
3494 /* If this is CLS, mash the arg around accordingly */
3496 IRExpr
* one
= mkU8(1);
3499 binop(Iop_Shl64
, mkexpr(src
), one
),
3500 binop(Iop_Shl64
, binop(Iop_Shr64
, mkexpr(src
), one
), one
)));
3502 assign(srcZ
, mkexpr(src
));
3504 /* And compute CLZ. */
3506 assign(dst
, IRExpr_ITE(binop(Iop_CmpEQ64
, mkexpr(srcZ
), mkU64(0)),
3507 mkU64(isCLS
? 63 : 64),
3508 unop(Iop_Clz64
, mkexpr(srcZ
))));
3509 putIReg64orZR(dd
, mkexpr(dst
));
3511 assign(dst
, IRExpr_ITE(binop(Iop_CmpEQ64
, mkexpr(srcZ
), mkU64(0)),
3512 mkU64(isCLS
? 31 : 32),
3513 unop(Iop_Clz64
, mkexpr(srcZ
))));
3514 putIReg32orZR(dd
, unop(Iop_64to32
, mkexpr(dst
)));
3516 DIP("cl%c %s, %s\n", isCLS
? 's' : 'z',
3517 nameIRegOrZR(is64
, dd
), nameIRegOrZR(is64
, nn
));
3521 /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
3522 /* 30 28 20 15 11 9 4
3523 sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm
3524 sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm
3525 sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm
3526 sf 00 1101 0110 m 0010 11 n d RORV Rd,Rn,Rm
3528 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3529 && INSN(15,12) == BITS4(0,0,1,0)) {
3530 Bool is64
= INSN(31,31) == 1;
3531 UInt mm
= INSN(20,16);
3532 UInt op
= INSN(11,10);
3533 UInt nn
= INSN(9,5);
3534 UInt dd
= INSN(4,0);
3535 IRType ty
= is64
? Ity_I64
: Ity_I32
;
3536 IRTemp srcL
= newTemp(ty
);
3537 IRTemp srcR
= newTemp(Ity_I64
);
3538 IRTemp res
= newTemp(ty
);
3539 IROp iop
= Iop_INVALID
;
3540 assign(srcL
, getIRegOrZR(is64
, nn
));
3541 assign(srcR
, binop(Iop_And64
, getIReg64orZR(mm
),
3542 mkU64(is64
? 63 : 31)));
3546 case BITS2(0,0): iop
= mkSHL(ty
); break;
3547 case BITS2(0,1): iop
= mkSHR(ty
); break;
3548 case BITS2(1,0): iop
= mkSAR(ty
); break;
3549 default: vassert(0);
3551 assign(res
, binop(iop
, mkexpr(srcL
),
3552 unop(Iop_64to8
, mkexpr(srcR
))));
3555 IROp opSHL
= mkSHL(ty
);
3556 IROp opSHR
= mkSHR(ty
);
3557 IROp opOR
= mkOR(ty
);
3558 IRExpr
* width
= mkU64(is64
? 64: 32);
3562 binop(Iop_CmpEQ64
, mkexpr(srcR
), mkU64(0)),
3567 unop(Iop_64to8
, binop(Iop_Sub64
, width
,
3570 mkexpr(srcL
), unop(Iop_64to8
, mkexpr(srcR
))))
3573 putIRegOrZR(is64
, dd
, mkexpr(res
));
3575 const HChar
* names
[4] = { "lslv", "lsrv", "asrv", "rorv" };
3576 DIP("%s %s, %s, %s\n",
3577 names
[op
], nameIRegOrZR(is64
,dd
),
3578 nameIRegOrZR(is64
,nn
), nameIRegOrZR(is64
,mm
));
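      /* The RORV path above must avoid shifting by the full register
         width when the masked shift amount is zero, hence the
         CmpEQ64 special case.  Disabled scalar rotate showing the
         same guard, with a hypothetical input: */
#if 0
      {
         ULong x   = 0x00000000DEADBEEFULL;   /* hypothetical Rn */
         ULong amt = 68 & 63;                 /* Rm masked to 0..63, here 4 */
         ULong ror = (amt == 0)
                        ? x
                        : (x >> amt) | (x << (64 - amt));
         vassert(ror == 0xF00000000DEADBEEULL);
      }
#endif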
3582 /* -------------------- SDIV/UDIV -------------------- */
3583 /* 30 28 20 15 10 9 4
3584 sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm
3585 sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm
3587 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3588 && INSN(15,11) == BITS5(0,0,0,0,1)) {
3589 Bool is64
= INSN(31,31) == 1;
3590 UInt mm
= INSN(20,16);
3591 Bool isS
= INSN(10,10) == 1;
3592 UInt nn
= INSN(9,5);
3593 UInt dd
= INSN(4,0);
3595 putIRegOrZR(is64
, dd
, binop(is64
? Iop_DivS64
: Iop_DivS32
,
3596 getIRegOrZR(is64
, nn
),
3597 getIRegOrZR(is64
, mm
)));
3599 putIRegOrZR(is64
, dd
, binop(is64
? Iop_DivU64
: Iop_DivU32
,
3600 getIRegOrZR(is64
, nn
),
3601 getIRegOrZR(is64
, mm
)));
3603 DIP("%cdiv %s, %s, %s\n", isS
? 's' : 'u',
3604 nameIRegOrZR(is64
, dd
),
3605 nameIRegOrZR(is64
, nn
), nameIRegOrZR(is64
, mm
));
3609 /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
3610 /* 31 23 20 15 14 9 4
3611 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa
3612 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa
3613 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa
3614 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa
3616 Xd = Xa +/- (Wn *u/s Wm)
3618 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
3619 Bool isU
= INSN(23,23) == 1;
3620 UInt mm
= INSN(20,16);
3621 Bool isAdd
= INSN(15,15) == 0;
3622 UInt aa
= INSN(14,10);
3623 UInt nn
= INSN(9,5);
3624 UInt dd
= INSN(4,0);
3625 IRTemp wN
= newTemp(Ity_I32
);
3626 IRTemp wM
= newTemp(Ity_I32
);
3627 IRTemp xA
= newTemp(Ity_I64
);
3628 IRTemp muld
= newTemp(Ity_I64
);
3629 IRTemp res
= newTemp(Ity_I64
);
3630 assign(wN
, getIReg32orZR(nn
));
3631 assign(wM
, getIReg32orZR(mm
));
3632 assign(xA
, getIReg64orZR(aa
));
3633 assign(muld
, binop(isU
? Iop_MullU32
: Iop_MullS32
,
3634 mkexpr(wN
), mkexpr(wM
)));
3635 assign(res
, binop(isAdd
? Iop_Add64
: Iop_Sub64
,
3636 mkexpr(xA
), mkexpr(muld
)));
3637 putIReg64orZR(dd
, mkexpr(res
));
3638 DIP("%cm%sl %s, %s, %s, %s\n", isU
? 'u' : 's', isAdd
? "add" : "sub",
3639 nameIReg64orZR(dd
), nameIReg32orZR(nn
),
3640 nameIReg32orZR(mm
), nameIReg64orZR(aa
));
3644 /* -------------------- CRC32/CRC32C -------------------- */
3645 /* 31 30 20 15 11 9 4
3646 sf 00 1101 0110 m 0100 sz n d CRC32<sz> Wd, Wn, Wm|Xm
3647 sf 00 1101 0110 m 0101 sz n d CRC32C<sz> Wd, Wn, Wm|Xm
3649 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3650 && INSN(15,13) == BITS3(0,1,0)) {
3651 UInt bitSF
= INSN(31,31);
3652 UInt mm
= INSN(20,16);
3653 UInt bitC
= INSN(12,12);
3654 UInt sz
= INSN(11,10);
3655 UInt nn
= INSN(9,5);
3656 UInt dd
= INSN(4,0);
3657 vassert(sz
>= 0 && sz
<= 3);
3658 if ((bitSF
== 0 && sz
<= BITS2(1,0))
3659 || (bitSF
== 1 && sz
== BITS2(1,1))) {
3660 UInt ix
= (bitC
== 1 ? 4 : 0) | sz
;
3662 = { &arm64g_calc_crc32b
, &arm64g_calc_crc32h
,
3663 &arm64g_calc_crc32w
, &arm64g_calc_crc32x
,
3664 &arm64g_calc_crc32cb
, &arm64g_calc_crc32ch
,
3665 &arm64g_calc_crc32cw
, &arm64g_calc_crc32cx
};
3666 const HChar
* hNames
[8]
3667 = { "arm64g_calc_crc32b", "arm64g_calc_crc32h",
3668 "arm64g_calc_crc32w", "arm64g_calc_crc32x",
3669 "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
3670 "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
3671 const HChar
* iNames
[8]
3672 = { "crc32b", "crc32h", "crc32w", "crc32x",
3673 "crc32cb", "crc32ch", "crc32cw", "crc32cx" };
3675 IRTemp srcN
= newTemp(Ity_I64
);
3676 assign(srcN
, unop(Iop_32Uto64
, unop(Iop_64to32
, getIReg64orZR(nn
))));
3678 IRTemp srcM
= newTemp(Ity_I64
);
3679 IRExpr
* at64
= getIReg64orZR(mm
);
3682 assign(srcM
, binop(Iop_And64
, at64
, mkU64(0xFF))); break;
3684 assign(srcM
, binop(Iop_And64
, at64
, mkU64(0xFFFF))); break;
3686 assign(srcM
, binop(Iop_And64
, at64
, mkU64(0xFFFFFFFF))); break;
3688 assign(srcM
, at64
); break;
3693 vassert(ix
>= 0 && ix
<= 7);
3699 mkIRExprCCall(Ity_I64
, 0/*regparm*/,
3700 hNames
[ix
], helpers
[ix
],
3701 mkIRExprVec_2(mkexpr(srcN
),
3704 DIP("%s %s, %s, %s\n", iNames
[ix
],
3706 nameIReg32orZR(nn
), nameIRegOrZR(bitSF
== 1, mm
));
3713 vex_printf("ARM64 front end: data_processing_register\n");
3720 /*------------------------------------------------------------*/
3721 /*--- Math helpers for vector interleave/deinterleave ---*/
3722 /*------------------------------------------------------------*/
#define SL(_hi128,_lo128,_nbytes) \
   ( (_nbytes) == 0 \
        ? (_lo128) \
        : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
3730 #define ROR(_v128,_nbytes) \
3731 SL((_v128),(_v128),(_nbytes))
3732 #define ROL(_v128,_nbytes) \
3733 SL((_v128),(_v128),16-(_nbytes))
3734 #define SHR(_v128,_nbytes) \
3735 binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
3736 #define SHL(_v128,_nbytes) \
3737 binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
3738 #define ILO64x2(_argL,_argR) \
3739 binop(Iop_InterleaveLO64x2,(_argL),(_argR))
3740 #define IHI64x2(_argL,_argR) \
3741 binop(Iop_InterleaveHI64x2,(_argL),(_argR))
3742 #define ILO32x4(_argL,_argR) \
3743 binop(Iop_InterleaveLO32x4,(_argL),(_argR))
3744 #define IHI32x4(_argL,_argR) \
3745 binop(Iop_InterleaveHI32x4,(_argL),(_argR))
3746 #define ILO16x8(_argL,_argR) \
3747 binop(Iop_InterleaveLO16x8,(_argL),(_argR))
3748 #define IHI16x8(_argL,_argR) \
3749 binop(Iop_InterleaveHI16x8,(_argL),(_argR))
3750 #define ILO8x16(_argL,_argR) \
3751 binop(Iop_InterleaveLO8x16,(_argL),(_argR))
3752 #define IHI8x16(_argL,_argR) \
3753 binop(Iop_InterleaveHI8x16,(_argL),(_argR))
3754 #define CEV32x4(_argL,_argR) \
3755 binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
3756 #define COD32x4(_argL,_argR) \
3757 binop(Iop_CatOddLanes32x4,(_argL),(_argR))
3758 #define COD16x8(_argL,_argR) \
3759 binop(Iop_CatOddLanes16x8,(_argL),(_argR))
3760 #define COD8x16(_argL,_argR) \
3761 binop(Iop_CatOddLanes8x16,(_argL),(_argR))
3762 #define CEV8x16(_argL,_argR) \
3763 binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
3764 #define AND(_arg1,_arg2) \
3765 binop(Iop_AndV128,(_arg1),(_arg2))
3766 #define OR2(_arg1,_arg2) \
3767 binop(Iop_OrV128,(_arg1),(_arg2))
3768 #define OR3(_arg1,_arg2,_arg3) \
3769 binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
3770 #define OR4(_arg1,_arg2,_arg3,_arg4) \
3772 binop(Iop_OrV128,(_arg1),(_arg2)), \
3773 binop(Iop_OrV128,(_arg3),(_arg4)))
/* Do interleaving for 1 128 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
                           UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}
3785 /* Do interleaving for 2 128 bit vectors, for ST2 insns. */
3787 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp
* i0
, IRTemp
* i1
,
3788 UInt laneSzBlg2
, IRTemp u0
, IRTemp u1
)
3790 /* This is pretty easy, since we have primitives directly to
3792 if (laneSzBlg2
== 3) {
3794 // u1 == B1 B0, u0 == A1 A0
3795 // i1 == B1 A1, i0 == B0 A0
3796 assign(*i0
, binop(Iop_InterleaveLO64x2
, mkexpr(u1
), mkexpr(u0
)));
3797 assign(*i1
, binop(Iop_InterleaveHI64x2
, mkexpr(u1
), mkexpr(u0
)));
3800 if (laneSzBlg2
== 2) {
3802 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3803 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3804 assign(*i0
, binop(Iop_InterleaveLO32x4
, mkexpr(u1
), mkexpr(u0
)));
3805 assign(*i1
, binop(Iop_InterleaveHI32x4
, mkexpr(u1
), mkexpr(u0
)));
3808 if (laneSzBlg2
== 1) {
3810 // u1 == B{7..0}, u0 == A{7..0}
3811 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3812 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3813 assign(*i0
, binop(Iop_InterleaveLO16x8
, mkexpr(u1
), mkexpr(u0
)));
3814 assign(*i1
, binop(Iop_InterleaveHI16x8
, mkexpr(u1
), mkexpr(u0
)));
3817 if (laneSzBlg2
== 0) {
3819 // u1 == B{f..0}, u0 == A{f..0}
3820 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3821 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3822 assign(*i0
, binop(Iop_InterleaveLO8x16
, mkexpr(u1
), mkexpr(u0
)));
3823 assign(*i1
, binop(Iop_InterleaveHI8x16
, mkexpr(u1
), mkexpr(u0
)));
3831 /* Do interleaving for 3 128 bit vectors, for ST3 insns. */
3833 void math_INTERLEAVE3_128(
3834 /*OUTx3*/ IRTemp
* i0
, IRTemp
* i1
, IRTemp
* i2
,
3836 IRTemp u0
, IRTemp u1
, IRTemp u2
)
3838 if (laneSzBlg2
== 3) {
3840 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
3841 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
3842 assign(*i2
, IHI64x2( EX(u2
), EX(u1
) ));
3843 assign(*i1
, ILO64x2( ROR(EX(u0
),8), EX(u2
) ));
3844 assign(*i0
, ILO64x2( EX(u1
), EX(u0
) ));
3848 if (laneSzBlg2
== 2) {
3850 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
3851 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
3852 // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
3853 IRTemp p0
= newTempV128();
3854 IRTemp p1
= newTempV128();
3855 IRTemp p2
= newTempV128();
3856 IRTemp c1100
= newTempV128();
3857 IRTemp c0011
= newTempV128();
3858 IRTemp c0110
= newTempV128();
3859 assign(c1100
, mkV128(0xFF00));
3860 assign(c0011
, mkV128(0x00FF));
3861 assign(c0110
, mkV128(0x0FF0));
3862 // First interleave them at 64x2 granularity,
3863 // generating partial ("p") values.
3864 math_INTERLEAVE3_128(&p0
, &p1
, &p2
, 3, u0
, u1
, u2
);
3865 // And more shuffling around for the final answer
3866 assign(*i2
, OR2( AND( IHI32x4(EX(p2
), ROL(EX(p2
),8)), EX(c1100
) ),
3867 AND( IHI32x4(ROR(EX(p1
),4), EX(p2
)), EX(c0011
) ) ));
3868 assign(*i1
, OR3( SHL(EX(p2
),12),
3869 AND(EX(p1
),EX(c0110
)),
3871 assign(*i0
, OR2( AND( ILO32x4(EX(p0
),ROL(EX(p1
),4)), EX(c1100
) ),
3872 AND( ILO32x4(ROR(EX(p0
),8),EX(p0
)), EX(c0011
) ) ));
3876 if (laneSzBlg2
== 1) {
3878 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
3879 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
3880 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
3882 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
3883 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
3884 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
3886 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
3887 // i1 == A5 C4 B4 A4 C4 B3 A3 C2
3888 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
3889 IRTemp p0
= newTempV128();
3890 IRTemp p1
= newTempV128();
3891 IRTemp p2
= newTempV128();
3892 IRTemp c1000
= newTempV128();
3893 IRTemp c0100
= newTempV128();
3894 IRTemp c0010
= newTempV128();
3895 IRTemp c0001
= newTempV128();
3896 assign(c1000
, mkV128(0xF000));
3897 assign(c0100
, mkV128(0x0F00));
3898 assign(c0010
, mkV128(0x00F0));
3899 assign(c0001
, mkV128(0x000F));
3900 // First interleave them at 32x4 granularity,
3901 // generating partial ("p") values.
3902 math_INTERLEAVE3_128(&p0
, &p1
, &p2
, 2, u0
, u1
, u2
);
3903 // And more shuffling around for the final answer
3905 OR4( AND( IHI16x8( EX(p2
), ROL(EX(p2
),4) ), EX(c1000
) ),
3906 AND( IHI16x8( ROL(EX(p2
),6), EX(p2
) ), EX(c0100
) ),
3907 AND( IHI16x8( ROL(EX(p2
),2), ROL(EX(p2
),6) ), EX(c0010
) ),
3908 AND( ILO16x8( ROR(EX(p2
),2), ROL(EX(p1
),2) ), EX(c0001
) )
3911 OR4( AND( IHI16x8( ROL(EX(p1
),4), ROR(EX(p2
),2) ), EX(c1000
) ),
3912 AND( IHI16x8( EX(p1
), ROL(EX(p1
),4) ), EX(c0100
) ),
3913 AND( IHI16x8( ROL(EX(p1
),4), ROL(EX(p1
),8) ), EX(c0010
) ),
3914 AND( IHI16x8( ROR(EX(p0
),6), ROL(EX(p1
),4) ), EX(c0001
) )
3917 OR4( AND( IHI16x8( ROR(EX(p1
),2), ROL(EX(p0
),2) ), EX(c1000
) ),
3918 AND( IHI16x8( ROL(EX(p0
),2), ROL(EX(p0
),6) ), EX(c0100
) ),
3919 AND( IHI16x8( ROL(EX(p0
),8), ROL(EX(p0
),2) ), EX(c0010
) ),
3920 AND( IHI16x8( ROL(EX(p0
),4), ROL(EX(p0
),8) ), EX(c0001
) )
3925 if (laneSzBlg2
== 0) {
3926 // 8x16. It doesn't seem worth the hassle of first doing a
3927 // 16x8 interleave, so just generate all 24 partial results
3929 // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
3930 // i2 == Cf Bf Af Ce .. Bb Ab Ca
3931 // i1 == Ba Aa C9 B9 .. A6 C5 B5
3932 // i0 == A5 C4 B4 A4 .. C0 B0 A0
3934 IRTemp i2_FEDC
= newTempV128(); IRTemp i2_BA98
= newTempV128();
3935 IRTemp i2_7654
= newTempV128(); IRTemp i2_3210
= newTempV128();
3936 IRTemp i1_FEDC
= newTempV128(); IRTemp i1_BA98
= newTempV128();
3937 IRTemp i1_7654
= newTempV128(); IRTemp i1_3210
= newTempV128();
3938 IRTemp i0_FEDC
= newTempV128(); IRTemp i0_BA98
= newTempV128();
3939 IRTemp i0_7654
= newTempV128(); IRTemp i0_3210
= newTempV128();
3940 IRTemp i2_hi64
= newTempV128(); IRTemp i2_lo64
= newTempV128();
3941 IRTemp i1_hi64
= newTempV128(); IRTemp i1_lo64
= newTempV128();
3942 IRTemp i0_hi64
= newTempV128(); IRTemp i0_lo64
= newTempV128();
3944 // eg XXXX(qqq, CC, 0xF, BB, 0xA)) sets qqq to be a vector
3945 // of the form 14 bytes junk : CC[0xF] : BB[0xA]
3947 # define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
3948 IRTemp t_##_tempName = newTempV128(); \
3949 assign(t_##_tempName, \
3950 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
3951 ROR(EX(_srcVec2),(_srcShift2)) ) )
3953 // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
3954 IRTemp CC
= u2
; IRTemp BB
= u1
; IRTemp AA
= u0
;
3956 // The slicing and reassembly are done as interleavedly as possible,
3957 // so as to minimise the demand for registers in the back end, which
3958 // was observed to be a problem in testing.
3960 XXXX(CfBf
, CC
, 0xf, BB
, 0xf); // i2[15:14]
3961 XXXX(AfCe
, AA
, 0xf, CC
, 0xe);
3962 assign(i2_FEDC
, ILO16x8(EX(t_CfBf
), EX(t_AfCe
)));
3964 XXXX(BeAe
, BB
, 0xe, AA
, 0xe);
3965 XXXX(CdBd
, CC
, 0xd, BB
, 0xd);
3966 assign(i2_BA98
, ILO16x8(EX(t_BeAe
), EX(t_CdBd
)));
3967 assign(i2_hi64
, ILO32x4(EX(i2_FEDC
), EX(i2_BA98
)));
3969 XXXX(AdCc
, AA
, 0xd, CC
, 0xc);
3970 XXXX(BcAc
, BB
, 0xc, AA
, 0xc);
3971 assign(i2_7654
, ILO16x8(EX(t_AdCc
), EX(t_BcAc
)));
3973 XXXX(CbBb
, CC
, 0xb, BB
, 0xb);
3974 XXXX(AbCa
, AA
, 0xb, CC
, 0xa); // i2[1:0]
3975 assign(i2_3210
, ILO16x8(EX(t_CbBb
), EX(t_AbCa
)));
3976 assign(i2_lo64
, ILO32x4(EX(i2_7654
), EX(i2_3210
)));
3977 assign(*i2
, ILO64x2(EX(i2_hi64
), EX(i2_lo64
)));
3979 XXXX(BaAa
, BB
, 0xa, AA
, 0xa); // i1[15:14]
3980 XXXX(C9B9
, CC
, 0x9, BB
, 0x9);
3981 assign(i1_FEDC
, ILO16x8(EX(t_BaAa
), EX(t_C9B9
)));
3983 XXXX(A9C8
, AA
, 0x9, CC
, 0x8);
3984 XXXX(B8A8
, BB
, 0x8, AA
, 0x8);
3985 assign(i1_BA98
, ILO16x8(EX(t_A9C8
), EX(t_B8A8
)));
3986 assign(i1_hi64
, ILO32x4(EX(i1_FEDC
), EX(i1_BA98
)));
3988 XXXX(C7B7
, CC
, 0x7, BB
, 0x7);
3989 XXXX(A7C6
, AA
, 0x7, CC
, 0x6);
3990 assign(i1_7654
, ILO16x8(EX(t_C7B7
), EX(t_A7C6
)));
3992 XXXX(B6A6
, BB
, 0x6, AA
, 0x6);
3993 XXXX(C5B5
, CC
, 0x5, BB
, 0x5); // i1[1:0]
3994 assign(i1_3210
, ILO16x8(EX(t_B6A6
), EX(t_C5B5
)));
3995 assign(i1_lo64
, ILO32x4(EX(i1_7654
), EX(i1_3210
)));
3996 assign(*i1
, ILO64x2(EX(i1_hi64
), EX(i1_lo64
)));
3998 XXXX(A5C4
, AA
, 0x5, CC
, 0x4); // i0[15:14]
3999 XXXX(B4A4
, BB
, 0x4, AA
, 0x4);
4000 assign(i0_FEDC
, ILO16x8(EX(t_A5C4
), EX(t_B4A4
)));
4002 XXXX(C3B3
, CC
, 0x3, BB
, 0x3);
4003 XXXX(A3C2
, AA
, 0x3, CC
, 0x2);
4004 assign(i0_BA98
, ILO16x8(EX(t_C3B3
), EX(t_A3C2
)));
4005 assign(i0_hi64
, ILO32x4(EX(i0_FEDC
), EX(i0_BA98
)));
4007 XXXX(B2A2
, BB
, 0x2, AA
, 0x2);
4008 XXXX(C1B1
, CC
, 0x1, BB
, 0x1);
4009 assign(i0_7654
, ILO16x8(EX(t_B2A2
), EX(t_C1B1
)));
4011 XXXX(A1C0
, AA
, 0x1, CC
, 0x0);
4012 XXXX(B0A0
, BB
, 0x0, AA
, 0x0); // i0[1:0]
4013 assign(i0_3210
, ILO16x8(EX(t_A1C0
), EX(t_B0A0
)));
4014 assign(i0_lo64
, ILO32x4(EX(i0_7654
), EX(i0_3210
)));
4015 assign(*i0
, ILO64x2(EX(i0_hi64
), EX(i0_lo64
)));
/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_128(
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*i0, ILO64x2(EX(u1), EX(u0)));
      assign(*i1, ILO64x2(EX(u3), EX(u2)));
      assign(*i2, IHI64x2(EX(u1), EX(u0)));
      assign(*i3, IHI64x2(EX(u3), EX(u2)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // First, interleave at the 64-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
      // And interleave (cat) at the 32 bit size.
      assign(*i0, CEV32x4(EX(p1), EX(p0)));
      assign(*i1, COD32x4(EX(p1), EX(p0)));
      assign(*i2, CEV32x4(EX(p3), EX(p2)));
      assign(*i3, COD32x4(EX(p3), EX(p2)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // First, interleave at the 32-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 16 bit lanes.
      assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
      assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
      assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
      assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // First, interleave at the 16-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 8 bit lanes.
      assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
      assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
      assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
      assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
                             UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}
/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                             UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // i1 == B1 A1, i0 == B0 A0
      // u1 == B1 B0, u0 == A1 A0
      assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes32x4,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      // u1 == B{7..0}, u0 == A{7..0}
      assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes16x8,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      // u1 == B{f..0}, u0 == A{f..0}
      assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes8x16,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_128(
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1)        ));
      assign(*u1, ILO64x2( EX(i2),        ROL(EX(i0),8) ));
      assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0)        ));
      return;
   }

   if (laneSzBlg2 == 2) {
      // 32x4
      // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      IRTemp t_a1c0b0a0 = newTempV128();
      IRTemp t_a2c1b1a1 = newTempV128();
      IRTemp t_a3c2b2a2 = newTempV128();
      IRTemp t_a0c3b3a3 = newTempV128();
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      // Compute some intermediate values.
      assign(t_a1c0b0a0, EX(i0));
      assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
      assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
      assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
      // First deinterleave into lane-pairs
      assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
      assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
                         IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
      assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
      // Then deinterleave at 64x2 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
      return;
   }

   if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C3 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0

      IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
      s0 = s1 = s2 = s3
         = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&p0, &p1, &p2, &c00111111);

      // s0 == b2a2 c1b1a1 c0b0a0
      // s1 == b4a4 c3b3a3 c2b2a2
      // s2 == b6a6 c5b5a5 c4b4a4
      // s3 == b0a0 c7b7a7 c6b6a6
      assign(s0, EX(i0));
      assign(s1, SL(EX(i1),EX(i0),6*2));
      assign(s2, SL(EX(i2),EX(i1),4*2));
      assign(s3, SL(EX(i0),EX(i2),2*2));

      // t0 == 0 0 c1c0 b1b0 a1a0
      // t1 == 0 0 c3c2 b3b2 a3a2
      // t2 == 0 0 c5c4 b5b4 a5a4
      // t3 == 0 0 c7c6 b7b6 a7a6
      assign(c00111111, mkV128(0x0FFF));
      assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
      assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
      assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
      assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));

      assign(p0, OR2(EX(t0),          SHL(EX(t1),6*2)));
      assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
      assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));

      // Then deinterleave at 32x4 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
      return;
   }

   if (laneSzBlg2 == 0) {
      // 8x16.  This is the same scheme as for 16x8, with twice the
      // number of intermediate values.
      //
      // u2 == C{f..0}
      // u1 == B{f..0}
      // u0 == A{f..0}
      //
      // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
      // i1 ==  BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
      // i0 ==   A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      //
      // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
      // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
      // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
      //
      IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
             t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
         = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
         = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&s4, &s5, &s6, &s7);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&t4, &t5, &t6, &t7);
      newTempsV128_4(&p0, &p1, &p2, &cMASK);

      // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
      // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
      // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
      // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
      // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
      // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
      // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
      assign(s0, SL(EX(i1),EX(i0), 0));
      assign(s1, SL(EX(i1),EX(i0), 6));
      assign(s2, SL(EX(i1),EX(i0),12));
      assign(s3, SL(EX(i2),EX(i1), 2));
      assign(s4, SL(EX(i2),EX(i1), 8));
      assign(s5, SL(EX(i2),EX(i1),14));
      assign(s6, SL(EX(i0),EX(i2), 4));
      assign(s7, SL(EX(i0),EX(i2),10));

      // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
      // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
      // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
      // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
      // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
      // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
      // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
      // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
      assign(cMASK, mkV128(0x003F));
      assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
      assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
      assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
      assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
      assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
      assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
      assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
      assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));

      assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
      assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
                      SHL(EX(t3),2), SHR(EX(t2),4) ));
      assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));

      // Then deinterleave at 16x8 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_128(
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*u0, ILO64x2(EX(i2), EX(i0)));
      assign(*u1, IHI64x2(EX(i2), EX(i0)));
      assign(*u2, ILO64x2(EX(i3), EX(i1)));
      assign(*u3, IHI64x2(EX(i3), EX(i1)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      IRTemp p0 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, ILO32x4(EX(i1), EX(i0)));
      assign(p1, IHI32x4(EX(i1), EX(i0)));
      assign(p2, ILO32x4(EX(i3), EX(i2)));
      assign(p3, IHI32x4(EX(i3), EX(i2)));
      // And now do what we did for the 64-bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // Deinterleave into 32-bit chunks, then do as the 32-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
      assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
      assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
      assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
      // From here on is like the 32 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // Deinterleave into 16-bit chunks, then do as the 16-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
                          ILO8x16(EX(i0),ROL(EX(i0),4)) ));
      assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
                          ILO8x16(EX(i1),ROL(EX(i1),4)) ));
      assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
                          ILO8x16(EX(i2),ROL(EX(i2),4)) ));
      assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
                          ILO8x16(EX(i3),ROL(EX(i3),4)) ));
      // From here on is like the 16 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Wrappers that use the full-width (de)interleavers to do half-width
   (de)interleaving.  The scheme is to clone each input lane in the
   lower half of each incoming value, do a full width (de)interleave
   at the next lane size up, and remove every other lane of the
   result.  The returned values may have any old junk in the upper
   64 bits -- the caller must ignore that. */

/* Helper function -- get doubling and narrowing operations. */
static
void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
                                   /*OUT*/IROp* halver,
                                   UInt laneSzBlg2 )
{
   switch (laneSzBlg2) {
      case 2:
         *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
         break;
      case 1:
         *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
         break;
      case 0:
         *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
         break;
      default:
         vassert(0);
   }
}
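/* Informal sketch of the scheme above (illustration only, using the
   2-register, 32-bit-lane case handled by math_INTERLEAVE2_64 below):
   with u0 = .. a1 a0 and u1 = .. b1 b0 (payload in the low 64 bits),
   the doubler (InterleaveLO32x4) makes du0 = a1 a1 a0 a0 and
   du1 = b1 b1 b0 b0; a full-width interleave at 64-bit lanes then
   gives di0 = b0 b0 a0 a0 and di1 = b1 b1 a1 a1; and the halver
   (CatEvenLanes32x4) produces i0 = .. b0 a0 and i1 = .. b1 a1, with
   junk in the upper 64 bits as advertised. */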
/* Do interleaving for 1 64 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
                          UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}
/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                          UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
}
/* Do interleaving for 3 64 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_64(
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
}
/* Do interleaving for 4 64 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_64(
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      assign(*i3, EX(u3));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   assign(du3, binop(doubler, EX(u3), EX(u3)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
                        laneSzBlg2 + 1, du0, du1, du2, du3);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
   assign(*i3, binop(halver, EX(di3), EX(di3)));
}
/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
                            UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}
/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                            UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
}
/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_64(
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
}
/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_64(
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      assign(*u3, EX(i3));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   assign(di3, binop(doubler, EX(i3), EX(i3)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
                          laneSzBlg2 + 1, di0, di1, di2, di3);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
   assign(*u3, binop(halver, EX(du3), EX(du3)));
}
/*------------------------------------------------------------*/
/*--- Load and Store instructions                          ---*/
/*------------------------------------------------------------*/
/* Generate the EA for a "reg + reg" style amode.  This is done from
   parts of the insn, but for sanity checking's sake it takes the whole
   insn.  This appears to depend on insn[15:12], with opt=insn[15:13]
   and S=insn[12].

   The possible forms, along with their opt:S values, are:

      011:0   Xn|SP + Xm
      011:1   Xn|SP + Xm * transfer_szB
      111:1   Xn|SP + Xm * transfer_szB
      010:0   Xn|SP + 32Uto64(Wm)
      010:1   Xn|SP + 32Uto64(Wm) * transfer_szB
      110:0   Xn|SP + 32Sto64(Wm)
      110:1   Xn|SP + 32Sto64(Wm) * transfer_szB

   Rm is insn[20:16].  Rn is insn[9:5].  Rt is insn[4:0].  Log2 of
   the transfer size is insn[23,31,30].  For integer loads/stores,
   insn[23] is zero, hence szLg2 can be at most 3 in such cases.

   If the decoding fails, it returns IRTemp_INVALID.

   isInt is True iff this decoding is for transfers to/from integer
   registers.  If False it is for transfers to/from vector registers.
*/
static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
{
   UInt    optS  = SLICE_UInt(insn, 15, 12);
   UInt    mm    = SLICE_UInt(insn, 20, 16);
   UInt    nn    = SLICE_UInt(insn, 9, 5);
   UInt    szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
                   | SLICE_UInt(insn, 31, 30); // Log2 of the size

   buf[0] = 0;

   /* Sanity checks, that this really is a load/store insn. */
   if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
      goto fail;

   if (isInt
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
      goto fail;

   if (!isInt
       && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
      goto fail;

   /* Throw out non-verified but possibly valid cases. */
   switch (szLg2) {
      case BITS3(0,0,0): break; //  8 bit, valid for both int and vec
      case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
      case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
      case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
      case BITS3(1,0,0): // can only ever be valid for the vector case
                         if (isInt) goto fail; else break;
      case BITS3(1,0,1): // these sizes are never valid
      case BITS3(1,1,0):
      case BITS3(1,1,1): goto fail;

      default: vassert(0);
   }

   IRExpr* rhs = NULL;
   switch (optS) {
      case BITS4(1,1,1,0): goto fail; //ATC
      case BITS4(0,1,1,0):
         rhs = getIReg64orZR(mm);
         vex_sprintf(buf, "[%s, %s]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm));
         break;
      case BITS4(1,1,1,1): goto fail; //ATC
      case BITS4(0,1,1,1):
         rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s lsl %u]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
         break;
      case BITS4(0,1,0,0):
         rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
         vex_sprintf(buf, "[%s, %s uxtx]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm));
         break;
      case BITS4(0,1,0,1):
         rhs = binop(Iop_Shl64,
                     unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
         break;
      case BITS4(1,1,0,0):
         rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
         vex_sprintf(buf, "[%s, %s sxtx]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm));
         break;
      case BITS4(1,1,0,1):
         rhs = binop(Iop_Shl64,
                     unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
         break;
      default:
         /* The rest appear to be genuinely invalid */
         goto fail;
   }

   IRTemp res = newTemp(Ity_I64);
   assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
   return res;

  fail:
   if (0 /*really, sigill_diag, but that causes too much plumbing*/) {
      vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
   }
   return IRTemp_INVALID;
}
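/* For example (illustrative only): "ldr x1, [x2, x3, lsl #3]" has
   opt:S == 011:1 and szLg2 == 3, so the EA computed above is
   X2 + (X3 << 3), i.e. Xn|SP + Xm * transfer_szB. */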
/* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
   bits of DATAE :: Ity_I64. */
static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
{
   IRExpr* addrE = mkexpr(addr);
   switch (szB) {
      case 8:
         storeLE(addrE, dataE);
         break;
      case 4:
         storeLE(addrE, unop(Iop_64to32, dataE));
         break;
      case 2:
         storeLE(addrE, unop(Iop_64to16, dataE));
         break;
      case 1:
         storeLE(addrE, unop(Iop_64to8, dataE));
         break;
      default:
         vassert(0);
   }
}
/* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
   placing the result in an Ity_I64 temporary. */
static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
{
   IRTemp  res   = newTemp(Ity_I64);
   IRExpr* addrE = mkexpr(addr);
   switch (szB) {
      case 8:
         assign(res, loadLE(Ity_I64,addrE));
         break;
      case 4:
         assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
         break;
      case 2:
         assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
         break;
      case 1:
         assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
         break;
      default:
         vassert(0);
   }
   return res;
}
/* Generate a SIGBUS followed by a restart of the current instruction if
   `effective_addr` is not `align`-aligned.  This is required behaviour for
   atomic instructions.  This assumes that guest_PC_curr_instr is set
   correctly!

   This is hardwired to generate SIGBUS because so far the only supported arm64
   target (arm64-linux) does that.  Should we need to later extend it to
   generate some other signal, use the same scheme as with
   gen_SIGNAL_if_not_XX_aligned in guest_amd64_toIR.c. */
static
void gen_SIGBUS_if_not_XX_aligned ( IRTemp effective_addr, ULong align )
{
   vassert(align == 16 || align == 8 || align == 4 || align == 2);
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(align-1)),
               mkU64(0)),
         Ijk_SigBUS,
         IRConst_U64(guest_PC_curr_instr),
         OFFB_PC
      )
   );
}
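/* E.g. with align == 8 the exit above tests (effective_addr & 7) != 0,
   so a misaligned address takes the SIGBUS exit before any access is
   attempted. */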
/* Generate a "standard 7" name, from bitQ and size.  But also
   allow ".1d" since that's occasionally useful. */
static
const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
{
   vassert(bitQ <= 1 && size <= 3);
   static const HChar* nms[8]
      = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
   UInt ix = (bitQ << 2) | size;
   vassert(ix < 8);
   return nms[ix];
}
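/* E.g. bitQ == 1, size == 2 selects index 6, giving "4s", while
   bitQ == 0, size == 3 selects index 3, the special-case "1d". */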
static
Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
                          const VexAbiInfo* abiinfo, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   /* ------------ LDR,STR (immediate, uimm12) ----------- */
   /* uimm12 is scaled by the transfer size

      31 29  26    21    9  4
      |  |   |     |     |  |
      11 111 00100 imm12 nn tt    STR  Xt, [Xn|SP, #imm12 * 8]
      11 111 00101 imm12 nn tt    LDR  Xt, [Xn|SP, #imm12 * 8]

      10 111 00100 imm12 nn tt    STR  Wt, [Xn|SP, #imm12 * 4]
      10 111 00101 imm12 nn tt    LDR  Wt, [Xn|SP, #imm12 * 4]

      01 111 00100 imm12 nn tt    STRH Wt, [Xn|SP, #imm12 * 2]
      01 111 00101 imm12 nn tt    LDRH Wt, [Xn|SP, #imm12 * 2]

      00 111 00100 imm12 nn tt    STRB Wt, [Xn|SP, #imm12 * 1]
      00 111 00101 imm12 nn tt    LDRB Wt, [Xn|SP, #imm12 * 1]
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
      UInt   szLg2 = INSN(31,30);
      UInt   szB   = 1 << szLg2;
      Bool   isLD  = INSN(22,22) == 1;
      UInt   offs  = INSN(21,10) * szB;
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);
      IRTemp ta    = newTemp(Ity_I64);
      assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
      if (nn == 31) { /* FIXME generate stack alignment check */ }
      vassert(szLg2 < 4);
      if (isLD) {
         putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
      } else {
         gen_narrowing_store(szB, ta, getIReg64orZR(tt));
      }
      const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
      const HChar* st_name[4] = { "strb", "strh", "str", "str" };
      DIP("%s %s, [%s, #%u]\n",
          (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
          nameIReg64orSP(nn), offs);
      return True;
   }
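   /* For example, "ldr x1, [x2, #16]" is encoded with szLg2 == 3 and
      imm12 == 2, giving offs == 16, whereas "ldrb w1, [x2, #16]" uses
      szLg2 == 0 and imm12 == 16 directly. */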
   /* ------------ LDUR,STUR (immediate, simm9) ----------- */
   /* 31 29  26      20   11 9  4
      |  |   |       |    |  |  |
      (at-Rn-then-Rn=EA)  |  |  |
      sz 111 00000 0 imm9 01 Rn Rt   STR Rt, [Xn|SP], #simm9
      sz 111 00001 0 imm9 01 Rn Rt   LDR Rt, [Xn|SP], #simm9

      (at-EA-then-Rn=EA)
      sz 111 00000 0 imm9 11 Rn Rt   STR Rt, [Xn|SP, #simm9]!
      sz 111 00001 0 imm9 11 Rn Rt   LDR Rt, [Xn|SP, #simm9]!

      (at-Rn)
      sz 111 00000 0 imm9 00 Rn Rt   STR Rt, [Xn|SP, #simm9]
      sz 111 00001 0 imm9 00 Rn Rt   LDR Rt, [Xn|SP, #simm9]

      simm9 is unscaled.

      The case 'wback && Rn == Rt && Rt != 31' is disallowed.  In the
      load case this is because it would create two competing values
      for Rt.  In the store case the reason is unclear, but the spec
      disallows it anyway.

      Stores are narrowing, loads are unsigned widening.  sz encodes
      the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
   */
   if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
                   == BITS9(1,1,1, 0,0,0,0,0, 0)) {
      UInt szLg2  = INSN(31,30);
      UInt szB    = 1 << szLg2;
      Bool isLoad = INSN(22,22) == 1;
      UInt imm9   = INSN(20,12);
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);
      Bool wBack  = INSN(10,10) == 1;
      UInt how    = INSN(11,10);
      if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
         /* undecodable; fall through */
      } else {
         if (nn == 31) { /* FIXME generate stack alignment check */ }

         // Compute the transfer address TA and the writeback address WA.
         IRTemp tRN = newTemp(Ity_I64);
         assign(tRN, getIReg64orSP(nn));
         IRTemp tEA = newTemp(Ity_I64);
         Long simm9 = (Long)sx_to_64(imm9, 9);
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));

         IRTemp tTA = newTemp(Ity_I64);
         IRTemp tWA = newTemp(Ity_I64);
         switch (how) {
            case BITS2(0,1):
               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,1):
               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(0,0):
               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
            default:
               vassert(0); /* NOTREACHED */
         }

         /* Normally rN would be updated after the transfer.  However, in
            the special cases typified by
               str x30, [sp,#-16]!
               str x30, [sp,#-32]!
            it is necessary to update SP before the transfer, (1)
            because Memcheck will otherwise complain about a write
            below the stack pointer, and (2) because the segfault
            stack extension mechanism will otherwise extend the stack
            only down to SP before the instruction, which might not be
            far enough, if the -16/-32 bit takes the actual access
            address to the next page.
         */
         Bool earlyWBack
           = wBack && simm9 < 0
             && (szB == 8 || szB == 4 || szB == 2 || szB == 1)
             && how == BITS2(1,1) && nn == 31 && !isLoad;

         if (wBack && earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         if (isLoad) {
            putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
         } else {
            gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
         }

         if (wBack && !earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
         const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
         const HChar* fmt_str = NULL;
         switch (how) {
            case BITS2(0,1):
               fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
               break;
            case BITS2(1,1):
               fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
               break;
            case BITS2(0,0):
               fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
               break;
            default:
               vassert(0);
         }
         DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
                      nameIRegOrZR(szB == 8, tt),
                      nameIReg64orSP(nn), simm9);
         return True;
      }
   }
5038 /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
5041 x==0 => 32 bit transfers, and zero extended loads
5042 x==1 => 64 bit transfers
5043 simm7 is scaled by the (single-register) transfer size
5046 x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm
5049 x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]!
5052 x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]
5054 UInt insn_30_23
= INSN(30,23);
5055 if (insn_30_23
== BITS8(0,1,0,1,0,0,0,1)
5056 || insn_30_23
== BITS8(0,1,0,1,0,0,1,1)
5057 || insn_30_23
== BITS8(0,1,0,1,0,0,1,0)) {
5058 UInt bL
= INSN(22,22);
5059 UInt bX
= INSN(31,31);
5060 UInt bWBack
= INSN(23,23);
5061 UInt rT1
= INSN(4,0);
5062 UInt rN
= INSN(9,5);
5063 UInt rT2
= INSN(14,10);
5064 Long simm7
= (Long
)sx_to_64(INSN(21,15), 7);
5065 if ((bWBack
&& (rT1
== rN
|| rT2
== rN
) && rN
!= 31)
5066 || (bL
&& rT1
== rT2
)) {
5067 /* undecodable; fall through */
5069 if (rN
== 31) { /* FIXME generate stack alignment check */ }
5071 // Compute the transfer address TA and the writeback address WA.
5072 IRTemp tRN
= newTemp(Ity_I64
);
5073 assign(tRN
, getIReg64orSP(rN
));
5074 IRTemp tEA
= newTemp(Ity_I64
);
5075 simm7
= (bX
? 8 : 4) * simm7
;
5076 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm7
)));
5078 IRTemp tTA
= newTemp(Ity_I64
);
5079 IRTemp tWA
= newTemp(Ity_I64
);
5080 switch (INSN(24,23)) {
5082 assign(tTA
, mkexpr(tRN
)); assign(tWA
, mkexpr(tEA
)); break;
5084 assign(tTA
, mkexpr(tEA
)); assign(tWA
, mkexpr(tEA
)); break;
5086 assign(tTA
, mkexpr(tEA
)); /* tWA is unused */ break;
5088 vassert(0); /* NOTREACHED */
5091 /* Normally rN would be updated after the transfer. However, in
5092 the special case typifed by
5093 stp x29, x30, [sp,#-112]!
5094 it is necessary to update SP before the transfer, (1)
5095 because Memcheck will otherwise complain about a write
5096 below the stack pointer, and (2) because the segfault
5097 stack extension mechanism will otherwise extend the stack
5098 only down to SP before the instruction, which might not be
5099 far enough, if the -112 bit takes the actual access
5100 address to the next page.
5103 = bWBack
&& simm7
< 0
5104 && INSN(24,23) == BITS2(1,1) && rN
== 31 && bL
== 0;
5106 if (bWBack
&& earlyWBack
)
5107 putIReg64orSP(rN
, mkexpr(tEA
));
5109 /**/ if (bL
== 1 && bX
== 1) {
5111 putIReg64orZR(rT1
, loadLE(Ity_I64
,
5112 binop(Iop_Add64
,mkexpr(tTA
),mkU64(0))));
5113 putIReg64orZR(rT2
, loadLE(Ity_I64
,
5114 binop(Iop_Add64
,mkexpr(tTA
),mkU64(8))));
5115 } else if (bL
== 1 && bX
== 0) {
5117 putIReg32orZR(rT1
, loadLE(Ity_I32
,
5118 binop(Iop_Add64
,mkexpr(tTA
),mkU64(0))));
5119 putIReg32orZR(rT2
, loadLE(Ity_I32
,
5120 binop(Iop_Add64
,mkexpr(tTA
),mkU64(4))));
5121 } else if (bL
== 0 && bX
== 1) {
5123 storeLE(binop(Iop_Add64
,mkexpr(tTA
),mkU64(0)),
5124 getIReg64orZR(rT1
));
5125 storeLE(binop(Iop_Add64
,mkexpr(tTA
),mkU64(8)),
5126 getIReg64orZR(rT2
));
5128 vassert(bL
== 0 && bX
== 0);
5130 storeLE(binop(Iop_Add64
,mkexpr(tTA
),mkU64(0)),
5131 getIReg32orZR(rT1
));
5132 storeLE(binop(Iop_Add64
,mkexpr(tTA
),mkU64(4)),
5133 getIReg32orZR(rT2
));
5136 if (bWBack
&& !earlyWBack
)
5137 putIReg64orSP(rN
, mkexpr(tEA
));
5139 const HChar
* fmt_str
= NULL
;
5140 switch (INSN(24,23)) {
5142 fmt_str
= "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5145 fmt_str
= "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5148 fmt_str
= "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5153 DIP(fmt_str
, bL
== 0 ? "st" : "ld",
5154 nameIRegOrZR(bX
== 1, rT1
),
5155 nameIRegOrZR(bX
== 1, rT2
),
5156 nameIReg64orSP(rN
), simm7
);
5161 /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
5162 /* Does 32 bit transfers which are sign extended to 64 bits.
5163 simm7 is scaled by the (single-register) transfer size
5166 01 101 0001 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP], #imm
5169 01 101 0011 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]!
5172 01 101 0010 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]
5174 UInt insn_31_22
= INSN(31,22);
5175 if (insn_31_22
== BITS10(0,1,1,0,1,0,0,0,1,1)
5176 || insn_31_22
== BITS10(0,1,1,0,1,0,0,1,1,1)
5177 || insn_31_22
== BITS10(0,1,1,0,1,0,0,1,0,1)) {
5178 UInt bWBack
= INSN(23,23);
5179 UInt rT1
= INSN(4,0);
5180 UInt rN
= INSN(9,5);
5181 UInt rT2
= INSN(14,10);
5182 Long simm7
= (Long
)sx_to_64(INSN(21,15), 7);
5183 if ((bWBack
&& (rT1
== rN
|| rT2
== rN
) && rN
!= 31)
5185 /* undecodable; fall through */
5187 if (rN
== 31) { /* FIXME generate stack alignment check */ }
5189 // Compute the transfer address TA and the writeback address WA.
5190 IRTemp tRN
= newTemp(Ity_I64
);
5191 assign(tRN
, getIReg64orSP(rN
));
5192 IRTemp tEA
= newTemp(Ity_I64
);
5194 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm7
)));
5196 IRTemp tTA
= newTemp(Ity_I64
);
5197 IRTemp tWA
= newTemp(Ity_I64
);
5198 switch (INSN(24,23)) {
5200 assign(tTA
, mkexpr(tRN
)); assign(tWA
, mkexpr(tEA
)); break;
5202 assign(tTA
, mkexpr(tEA
)); assign(tWA
, mkexpr(tEA
)); break;
5204 assign(tTA
, mkexpr(tEA
)); /* tWA is unused */ break;
5206 vassert(0); /* NOTREACHED */
5209 // 32 bit load, sign extended to 64 bits
5210 putIReg64orZR(rT1
, unop(Iop_32Sto64
,
5211 loadLE(Ity_I32
, binop(Iop_Add64
,
5214 putIReg64orZR(rT2
, unop(Iop_32Sto64
,
5215 loadLE(Ity_I32
, binop(Iop_Add64
,
5219 putIReg64orSP(rN
, mkexpr(tEA
));
5221 const HChar
* fmt_str
= NULL
;
5222 switch (INSN(24,23)) {
5224 fmt_str
= "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5227 fmt_str
= "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5230 fmt_str
= "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5235 DIP(fmt_str
, nameIReg64orZR(rT1
),
5236 nameIReg64orZR(rT2
),
5237 nameIReg64orSP(rN
), simm7
);
5242 /* ---------------- LDR (literal, int reg) ---------------- */
5244 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)]
5245 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)]
5246 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5247 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)]
5248 Just handles the first two cases for now.
5250 if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5251 UInt imm19
= INSN(23,5);
5252 UInt rT
= INSN(4,0);
5253 UInt bX
= INSN(30,30);
5254 ULong ea
= guest_PC_curr_instr
+ sx_to_64(imm19
<< 2, 21);
5256 putIReg64orZR(rT
, loadLE(Ity_I64
, mkU64(ea
)));
5258 putIReg32orZR(rT
, loadLE(Ity_I32
, mkU64(ea
)));
5260 DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX
== 1, rT
), ea
);
5264 /* -------------- {LD,ST}R (integer register) --------------- */
5265 /* 31 29 20 15 12 11 9 4
5267 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R<m>{ext/sh}]
5268 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R<m>{ext/sh}]
5269 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5270 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5272 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R<m>{ext/sh}]
5273 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R<m>{ext/sh}]
5274 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R<m>{ext/sh}]
5275 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R<m>{ext/sh}]
5277 if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5278 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5280 UInt szLg2
= INSN(31,30);
5281 Bool isLD
= INSN(22,22) == 1;
5282 UInt tt
= INSN(4,0);
5283 IRTemp ea
= gen_indexed_EA(dis_buf
, insn
, True
/*to/from int regs*/);
5284 if (ea
!= IRTemp_INVALID
) {
5286 case 3: /* 64 bit */
5288 putIReg64orZR(tt
, loadLE(Ity_I64
, mkexpr(ea
)));
5289 DIP("ldr %s, %s\n", nameIReg64orZR(tt
), dis_buf
);
5291 storeLE(mkexpr(ea
), getIReg64orZR(tt
));
5292 DIP("str %s, %s\n", nameIReg64orZR(tt
), dis_buf
);
5295 case 2: /* 32 bit */
5297 putIReg32orZR(tt
, loadLE(Ity_I32
, mkexpr(ea
)));
5298 DIP("ldr %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5300 storeLE(mkexpr(ea
), getIReg32orZR(tt
));
5301 DIP("str %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5304 case 1: /* 16 bit */
5306 putIReg64orZR(tt
, unop(Iop_16Uto64
,
5307 loadLE(Ity_I16
, mkexpr(ea
))));
5308 DIP("ldruh %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5310 storeLE(mkexpr(ea
), unop(Iop_64to16
, getIReg64orZR(tt
)));
5311 DIP("strh %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5316 putIReg64orZR(tt
, unop(Iop_8Uto64
,
5317 loadLE(Ity_I8
, mkexpr(ea
))));
5318 DIP("ldrub %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5320 storeLE(mkexpr(ea
), unop(Iop_64to8
, getIReg64orZR(tt
)));
5321 DIP("strb %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5331 /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5332 /* 31 29 26 23 21 9 4
5333 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4]
5334 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2]
5335 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1]
5337 Rt is Wt when x==1, Xt when x==0
5339 if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5340 /* Further checks on bits 31:30 and 22 */
5342 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5344 case BITS3(0,1,0): case BITS3(0,1,1):
5345 case BITS3(0,0,0): case BITS3(0,0,1):
5350 UInt szLg2
= INSN(31,30);
5351 UInt bitX
= INSN(22,22);
5352 UInt imm12
= INSN(21,10);
5353 UInt nn
= INSN(9,5);
5354 UInt tt
= INSN(4,0);
5355 UInt szB
= 1 << szLg2
;
5356 IRExpr
* ea
= binop(Iop_Add64
,
5357 getIReg64orSP(nn
), mkU64(imm12
* szB
));
5361 putIReg64orZR(tt
, unop(Iop_32Sto64
, loadLE(Ity_I32
, ea
)));
5362 DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt
),
5363 nameIReg64orSP(nn
), imm12
* szB
);
5367 putIReg32orZR(tt
, unop(Iop_16Sto32
, loadLE(Ity_I16
, ea
)));
5369 putIReg64orZR(tt
, unop(Iop_16Sto64
, loadLE(Ity_I16
, ea
)));
5371 DIP("ldrsh %s, [%s, #%u]\n",
5372 nameIRegOrZR(bitX
== 0, tt
),
5373 nameIReg64orSP(nn
), imm12
* szB
);
5377 putIReg32orZR(tt
, unop(Iop_8Sto32
, loadLE(Ity_I8
, ea
)));
5379 putIReg64orZR(tt
, unop(Iop_8Sto64
, loadLE(Ity_I8
, ea
)));
5381 DIP("ldrsb %s, [%s, #%u]\n",
5382 nameIRegOrZR(bitX
== 0, tt
),
5383 nameIReg64orSP(nn
), imm12
* szB
);
5390 /* else fall through */
5393 /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5394 /* (at-Rn-then-Rn=EA)
5395 31 29 23 21 20 11 9 4
5396 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9
5397 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9
5398 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9
5401 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]!
5402 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]!
5403 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]!
5405 Rt is Wt when x==1, Xt when x==0
5406 transfer-at-Rn when [11]==0, at EA when [11]==1
5408 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5409 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5410 /* Further checks on bits 31:30 and 22 */
5412 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5413 case BITS3(1,0,0): // LDRSW Xt
5414 case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5415 case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5420 UInt szLg2
= INSN(31,30);
5421 UInt imm9
= INSN(20,12);
5422 Bool atRN
= INSN(11,11) == 0;
5423 UInt nn
= INSN(9,5);
5424 UInt tt
= INSN(4,0);
5425 IRTemp tRN
= newTemp(Ity_I64
);
5426 IRTemp tEA
= newTemp(Ity_I64
);
5427 IRTemp tTA
= IRTemp_INVALID
;
5428 ULong simm9
= sx_to_64(imm9
, 9);
5429 Bool is64
= INSN(22,22) == 0;
5430 assign(tRN
, getIReg64orSP(nn
));
5431 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm9
)));
5432 tTA
= atRN
? tRN
: tEA
;
5434 /* There are 5 cases:
5436 byte load, SX to 32, ZX to 64
5437 halfword load, SX to 64
5438 halfword load, SX to 32, ZX to 64
5440 The ifs below handle them in the listed order.
5445 putIReg64orZR(tt
, unop(Iop_8Sto64
,
5446 loadLE(Ity_I8
, mkexpr(tTA
))));
5448 putIReg32orZR(tt
, unop(Iop_8Sto32
,
5449 loadLE(Ity_I8
, mkexpr(tTA
))));
5452 else if (szLg2
== 1) {
5455 putIReg64orZR(tt
, unop(Iop_16Sto64
,
5456 loadLE(Ity_I16
, mkexpr(tTA
))));
5458 putIReg32orZR(tt
, unop(Iop_16Sto32
,
5459 loadLE(Ity_I16
, mkexpr(tTA
))));
5462 else if (szLg2
== 2 && is64
) {
5464 putIReg64orZR(tt
, unop(Iop_32Sto64
,
5465 loadLE(Ity_I32
, mkexpr(tTA
))));
5470 putIReg64orSP(nn
, mkexpr(tEA
));
5471 DIP(atRN
? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!",
5472 ch
, nameIRegOrZR(is64
, tt
), nameIReg64orSP(nn
), simm9
);
5475 /* else fall through */
5478 /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5479 /* 31 29 23 21 20 11 9 4
5480 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9]
5481 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9]
5482 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9]
5484 Rt is Wt when x==1, Xt when x==0
5486 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5487 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5488 /* Further checks on bits 31:30 and 22 */
5490 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5491 case BITS3(1,0,0): // LDURSW Xt
5492 case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5493 case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5498 UInt szLg2
= INSN(31,30);
5499 UInt imm9
= INSN(20,12);
5500 UInt nn
= INSN(9,5);
5501 UInt tt
= INSN(4,0);
5502 IRTemp tRN
= newTemp(Ity_I64
);
5503 IRTemp tEA
= newTemp(Ity_I64
);
5504 ULong simm9
= sx_to_64(imm9
, 9);
5505 Bool is64
= INSN(22,22) == 0;
5506 assign(tRN
, getIReg64orSP(nn
));
5507 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm9
)));
5509 /* There are 5 cases:
5511 byte load, SX to 32, ZX to 64
5512 halfword load, SX to 64
5513 halfword load, SX to 32, ZX to 64
5515 The ifs below handle them in the listed order.
5520 putIReg64orZR(tt
, unop(Iop_8Sto64
,
5521 loadLE(Ity_I8
, mkexpr(tEA
))));
5523 putIReg32orZR(tt
, unop(Iop_8Sto32
,
5524 loadLE(Ity_I8
, mkexpr(tEA
))));
5527 else if (szLg2
== 1) {
5530 putIReg64orZR(tt
, unop(Iop_16Sto64
,
5531 loadLE(Ity_I16
, mkexpr(tEA
))));
5533 putIReg32orZR(tt
, unop(Iop_16Sto32
,
5534 loadLE(Ity_I16
, mkexpr(tEA
))));
5537 else if (szLg2
== 2 && is64
) {
5539 putIReg64orZR(tt
, unop(Iop_32Sto64
,
5540 loadLE(Ity_I32
, mkexpr(tEA
))));
5545 DIP("ldurs%c %s, [%s, #%lld]\n",
5546 ch
, nameIRegOrZR(is64
, tt
), nameIReg64orSP(nn
), (Long
)simm9
);
5549 /* else fall through */
5552 /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5555 sz==00 => 32 bit (S) transfers
5556 sz==01 => 64 bit (D) transfers
5557 sz==10 => 128 bit (Q) transfers
5558 sz==11 isn't allowed
5559 simm7 is scaled by the (single-register) transfer size
5561 31 29 26 22 21 14 9 4
5563 sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5564 (at-EA, with nontemporal hint)
5566 sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
5569 sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
5572 sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5575 if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5576 UInt szSlg2
= INSN(31,30); // log2 of the xfer size in 32-bit units
5577 Bool isLD
= INSN(22,22) == 1;
5578 Bool wBack
= INSN(23,23) == 1;
5579 Long simm7
= (Long
)sx_to_64(INSN(21,15), 7);
5580 UInt tt2
= INSN(14,10);
5581 UInt nn
= INSN(9,5);
5582 UInt tt1
= INSN(4,0);
5583 if (szSlg2
== BITS2(1,1) || (isLD
&& tt1
== tt2
)) {
5584 /* undecodable; fall through */
5586 if (nn
== 31) { /* FIXME generate stack alignment check */ }
5588 // Compute the transfer address TA and the writeback address WA.
5589 UInt szB
= 4 << szSlg2
; /* szB is the per-register size */
5590 IRTemp tRN
= newTemp(Ity_I64
);
5591 assign(tRN
, getIReg64orSP(nn
));
5592 IRTemp tEA
= newTemp(Ity_I64
);
5593 simm7
= szB
* simm7
;
5594 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm7
)));
5596 IRTemp tTA
= newTemp(Ity_I64
);
5597 IRTemp tWA
= newTemp(Ity_I64
);
5598 switch (INSN(24,23)) {
5600 assign(tTA
, mkexpr(tRN
)); assign(tWA
, mkexpr(tEA
)); break;
5602 assign(tTA
, mkexpr(tEA
)); assign(tWA
, mkexpr(tEA
)); break;
5605 assign(tTA
, mkexpr(tEA
)); /* tWA is unused */ break;
5607 vassert(0); /* NOTREACHED */
5610 IRType ty
= Ity_INVALID
;
5612 case 4: ty
= Ity_F32
; break;
5613 case 8: ty
= Ity_F64
; break;
5614 case 16: ty
= Ity_V128
; break;
5615 default: vassert(0);
5618 /* Normally rN would be updated after the transfer. However, in
5619 the special cases typifed by
5620 stp q0, q1, [sp,#-512]!
5621 stp d0, d1, [sp,#-512]!
5622 stp s0, s1, [sp,#-512]!
5623 it is necessary to update SP before the transfer, (1)
5624 because Memcheck will otherwise complain about a write
5625 below the stack pointer, and (2) because the segfault
5626 stack extension mechanism will otherwise extend the stack
5627 only down to SP before the instruction, which might not be
5628 far enough, if the -512 bit takes the actual access
5629 address to the next page.
5632 = wBack
&& simm7
< 0
5633 && INSN(24,23) == BITS2(1,1) && nn
== 31 && !isLD
;
5635 if (wBack
&& earlyWBack
)
5636 putIReg64orSP(nn
, mkexpr(tEA
));
5640 putQReg128(tt1
, mkV128(0x0000));
5643 loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
), mkU64(0))));
5645 putQReg128(tt2
, mkV128(0x0000));
5648 loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
), mkU64(szB
))));
5650 storeLE(binop(Iop_Add64
, mkexpr(tTA
), mkU64(0)),
5651 getQRegLO(tt1
, ty
));
5652 storeLE(binop(Iop_Add64
, mkexpr(tTA
), mkU64(szB
)),
5653 getQRegLO(tt2
, ty
));
5656 if (wBack
&& !earlyWBack
)
5657 putIReg64orSP(nn
, mkexpr(tEA
));
5659 const HChar
* fmt_str
= NULL
;
5660 switch (INSN(24,23)) {
5662 fmt_str
= "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5665 fmt_str
= "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5668 fmt_str
= "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5671 fmt_str
= "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5676 DIP(fmt_str
, isLD
? "ld" : "st",
5677 nameQRegLO(tt1
, ty
), nameQRegLO(tt2
, ty
),
5678 nameIReg64orSP(nn
), simm7
);
5683 /* -------------- {LD,ST}R (vector register) --------------- */
5684 /* 31 29 23 20 15 12 11 9 4
5686 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R<m>{ext/sh}]
5687 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R<m>{ext/sh}]
5688 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R<m>{ext/sh}]
5689 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R<m>{ext/sh}]
5690 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R<m>{ext/sh}]
5692 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R<m>{ext/sh}]
5693 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R<m>{ext/sh}]
5694 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R<m>{ext/sh}]
5695 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R<m>{ext/sh}]
5696 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R<m>{ext/sh}]
5698 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5699 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5701 UInt szLg2
= (INSN(23,23) << 2) | INSN(31,30);
5702 Bool isLD
= INSN(22,22) == 1;
5703 UInt tt
= INSN(4,0);
5704 if (szLg2
> 4) goto after_LDR_STR_vector_register
;
5705 IRTemp ea
= gen_indexed_EA(dis_buf
, insn
, False
/*to/from vec regs*/);
5706 if (ea
== IRTemp_INVALID
) goto after_LDR_STR_vector_register
;
5710 putQReg128(tt
, mkV128(0x0000));
5711 putQRegLO(tt
, loadLE(Ity_I8
, mkexpr(ea
)));
5712 DIP("ldr %s, %s\n", nameQRegLO(tt
, Ity_I8
), dis_buf
);
5714 storeLE(mkexpr(ea
), getQRegLO(tt
, Ity_I8
));
5715 DIP("str %s, %s\n", nameQRegLO(tt
, Ity_I8
), dis_buf
);
5720 putQReg128(tt
, mkV128(0x0000));
5721 putQRegLO(tt
, loadLE(Ity_I16
, mkexpr(ea
)));
5722 DIP("ldr %s, %s\n", nameQRegLO(tt
, Ity_I16
), dis_buf
);
5724 storeLE(mkexpr(ea
), getQRegLO(tt
, Ity_I16
));
5725 DIP("str %s, %s\n", nameQRegLO(tt
, Ity_I16
), dis_buf
);
5728 case 2: /* 32 bit */
5730 putQReg128(tt
, mkV128(0x0000));
5731 putQRegLO(tt
, loadLE(Ity_I32
, mkexpr(ea
)));
5732 DIP("ldr %s, %s\n", nameQRegLO(tt
, Ity_I32
), dis_buf
);
5734 storeLE(mkexpr(ea
), getQRegLO(tt
, Ity_I32
));
5735 DIP("str %s, %s\n", nameQRegLO(tt
, Ity_I32
), dis_buf
);
5738 case 3: /* 64 bit */
5740 putQReg128(tt
, mkV128(0x0000));
5741 putQRegLO(tt
, loadLE(Ity_I64
, mkexpr(ea
)));
5742 DIP("ldr %s, %s\n", nameQRegLO(tt
, Ity_I64
), dis_buf
);
5744 storeLE(mkexpr(ea
), getQRegLO(tt
, Ity_I64
));
5745 DIP("str %s, %s\n", nameQRegLO(tt
, Ity_I64
), dis_buf
);
5750 putQReg128(tt
, loadLE(Ity_V128
, mkexpr(ea
)));
5751 DIP("ldr %s, %s\n", nameQReg128(tt
), dis_buf
);
5753 storeLE(mkexpr(ea
), getQReg128(tt
));
5754 DIP("str %s, %s\n", nameQReg128(tt
), dis_buf
);
5762 after_LDR_STR_vector_register
:
5764 /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5765 /* 31 29 22 20 15 12 11 9 4
5767 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5769 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5770 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5772 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5773 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5775 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5776 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5778 UInt szLg2
= INSN(31,30);
5779 Bool sxTo64
= INSN(22,22) == 0; // else sx to 32 and zx to 64
5780 UInt tt
= INSN(4,0);
5781 if (szLg2
== 3) goto after_LDRS_integer_register
;
5782 IRTemp ea
= gen_indexed_EA(dis_buf
, insn
, True
/*to/from int regs*/);
5783 if (ea
== IRTemp_INVALID
) goto after_LDRS_integer_register
;
5784 /* Enumerate the 5 variants explicitly. */
5785 if (szLg2
== 2/*32 bit*/ && sxTo64
) {
5786 putIReg64orZR(tt
, unop(Iop_32Sto64
, loadLE(Ity_I32
, mkexpr(ea
))));
5787 DIP("ldrsw %s, %s\n", nameIReg64orZR(tt
), dis_buf
);
5791 if (szLg2
== 1/*16 bit*/) {
5793 putIReg64orZR(tt
, unop(Iop_16Sto64
, loadLE(Ity_I16
, mkexpr(ea
))));
5794 DIP("ldrsh %s, %s\n", nameIReg64orZR(tt
), dis_buf
);
5796 putIReg32orZR(tt
, unop(Iop_16Sto32
, loadLE(Ity_I16
, mkexpr(ea
))));
5797 DIP("ldrsh %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5802 if (szLg2
== 0/*8 bit*/) {
5804 putIReg64orZR(tt
, unop(Iop_8Sto64
, loadLE(Ity_I8
, mkexpr(ea
))));
5805 DIP("ldrsb %s, %s\n", nameIReg64orZR(tt
), dis_buf
);
5807 putIReg32orZR(tt
, unop(Iop_8Sto32
, loadLE(Ity_I8
, mkexpr(ea
))));
5808 DIP("ldrsb %s, %s\n", nameIReg32orZR(tt
), dis_buf
);
5812 /* else it's an invalid combination */
5814 after_LDRS_integer_register
:
5816 /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5817 /* This is the Unsigned offset variant only. The Post-Index and
5818 Pre-Index variants are below.
5821 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1]
5822 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2]
5823 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4]
5824 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8]
5825 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16]
5827 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1]
5828 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2]
5829 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4]
5830 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8]
5831 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16]
5833 if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5834 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5835 UInt szLg2
= (INSN(23,23) << 2) | INSN(31,30);
5836 Bool isLD
= INSN(22,22) == 1;
5837 UInt pimm12
= INSN(21,10) << szLg2
;
5838 UInt nn
= INSN(9,5);
5839 UInt tt
= INSN(4,0);
5840 IRTemp tEA
= newTemp(Ity_I64
);
5841 IRType ty
= preferredVectorSubTypeFromSize(1 << szLg2
);
5842 assign(tEA
, binop(Iop_Add64
, getIReg64orSP(nn
), mkU64(pimm12
)));
5845 putQReg128(tt
, mkV128(0x0000));
5847 putQRegLO(tt
, loadLE(ty
, mkexpr(tEA
)));
5849 storeLE(mkexpr(tEA
), getQRegLO(tt
, ty
));
5851 DIP("%s %s, [%s, #%u]\n",
5852 isLD
? "ldr" : "str",
5853 nameQRegLO(tt
, ty
), nameIReg64orSP(nn
), pimm12
);
5857 /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5858 /* These are the Post-Index and Pre-Index variants.
5862 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm
5863 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm
5864 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm
5865 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm
5866 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm
5869 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]!
5870 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]!
5871 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]!
5872 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]!
5873 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]!
5875 Stores are the same except with bit 22 set to 0.
5877 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5878 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5879 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5880 UInt szLg2
= (INSN(23,23) << 2) | INSN(31,30);
5881 Bool isLD
= INSN(22,22) == 1;
5882 UInt imm9
= INSN(20,12);
5883 Bool atRN
= INSN(11,11) == 0;
5884 UInt nn
= INSN(9,5);
5885 UInt tt
= INSN(4,0);
5886 IRTemp tRN
= newTemp(Ity_I64
);
5887 IRTemp tEA
= newTemp(Ity_I64
);
5888 IRTemp tTA
= IRTemp_INVALID
;
5889 IRType ty
= preferredVectorSubTypeFromSize(1 << szLg2
);
5890 ULong simm9
= sx_to_64(imm9
, 9);
5891 assign(tRN
, getIReg64orSP(nn
));
5892 assign(tEA
, binop(Iop_Add64
, mkexpr(tRN
), mkU64(simm9
)));
5893 tTA
= atRN
? tRN
: tEA
;
5895 /* Do early writeback for the cases typified by
5897 str d10, [sp, #-128]!
5899 for the same reasons as described in a similar comment in the
5900 "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
5903 = !atRN
&& !isLD
&& (ty
== Ity_F64
|| ty
== Ity_V128
)
5904 && nn
== 31 && ((Long
)simm9
) < 0;
5907 putIReg64orSP(nn
, mkexpr(tEA
));
5911 putQReg128(tt
, mkV128(0x0000));
5913 putQRegLO(tt
, loadLE(ty
, mkexpr(tTA
)));
5915 storeLE(mkexpr(tTA
), getQRegLO(tt
, ty
));
5919 putIReg64orSP(nn
, mkexpr(tEA
));
5921 DIP(atRN
? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5922 isLD
? "ldr" : "str",
5923 nameQRegLO(tt
, ty
), nameIReg64orSP(nn
), (Long
)simm9
);
   /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
   /* 31 29      23 20   11 9 4
      00 111 100 01 0 imm9 00 n t   LDR Bt, [Xn|SP, #simm]
      01 111 100 01 0 imm9 00 n t   LDR Ht, [Xn|SP, #simm]
      10 111 100 01 0 imm9 00 n t   LDR St, [Xn|SP, #simm]
      11 111 100 01 0 imm9 00 n t   LDR Dt, [Xn|SP, #simm]
      00 111 100 11 0 imm9 00 n t   LDR Qt, [Xn|SP, #simm]

      00 111 100 00 0 imm9 00 n t   STR Bt, [Xn|SP, #simm]
      01 111 100 00 0 imm9 00 n t   STR Ht, [Xn|SP, #simm]
      10 111 100 00 0 imm9 00 n t   STR St, [Xn|SP, #simm]
      11 111 100 00 0 imm9 00 n t   STR Dt, [Xn|SP, #simm]
      00 111 100 10 0 imm9 00 n t   STR Qt, [Xn|SP, #simm]
   */
   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
       && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
      UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
      Bool   isLD  = INSN(22,22) == 1;
      UInt   imm9  = INSN(20,12);
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);
      ULong  simm9 = sx_to_64(imm9, 9);
      IRTemp tEA   = newTemp(Ity_I64);
      IRType ty    = preferredVectorSubTypeFromSize(1 << szLg2);
      assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
      if (isLD) {
         if (szLg2 < 4) {
            putQReg128(tt, mkV128(0x0000));
         }
         putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
      } else {
         storeLE(mkexpr(tEA), getQRegLO(tt, ty));
      }
      DIP("%s %s, [%s, #%lld]\n",
          isLD ? "ldur" : "stur",
          nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
   /* ---------------- LDR (literal, SIMD&FP) ---------------- */
   /* 00 011 100 imm19 t   LDR St, [PC + sxTo64(imm19 << 2)]
      01 011 100 imm19 t   LDR Dt, [PC + sxTo64(imm19 << 2)]
      10 011 100 imm19 t   LDR Qt, [PC + sxTo64(imm19 << 2)]
   */
   if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
      UInt   szB   = 4 << INSN(31,30);
      UInt   imm19 = INSN(23,5);
      UInt   tt    = INSN(4,0);
      ULong  ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
      IRType ty    = preferredVectorSubTypeFromSize(szB);
      putQReg128(tt, mkV128(0x0000));
      putQRegLO(tt, loadLE(ty, mkU64(ea)));
      DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5985 /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */
5986 /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
5987 /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
5988 /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
5989 /* 31 29 26 22 21 20 15 11 9 4
5991 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
5992 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
5994 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
5995 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
5997 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
5998 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
6000 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
6001 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
6003 T = defined by Q and sz in the normal way
6004 step = if m == 11111 then transfer-size else Xm
6005 xx = case L of 1 -> LD ; 0 -> ST
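   /* Worked example (illustrative, not from the original source):
      "ld4 {v0.16b-v3.16b}, [x2], #64" has q == 1, L == 1, opcode 0000
      and sz == 00, so nRegs == 4 and T == 16b; in the post-index form
      with m == 11111 the base register advances by the transfer size,
      4 regs * 16 bytes == 64. */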
6007 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6008 && INSN(21,21) == 0) {
6009 Bool bitQ
= INSN(30,30);
6010 Bool isPX
= INSN(23,23) == 1;
6011 Bool isLD
= INSN(22,22) == 1;
6012 UInt mm
= INSN(20,16);
6013 UInt opc
= INSN(15,12);
6014 UInt sz
= INSN(11,10);
6015 UInt nn
= INSN(9,5);
6016 UInt tt
= INSN(4,0);
6017 Bool isQ
= bitQ
== 1;
6018 Bool is1d
= sz
== BITS2(1,1) && !isQ
;
6021 case BITS4(0,0,0,0): nRegs
= 4; break;
6022 case BITS4(0,1,0,0): nRegs
= 3; break;
6023 case BITS4(1,0,0,0): nRegs
= 2; break;
6024 case BITS4(0,1,1,1): nRegs
= 1; break;
6028 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6029 If we see it, set nRegs to 0 so as to cause the next conditional
6031 if (!isPX
&& mm
!= 0)
6034 if (nRegs
== 1 /* .1d is allowed */
6035 || (nRegs
>= 2 && nRegs
<= 4 && !is1d
) /* .1d is not allowed */) {
6037 UInt xferSzB
= (isQ
? 16 : 8) * nRegs
;
6039 /* Generate the transfer address (TA) and if necessary the
6040 writeback address (WB) */
6041 IRTemp tTA
= newTemp(Ity_I64
);
6042 assign(tTA
, getIReg64orSP(nn
));
6043 if (nn
== 31) { /* FIXME generate stack alignment check */ }
6044 IRTemp tWB
= IRTemp_INVALID
;
6046 tWB
= newTemp(Ity_I64
);
6047 assign(tWB
, binop(Iop_Add64
,
6049 mm
== BITS5(1,1,1,1,1) ? mkU64(xferSzB
)
6050 : getIReg64orZR(mm
)));
6053 /* -- BEGIN generate the transfers -- */
6055 IRTemp u0
, u1
, u2
, u3
, i0
, i1
, i2
, i3
;
6056 u0
= u1
= u2
= u3
= i0
= i1
= i2
= i3
= IRTemp_INVALID
;
6058 case 4: u3
= newTempV128(); i3
= newTempV128(); /* fallthru */
6059 case 3: u2
= newTempV128(); i2
= newTempV128(); /* fallthru */
6060 case 2: u1
= newTempV128(); i1
= newTempV128(); /* fallthru */
6061 case 1: u0
= newTempV128(); i0
= newTempV128(); break;
6062 default: vassert(0);
6065 /* -- Multiple 128 or 64 bit stores -- */
6068 case 4: assign(u3
, getQReg128((tt
+3) % 32)); /* fallthru */
6069 case 3: assign(u2
, getQReg128((tt
+2) % 32)); /* fallthru */
6070 case 2: assign(u1
, getQReg128((tt
+1) % 32)); /* fallthru */
6071 case 1: assign(u0
, getQReg128((tt
+0) % 32)); break;
6072 default: vassert(0);
6075 case 4: (isQ
? math_INTERLEAVE4_128
: math_INTERLEAVE4_64
)
6076 (&i0
, &i1
, &i2
, &i3
, sz
, u0
, u1
, u2
, u3
);
6078 case 3: (isQ
? math_INTERLEAVE3_128
: math_INTERLEAVE3_64
)
6079 (&i0
, &i1
, &i2
, sz
, u0
, u1
, u2
);
6081 case 2: (isQ
? math_INTERLEAVE2_128
: math_INTERLEAVE2_64
)
6082 (&i0
, &i1
, sz
, u0
, u1
);
6084 case 1: (isQ
? math_INTERLEAVE1_128
: math_INTERLEAVE1_64
)
6087 default: vassert(0);
6089 # define MAYBE_NARROW_TO_64(_expr) \
6090 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6091 UInt step
= isQ
? 16 : 8;
6093 case 4: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(3*step
)),
6094 MAYBE_NARROW_TO_64(mkexpr(i3
)) );
6096 case 3: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(2*step
)),
6097 MAYBE_NARROW_TO_64(mkexpr(i2
)) );
6099 case 2: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(1*step
)),
6100 MAYBE_NARROW_TO_64(mkexpr(i1
)) );
6102 case 1: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(0*step
)),
6103 MAYBE_NARROW_TO_64(mkexpr(i0
)) );
6105 default: vassert(0);
6107 # undef MAYBE_NARROW_TO_64
6110 /* -- Multiple 128 or 64 bit loads -- */
6112 UInt step
= isQ
? 16 : 8;
6113 IRType loadTy
= isQ
? Ity_V128
: Ity_I64
;
6114 # define MAYBE_WIDEN_FROM_64(_expr) \
6115 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6118 assign(i3
, MAYBE_WIDEN_FROM_64(
6120 binop(Iop_Add64
, mkexpr(tTA
),
6121 mkU64(3 * step
)))));
6124 assign(i2
, MAYBE_WIDEN_FROM_64(
6126 binop(Iop_Add64
, mkexpr(tTA
),
6127 mkU64(2 * step
)))));
6130 assign(i1
, MAYBE_WIDEN_FROM_64(
6132 binop(Iop_Add64
, mkexpr(tTA
),
6133 mkU64(1 * step
)))));
6136 assign(i0
, MAYBE_WIDEN_FROM_64(
6138 binop(Iop_Add64
, mkexpr(tTA
),
6139 mkU64(0 * step
)))));
6144 # undef MAYBE_WIDEN_FROM_64
6146 case 4: (isQ
? math_DEINTERLEAVE4_128
: math_DEINTERLEAVE4_64
)
6147 (&u0
, &u1
, &u2
, &u3
, sz
, i0
,i1
,i2
,i3
);
6149 case 3: (isQ
? math_DEINTERLEAVE3_128
: math_DEINTERLEAVE3_64
)
6150 (&u0
, &u1
, &u2
, sz
, i0
, i1
, i2
);
6152 case 2: (isQ
? math_DEINTERLEAVE2_128
: math_DEINTERLEAVE2_64
)
6153 (&u0
, &u1
, sz
, i0
, i1
);
6155 case 1: (isQ
? math_DEINTERLEAVE1_128
: math_DEINTERLEAVE1_64
)
6158 default: vassert(0);
6161 case 4: putQReg128( (tt
+3) % 32,
6162 math_MAYBE_ZERO_HI64(bitQ
, u3
));
6164 case 3: putQReg128( (tt
+2) % 32,
6165 math_MAYBE_ZERO_HI64(bitQ
, u2
));
6167 case 2: putQReg128( (tt
+1) % 32,
6168 math_MAYBE_ZERO_HI64(bitQ
, u1
));
6170 case 1: putQReg128( (tt
+0) % 32,
6171 math_MAYBE_ZERO_HI64(bitQ
, u0
));
6173 default: vassert(0);
6177 /* -- END generate the transfers -- */
6179 /* Do the writeback, if necessary */
6181 putIReg64orSP(nn
, mkexpr(tWB
));
6185 pxStr
[0] = pxStr
[sizeof(pxStr
)-1] = 0;
6187 if (mm
== BITS5(1,1,1,1,1))
6188 vex_sprintf(pxStr
, ", #%u", xferSzB
);
6190 vex_sprintf(pxStr
, ", %s", nameIReg64orZR(mm
));
6192 const HChar
* arr
= nameArr_Q_SZ(bitQ
, sz
);
6193 DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6194 isLD
? "ld" : "st", nRegs
,
6195 (tt
+0) % 32, arr
, (tt
+nRegs
-1) % 32, arr
, nameIReg64orSP(nn
),
6199 dres
->hint
= Dis_HintVerbose
;
6203 /* else fall through */
6206 /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs ------ */
6207 /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs ------ */
6208 /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs ------ */
6209 /* 31 29 26 22 21 20 15 11 9 4
6211 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
6212 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
6214 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
6215 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
6217 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
6218 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
6220 T = defined by Q and sz in the normal way
6221 step = if m == 11111 then transfer-size else Xm
6222 xx = case L of 1 -> LD ; 0 -> ST
6224 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6225 && INSN(21,21) == 0) {
6226 Bool bitQ
= INSN(30,30);
6227 Bool isPX
= INSN(23,23) == 1;
6228 Bool isLD
= INSN(22,22) == 1;
6229 UInt mm
= INSN(20,16);
6230 UInt opc
= INSN(15,12);
6231 UInt sz
= INSN(11,10);
6232 UInt nn
= INSN(9,5);
6233 UInt tt
= INSN(4,0);
6234 Bool isQ
= bitQ
== 1;
6237 case BITS4(0,0,1,0): nRegs
= 4; break;
6238 case BITS4(0,1,1,0): nRegs
= 3; break;
6239 case BITS4(1,0,1,0): nRegs
= 2; break;
6243 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6244 If we see it, set nRegs to 0 so as to cause the next conditional
6246 if (!isPX
&& mm
!= 0)
6249 if (nRegs
>= 2 && nRegs
<= 4) {
6251 UInt xferSzB
= (isQ
? 16 : 8) * nRegs
;
6253 /* Generate the transfer address (TA) and if necessary the
6254 writeback address (WB) */
6255 IRTemp tTA
= newTemp(Ity_I64
);
6256 assign(tTA
, getIReg64orSP(nn
));
6257 if (nn
== 31) { /* FIXME generate stack alignment check */ }
6258 IRTemp tWB
= IRTemp_INVALID
;
6260 tWB
= newTemp(Ity_I64
);
6261 assign(tWB
, binop(Iop_Add64
,
6263 mm
== BITS5(1,1,1,1,1) ? mkU64(xferSzB
)
6264 : getIReg64orZR(mm
)));
6267 /* -- BEGIN generate the transfers -- */
6269 IRTemp u0
, u1
, u2
, u3
;
6270 u0
= u1
= u2
= u3
= IRTemp_INVALID
;
6272 case 4: u3
= newTempV128(); /* fallthru */
6273 case 3: u2
= newTempV128(); /* fallthru */
6274 case 2: u1
= newTempV128();
6275 u0
= newTempV128(); break;
6276 default: vassert(0);
6279 /* -- Multiple 128 or 64 bit stores -- */
6282 case 4: assign(u3
, getQReg128((tt
+3) % 32)); /* fallthru */
6283 case 3: assign(u2
, getQReg128((tt
+2) % 32)); /* fallthru */
6284 case 2: assign(u1
, getQReg128((tt
+1) % 32));
6285 assign(u0
, getQReg128((tt
+0) % 32)); break;
6286 default: vassert(0);
6288 # define MAYBE_NARROW_TO_64(_expr) \
6289 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6290 UInt step
= isQ
? 16 : 8;
6292 case 4: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(3*step
)),
6293 MAYBE_NARROW_TO_64(mkexpr(u3
)) );
6295 case 3: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(2*step
)),
6296 MAYBE_NARROW_TO_64(mkexpr(u2
)) );
6298 case 2: storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(1*step
)),
6299 MAYBE_NARROW_TO_64(mkexpr(u1
)) );
6300 storeLE( binop(Iop_Add64
, mkexpr(tTA
), mkU64(0*step
)),
6301 MAYBE_NARROW_TO_64(mkexpr(u0
)) );
6303 default: vassert(0);
6305 # undef MAYBE_NARROW_TO_64
6308 /* -- Multiple 128 or 64 bit loads -- */
6310 UInt step
= isQ
? 16 : 8;
6311 IRType loadTy
= isQ
? Ity_V128
: Ity_I64
;
6312 # define MAYBE_WIDEN_FROM_64(_expr) \
6313 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6316 assign(u3
, MAYBE_WIDEN_FROM_64(
6318 binop(Iop_Add64
, mkexpr(tTA
),
6319 mkU64(3 * step
)))));
6322 assign(u2
, MAYBE_WIDEN_FROM_64(
6324 binop(Iop_Add64
, mkexpr(tTA
),
6325 mkU64(2 * step
)))));
6328 assign(u1
, MAYBE_WIDEN_FROM_64(
6330 binop(Iop_Add64
, mkexpr(tTA
),
6331 mkU64(1 * step
)))));
6332 assign(u0
, MAYBE_WIDEN_FROM_64(
6334 binop(Iop_Add64
, mkexpr(tTA
),
6335 mkU64(0 * step
)))));
6340 # undef MAYBE_WIDEN_FROM_64
6342 case 4: putQReg128( (tt
+3) % 32,
6343 math_MAYBE_ZERO_HI64(bitQ
, u3
));
6345 case 3: putQReg128( (tt
+2) % 32,
6346 math_MAYBE_ZERO_HI64(bitQ
, u2
));
6348 case 2: putQReg128( (tt
+1) % 32,
6349 math_MAYBE_ZERO_HI64(bitQ
, u1
));
6350 putQReg128( (tt
+0) % 32,
6351 math_MAYBE_ZERO_HI64(bitQ
, u0
));
6353 default: vassert(0);
6357 /* -- END generate the transfers -- */
6359 /* Do the writeback, if necessary */
6361 putIReg64orSP(nn
, mkexpr(tWB
));
6365 pxStr
[0] = pxStr
[sizeof(pxStr
)-1] = 0;
6367 if (mm
== BITS5(1,1,1,1,1))
6368 vex_sprintf(pxStr
, ", #%u", xferSzB
);
6370 vex_sprintf(pxStr
, ", %s", nameIReg64orZR(mm
));
6372 const HChar
* arr
= nameArr_Q_SZ(bitQ
, sz
);
6373 DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6375 (tt
+0) % 32, arr
, (tt
+nRegs
-1) % 32, arr
, nameIReg64orSP(nn
),
6380 /* else fall through */
6383 /* ---------- LD1R (single structure, replicate) ---------- */
6384 /* ---------- LD2R (single structure, replicate) ---------- */
6385 /* ---------- LD3R (single structure, replicate) ---------- */
6386 /* ---------- LD4R (single structure, replicate) ---------- */
6387 /* 31 29 22 20 15 11 9 4
6388 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
6389 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
6391 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
6392 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
6394 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
6395 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
6397 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
6398 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
6400 step = if m == 11111 then transfer-size else Xm
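   /* Worked example (illustrative, not from the original source):
      "ld2r {v8.8h, v9.8h}, [x0], #4" has bit 21 set and bit 13 clear,
      so nRegs == 2, and sz == 01 gives a 2-byte lane; each loaded
      halfword is replicated across all 8 lanes of its register, and
      with m == 11111 the post-index step is the transfer size,
      2 regs * 2 bytes == 4. */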
6402 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6403 && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6404 && INSN(12,12) == 0) {
6405 UInt bitQ
= INSN(30,30);
6406 Bool isPX
= INSN(23,23) == 1;
6407 UInt nRegs
= ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6408 UInt mm
= INSN(20,16);
6409 UInt sz
= INSN(11,10);
6410 UInt nn
= INSN(9,5);
6411 UInt tt
= INSN(4,0);
6413 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6414 if (isPX
|| mm
== 0) {
6416 IRType ty
= integerIRTypeOfSize(1 << sz
);
6418 UInt laneSzB
= 1 << sz
;
6419 UInt xferSzB
= laneSzB
* nRegs
;
6421 /* Generate the transfer address (TA) and if necessary the
6422 writeback address (WB) */
6423 IRTemp tTA
= newTemp(Ity_I64
);
6424 assign(tTA
, getIReg64orSP(nn
));
6425 if (nn
== 31) { /* FIXME generate stack alignment check */ }
6426 IRTemp tWB
= IRTemp_INVALID
;
6428 tWB
= newTemp(Ity_I64
);
6429 assign(tWB
, binop(Iop_Add64
,
6431 mm
== BITS5(1,1,1,1,1) ? mkU64(xferSzB
)
6432 : getIReg64orZR(mm
)));
6435 /* Do the writeback, if necessary */
6437 putIReg64orSP(nn
, mkexpr(tWB
));
6440 IRTemp e0
, e1
, e2
, e3
, v0
, v1
, v2
, v3
;
6441 e0
= e1
= e2
= e3
= v0
= v1
= v2
= v3
= IRTemp_INVALID
;
6445 assign(e3
, loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
),
6446 mkU64(3 * laneSzB
))));
6447 v3
= math_DUP_TO_V128(e3
, ty
);
6448 putQReg128((tt
+3) % 32, math_MAYBE_ZERO_HI64(bitQ
, v3
));
6452 assign(e2
, loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
),
6453 mkU64(2 * laneSzB
))));
6454 v2
= math_DUP_TO_V128(e2
, ty
);
6455 putQReg128((tt
+2) % 32, math_MAYBE_ZERO_HI64(bitQ
, v2
));
6459 assign(e1
, loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
),
6460 mkU64(1 * laneSzB
))));
6461 v1
= math_DUP_TO_V128(e1
, ty
);
6462 putQReg128((tt
+1) % 32, math_MAYBE_ZERO_HI64(bitQ
, v1
));
6466 assign(e0
, loadLE(ty
, binop(Iop_Add64
, mkexpr(tTA
),
6467 mkU64(0 * laneSzB
))));
6468 v0
= math_DUP_TO_V128(e0
, ty
);
6469 putQReg128((tt
+0) % 32, math_MAYBE_ZERO_HI64(bitQ
, v0
));
6476 pxStr
[0] = pxStr
[sizeof(pxStr
)-1] = 0;
6478 if (mm
== BITS5(1,1,1,1,1))
6479 vex_sprintf(pxStr
, ", #%u", xferSzB
);
6481 vex_sprintf(pxStr
, ", %s", nameIReg64orZR(mm
));
6483 const HChar
* arr
= nameArr_Q_SZ(bitQ
, sz
);
6484 DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6486 (tt
+0) % 32, arr
, (tt
+nRegs
-1) % 32, arr
, nameIReg64orSP(nn
),
6491 /* else fall through */
6494 /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6495 /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6496 /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6497 /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6498 /* 31 29 22 21 20 15 11 9 4
6499 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
6500 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
6502 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
6503 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
6505 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
6506 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
6508 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
6509 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
6511 step = if m == 11111 then transfer-size else Xm
6512 op = case L of 1 -> LD ; 0 -> ST
6514 laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
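   /* Worked example (illustrative, not from the original source): for
      a 32-bit lane the field pattern is xx == 10 and sz == 00 with the
      lane index taken from q:S, so "st1 {v4.s}[2], [x0]" has q == 1,
      S == 0, hence xx_q_S_sz == 0x28, laneSzB == 4 and ix == 2. */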
6519 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6520 UInt bitQ
= INSN(30,30);
6521 Bool isPX
= INSN(23,23) == 1;
6522 Bool isLD
= INSN(22,22) == 1;
6523 UInt nRegs
= ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6524 UInt mm
= INSN(20,16);
6525 UInt xx
= INSN(15,14);
6526 UInt bitS
= INSN(12,12);
6527 UInt sz
= INSN(11,10);
6528 UInt nn
= INSN(9,5);
6529 UInt tt
= INSN(4,0);
6533 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6534 if (!isPX
&& mm
!= 0)
6537 UInt laneSzB
= 0; /* invalid */
6538 UInt ix
= 16; /* invalid */
6540 UInt xx_q_S_sz
= (xx
<< 4) | (bitQ
<< 3) | (bitS
<< 2) | sz
;
6541 switch (xx_q_S_sz
) {
6542 case 0x00: case 0x01: case 0x02: case 0x03:
6543 case 0x04: case 0x05: case 0x06: case 0x07:
6544 case 0x08: case 0x09: case 0x0A: case 0x0B:
6545 case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6546 laneSzB
= 1; ix
= xx_q_S_sz
& 0xF;
6548 case 0x10: case 0x12: case 0x14: case 0x16:
6549 case 0x18: case 0x1A: case 0x1C: case 0x1E:
6550 laneSzB
= 2; ix
= (xx_q_S_sz
>> 1) & 7;
6552 case 0x20: case 0x24: case 0x28: case 0x2C:
6553 laneSzB
= 4; ix
= (xx_q_S_sz
>> 2) & 3;
6555 case 0x21: case 0x29:
6556 laneSzB
= 8; ix
= (xx_q_S_sz
>> 3) & 1;
6562 if (valid
&& laneSzB
!= 0) {
6564 IRType ty
= integerIRTypeOfSize(laneSzB
);
6565 UInt xferSzB
= laneSzB
* nRegs
;
6567 /* Generate the transfer address (TA) and if necessary the
6568 writeback address (WB) */
6569 IRTemp tTA
= newTemp(Ity_I64
);
6570 assign(tTA
, getIReg64orSP(nn
));
6571 if (nn
== 31) { /* FIXME generate stack alignment check */ }
6572 IRTemp tWB
= IRTemp_INVALID
;
6574 tWB
= newTemp(Ity_I64
);
6575 assign(tWB
, binop(Iop_Add64
,
6577 mm
== BITS5(1,1,1,1,1) ? mkU64(xferSzB
)
6578 : getIReg64orZR(mm
)));
6581 /* Do the writeback, if necessary */
6583 putIReg64orSP(nn
, mkexpr(tWB
));
6589 = binop(Iop_Add64
, mkexpr(tTA
), mkU64(3 * laneSzB
));
6591 putQRegLane((tt
+3) % 32, ix
, loadLE(ty
, addr
));
6593 storeLE(addr
, getQRegLane((tt
+3) % 32, ix
, ty
));
6599 = binop(Iop_Add64
, mkexpr(tTA
), mkU64(2 * laneSzB
));
6601 putQRegLane((tt
+2) % 32, ix
, loadLE(ty
, addr
));
6603 storeLE(addr
, getQRegLane((tt
+2) % 32, ix
, ty
));
6609 = binop(Iop_Add64
, mkexpr(tTA
), mkU64(1 * laneSzB
));
6611 putQRegLane((tt
+1) % 32, ix
, loadLE(ty
, addr
));
6613 storeLE(addr
, getQRegLane((tt
+1) % 32, ix
, ty
));
6619 = binop(Iop_Add64
, mkexpr(tTA
), mkU64(0 * laneSzB
));
6621 putQRegLane((tt
+0) % 32, ix
, loadLE(ty
, addr
));
6623 storeLE(addr
, getQRegLane((tt
+0) % 32, ix
, ty
));
6632 pxStr
[0] = pxStr
[sizeof(pxStr
)-1] = 0;
6634 if (mm
== BITS5(1,1,1,1,1))
6635 vex_sprintf(pxStr
, ", #%u", xferSzB
);
6637 vex_sprintf(pxStr
, ", %s", nameIReg64orZR(mm
));
6639 const HChar
* arr
= nameArr_Q_SZ(bitQ
, sz
);
6640 DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6641 isLD
? "ld" : "st", nRegs
,
6642 (tt
+0) % 32, arr
, (tt
+nRegs
-1) % 32, arr
,
6643 ix
, nameIReg64orSP(nn
), pxStr
);
6647 /* else fall through */
6650 /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6651 /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6652 /* 31 29 23 20 14 9 4
6653 sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP]
6654 sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP]
6655 sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP]
6656 sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6658 /* For the "standard" implementation we pass through the LL and SC to
6659 the host. For the "fallback" implementation, for details see
6660 https://bugs.kde.org/show_bug.cgi?id=344524 and
6661 https://bugs.kde.org/show_bug.cgi?id=369459,
6665 gs.LLsize = load_size // 1, 2, 4 or 8
6667 gs.LLdata = zeroExtend(*addr)
6669 StoreCond(addr, data)
6670 tmp_LLsize = gs.LLsize
6671 gs.LLsize = 0 // "no transaction"
6672 if tmp_LLsize != store_size -> fail
6673 if addr != gs.LLaddr -> fail
6674 if zeroExtend(*addr) != gs.LLdata -> fail
6675 cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6679 When thread scheduled
6680 gs.LLsize = 0 // "no transaction"
6681 (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
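   /* Illustrative example (not part of the original comments): a
      typical guest retry loop such as

         retry: ldaxr w1, [x0]
                add   w1, w1, #1
                stlxr w2, w1, [x0]
                cbnz  w2, retry

      maps onto this fallback scheme as follows.  The LDAXR records the
      access size, address and loaded value in LLsize/LLaddr/LLdata.
      The STLXR first clears LLsize, then re-checks size, address and
      current memory contents against what was recorded, and only then
      attempts a real CAS.  Any mismatch -- including LLsize having been
      zeroed by an intervening thread switch -- makes the STLXR report
      failure in w2, so the guest loop simply retries. */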
6684 if (INSN(29,24) == BITS6(0,0,1,0,0,0)
6685 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6686 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6687 UInt szBlg2
= INSN(31,30);
6688 Bool isLD
= INSN(22,22) == 1;
6689 Bool isAcqOrRel
= INSN(15,15) == 1;
6690 UInt ss
= INSN(20,16);
6691 UInt nn
= INSN(9,5);
6692 UInt tt
= INSN(4,0);
6694 vassert(szBlg2
< 4);
6695 UInt szB
= 1 << szBlg2
; /* 1, 2, 4 or 8 */
6696 IRType ty
= integerIRTypeOfSize(szB
);
6697 const HChar
* suffix
[4] = { "rb", "rh", "r", "r" };
6699 IRTemp ea
= newTemp(Ity_I64
);
6700 assign(ea
, getIReg64orSP(nn
));
6701 gen_SIGBUS_if_not_XX_aligned(ea
, szB
);
6703 if (isLD
&& ss
== BITS5(1,1,1,1,1)) {
6704 IRTemp res
= newTemp(ty
);
6705 if (abiinfo
->guest__use_fallback_LLSC
) {
6706 // Do the load first so we don't update any guest state
6708 IRTemp loaded_data64
= newTemp(Ity_I64
);
6709 assign(loaded_data64
, widenUto64(ty
, loadLE(ty
, mkexpr(ea
))));
6710 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64
, mkexpr(loaded_data64
) ));
6711 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64
, mkU64(0) ));
6712 stmt( IRStmt_Put( OFFB_LLSC_ADDR
, mkexpr(ea
) ));
6713 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(szB
) ));
6714 putIReg64orZR(tt
, mkexpr(loaded_data64
));
6716 stmt(IRStmt_LLSC(Iend_LE
, res
, mkexpr(ea
), NULL
/*LL*/));
6717 putIReg64orZR(tt
, widenUto64(ty
, mkexpr(res
)));
6720 stmt(IRStmt_MBE(Imbe_Fence
));
6722 DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel
? "a" : "", suffix
[szBlg2
],
6723 nameIRegOrZR(szB
== 8, tt
), nameIReg64orSP(nn
),
6724 abiinfo
->guest__use_fallback_LLSC
6725 ? "(fallback implementation)" : "");
6730 stmt(IRStmt_MBE(Imbe_Fence
));
6732 IRExpr
* data
= narrowFrom64(ty
, getIReg64orZR(tt
));
6733 if (abiinfo
->guest__use_fallback_LLSC
) {
6734 // This is really ugly, since we don't have any way to do
6735 // proper if-then-else. First, set up as if the SC failed,
6736 // and jump forwards if it really has failed.
6738 // Continuation address
6739 IRConst
* nia
= IRConst_U64(guest_PC_curr_instr
+ 4);
6741 // "the SC failed". Any non-zero value means failure.
6742 putIReg64orZR(ss
, mkU64(1));
6744 IRTemp tmp_LLsize
= newTemp(Ity_I64
);
6745 assign(tmp_LLsize
, IRExpr_Get(OFFB_LLSC_SIZE
, Ity_I64
));
6746 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(0) // "no transaction"
6748 // Fail if no or wrong-size transaction
6749 vassert(szB
== 8 || szB
== 4 || szB
== 2 || szB
== 1);
6751 binop(Iop_CmpNE64
, mkexpr(tmp_LLsize
), mkU64(szB
)),
6752 Ijk_Boring
, nia
, OFFB_PC
6754 // Fail if the address doesn't match the LL address
6756 binop(Iop_CmpNE64
, mkexpr(ea
),
6757 IRExpr_Get(OFFB_LLSC_ADDR
, Ity_I64
)),
6758 Ijk_Boring
, nia
, OFFB_PC
6760 // Fail if the data doesn't match the LL data
6761 IRTemp llsc_data64
= newTemp(Ity_I64
);
6762 assign(llsc_data64
, IRExpr_Get(OFFB_LLSC_DATA_LO64
, Ity_I64
));
6764 binop(Iop_CmpNE64
, widenUto64(ty
, loadLE(ty
, mkexpr(ea
))),
6765 mkexpr(llsc_data64
)),
6766 Ijk_Boring
, nia
, OFFB_PC
6768 // Try to CAS the new value in.
6769 IRTemp old
= newTemp(ty
);
6770 IRTemp expd
= newTemp(ty
);
6771 assign(expd
, narrowFrom64(ty
, mkexpr(llsc_data64
)));
6772 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID
, old
,
6773 Iend_LE
, mkexpr(ea
),
6774 /*expdHi*/NULL
, mkexpr(expd
),
6775 /*dataHi*/NULL
, data
6777 // Fail if the CAS failed (viz, old != expd)
6780 widenUto64(ty
, mkexpr(old
)),
6781 widenUto64(ty
, mkexpr(expd
))),
6782 Ijk_Boring
, nia
, OFFB_PC
6784 // Otherwise we succeeded (!)
6785 putIReg64orZR(ss
, mkU64(0));
6787 IRTemp res
= newTemp(Ity_I1
);
6788 stmt(IRStmt_LLSC(Iend_LE
, res
, mkexpr(ea
), data
));
6789 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6790 Need to set rS to 1 on failure, 0 on success. */
6791 putIReg64orZR(ss
, binop(Iop_Xor64
, unop(Iop_1Uto64
, mkexpr(res
)),
6794 DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel
? "a" : "", suffix
[szBlg2
],
6795 nameIRegOrZR(False
, ss
),
6796 nameIRegOrZR(szB
== 8, tt
), nameIReg64orSP(nn
),
6797 abiinfo
->guest__use_fallback_LLSC
6798 ? "(fallback implementation)" : "");
6801 /* else fall through */
6804 /* -------------------- LD{,A}XP -------------------- */
6805 /* -------------------- ST{,L}XP -------------------- */
6806 /* 31 30 29 23 20 15 14 9 4
6807 1 sz 001000 011 11111 0 t2 n t1 LDXP Rt1, Rt2, [Xn|SP]
6808 1 sz 001000 011 11111 1 t2 n t1 LDAXP Rt1, Rt2, [Xn|SP]
6809 1 sz 001000 001 s 0 t2 n t1 STXP Ws, Rt1, Rt2, [Xn|SP]
6810 1 sz 001000 001 s 1 t2 n t1 STLXP Ws, Rt1, Rt2, [Xn|SP]
6812 /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed
6813 comments about this implementation. Note the 'sz' field here is only 1
6814 bit; above, it is 2 bits, and has a different encoding.
6816 if (INSN(31,31) == 1
6817 && INSN(29,24) == BITS6(0,0,1,0,0,0)
6818 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) {
6819 Bool elemIs64
= INSN(30,30) == 1;
6820 Bool isLD
= INSN(22,22) == 1;
6821 Bool isAcqOrRel
= INSN(15,15) == 1;
6822 UInt ss
= INSN(20,16);
6823 UInt tt2
= INSN(14,10);
6824 UInt nn
= INSN(9,5);
6825 UInt tt1
= INSN(4,0);
6827 UInt elemSzB
= elemIs64
? 8 : 4;
6828 UInt fullSzB
= 2 * elemSzB
;
6829 IRType elemTy
= integerIRTypeOfSize(elemSzB
);
6830 IRType fullTy
= integerIRTypeOfSize(fullSzB
);
6832 IRTemp ea
= newTemp(Ity_I64
);
6833 assign(ea
, getIReg64orSP(nn
));
6834 gen_SIGBUS_if_not_XX_aligned(ea
, fullSzB
);
6836 if (isLD
&& ss
== BITS5(1,1,1,1,1)) {
6837 if (abiinfo
->guest__use_fallback_LLSC
) {
6838 // Fallback implementation of LL.
6839 // Do the load first so we don't update any guest state if it
6840 // faults. Assumes little-endian guest.
6841 if (fullTy
== Ity_I64
) {
6842 vassert(elemSzB
== 4);
6843 IRTemp loaded_data64
= newTemp(Ity_I64
);
6844 assign(loaded_data64
, loadLE(fullTy
, mkexpr(ea
)));
6845 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64
, mkexpr(loaded_data64
) ));
6846 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64
, mkU64(0) ));
6847 stmt( IRStmt_Put( OFFB_LLSC_ADDR
, mkexpr(ea
) ));
6848 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(8) ));
6849 putIReg64orZR(tt1
, unop(Iop_32Uto64
,
6851 mkexpr(loaded_data64
))));
6852 putIReg64orZR(tt2
, unop(Iop_32Uto64
,
6854 mkexpr(loaded_data64
))));
6856 vassert(elemSzB
== 8 && fullTy
== Ity_I128
);
6857 IRTemp loaded_data128
= newTemp(Ity_I128
);
6858 // Hack: do the load as V128 rather than I128 so as to avoid
6859 // having to implement I128 loads in the arm64 back end.
6860 assign(loaded_data128
, unop(Iop_ReinterpV128asI128
,
6861 loadLE(Ity_V128
, mkexpr(ea
))));
6862 IRTemp loaded_data_lo64
= newTemp(Ity_I64
);
6863 IRTemp loaded_data_hi64
= newTemp(Ity_I64
);
6864 assign(loaded_data_lo64
, unop(Iop_128to64
,
6865 mkexpr(loaded_data128
)));
6866 assign(loaded_data_hi64
, unop(Iop_128HIto64
,
6867 mkexpr(loaded_data128
)));
6868 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64
,
6869 mkexpr(loaded_data_lo64
) ));
6870 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64
,
6871 mkexpr(loaded_data_hi64
) ));
6872 stmt( IRStmt_Put( OFFB_LLSC_ADDR
, mkexpr(ea
) ));
6873 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(16) ));
6874 putIReg64orZR(tt1
, mkexpr(loaded_data_lo64
));
6875 putIReg64orZR(tt2
, mkexpr(loaded_data_hi64
));
6878 // Non-fallback implementation of LL.
6879 IRTemp res
= newTemp(fullTy
); // I64 or I128
6880 stmt(IRStmt_LLSC(Iend_LE
, res
, mkexpr(ea
), NULL
/*LL*/));
6881 // Assuming a little-endian guest here. Rt1 goes at the lower
6882 // address, so it must live in the least significant half of `res`.
6883 IROp opGetLO
= fullTy
== Ity_I128
? Iop_128to64
: Iop_64to32
;
6884 IROp opGetHI
= fullTy
== Ity_I128
? Iop_128HIto64
: Iop_64HIto32
;
6885 putIReg64orZR(tt1
, widenUto64(elemTy
, unop(opGetLO
, mkexpr(res
))));
6886 putIReg64orZR(tt2
, widenUto64(elemTy
, unop(opGetHI
, mkexpr(res
))));
6889 stmt(IRStmt_MBE(Imbe_Fence
));
6891 DIP("ld%sxp %s, %s, [%s] %s\n",
6892 isAcqOrRel
? (isLD
? "a" : "l") : "",
6893 nameIRegOrZR(elemSzB
== 8, tt1
),
6894 nameIRegOrZR(elemSzB
== 8, tt2
),
6896 abiinfo
->guest__use_fallback_LLSC
6897 ? "(fallback implementation)" : "");
6902 stmt(IRStmt_MBE(Imbe_Fence
));
6904 if (abiinfo
->guest__use_fallback_LLSC
) {
6905 // Fallback implementation of SC.
6906 // This is really ugly, since we don't have any way to do
6907 // proper if-then-else. First, set up as if the SC failed,
6908 // and jump forwards if it really has failed.
6910 // Continuation address
6911 IRConst
* nia
= IRConst_U64(guest_PC_curr_instr
+ 4);
6913 // "the SC failed". Any non-zero value means failure.
6914 putIReg64orZR(ss
, mkU64(1));
6916 IRTemp tmp_LLsize
= newTemp(Ity_I64
);
6917 assign(tmp_LLsize
, IRExpr_Get(OFFB_LLSC_SIZE
, Ity_I64
));
6918 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(0) // "no transaction"
6920 // Fail if no or wrong-size transaction
6921 vassert((fullSzB
== 8 && fullTy
== Ity_I64
)
6922 || (fullSzB
== 16 && fullTy
== Ity_I128
));
6924 binop(Iop_CmpNE64
, mkexpr(tmp_LLsize
), mkU64(fullSzB
)),
6925 Ijk_Boring
, nia
, OFFB_PC
6927 // Fail if the address doesn't match the LL address
6929 binop(Iop_CmpNE64
, mkexpr(ea
),
6930 IRExpr_Get(OFFB_LLSC_ADDR
, Ity_I64
)),
6931 Ijk_Boring
, nia
, OFFB_PC
6933 // The data to be stored.
6934 IRTemp store_data
= newTemp(fullTy
);
6935 if (fullTy
== Ity_I64
) {
6938 narrowFrom64(Ity_I32
, getIReg64orZR(tt2
)),
6939 narrowFrom64(Ity_I32
, getIReg64orZR(tt1
))));
6942 binop(Iop_64HLto128
,
6943 getIReg64orZR(tt2
), getIReg64orZR(tt1
)));
6946 if (fullTy
== Ity_I64
) {
6947 // 64 bit (2x32 bit) path
6948 // Fail if the data in memory doesn't match the data stashed by
6950 IRTemp llsc_data_lo64
= newTemp(Ity_I64
);
6951 assign(llsc_data_lo64
,
6952 IRExpr_Get(OFFB_LLSC_DATA_LO64
, Ity_I64
));
6954 binop(Iop_CmpNE64
, loadLE(Ity_I64
, mkexpr(ea
)),
6955 mkexpr(llsc_data_lo64
)),
6956 Ijk_Boring
, nia
, OFFB_PC
6958 // Try to CAS the new value in.
6959 IRTemp old
= newTemp(Ity_I64
);
6960 IRTemp expd
= newTemp(Ity_I64
);
6961 assign(expd
, mkexpr(llsc_data_lo64
));
6962 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID
, old
,
6963 Iend_LE
, mkexpr(ea
),
6964 /*expdHi*/NULL
, mkexpr(expd
),
6965 /*dataHi*/NULL
, mkexpr(store_data
)
6967 // Fail if the CAS failed (viz, old != expd)
6969 binop(Iop_CmpNE64
, mkexpr(old
), mkexpr(expd
)),
6970 Ijk_Boring
, nia
, OFFB_PC
6973 // 128 bit (2x64 bit) path
6974 // Fail if the data in memory doesn't match the data stashed by
6976 IRTemp llsc_data_lo64
= newTemp(Ity_I64
);
6977 assign(llsc_data_lo64
,
6978 IRExpr_Get(OFFB_LLSC_DATA_LO64
, Ity_I64
));
6979 IRTemp llsc_data_hi64
= newTemp(Ity_I64
);
6980 assign(llsc_data_hi64
,
6981 IRExpr_Get(OFFB_LLSC_DATA_HI64
, Ity_I64
));
6982 IRTemp data_at_ea
= newTemp(Ity_I128
);
6984 unop(Iop_ReinterpV128asI128
,
6985 loadLE(Ity_V128
, mkexpr(ea
))));
6988 unop(Iop_128to64
, mkexpr(data_at_ea
)),
6989 mkexpr(llsc_data_lo64
)),
6990 Ijk_Boring
, nia
, OFFB_PC
6994 unop(Iop_128HIto64
, mkexpr(data_at_ea
)),
6995 mkexpr(llsc_data_hi64
)),
6996 Ijk_Boring
, nia
, OFFB_PC
6998 // Try to CAS the new value in.
6999 IRTemp old_lo64
= newTemp(Ity_I64
);
7000 IRTemp old_hi64
= newTemp(Ity_I64
);
7001 IRTemp expd_lo64
= newTemp(Ity_I64
);
7002 IRTemp expd_hi64
= newTemp(Ity_I64
);
7003 IRTemp store_data_lo64
= newTemp(Ity_I64
);
7004 IRTemp store_data_hi64
= newTemp(Ity_I64
);
7005 assign(expd_lo64
, mkexpr(llsc_data_lo64
));
7006 assign(expd_hi64
, mkexpr(llsc_data_hi64
));
7007 assign(store_data_lo64
, unop(Iop_128to64
, mkexpr(store_data
)));
7008 assign(store_data_hi64
, unop(Iop_128HIto64
, mkexpr(store_data
)));
7009 stmt( IRStmt_CAS(mkIRCAS(old_hi64
, old_lo64
,
7010 Iend_LE
, mkexpr(ea
),
7011 mkexpr(expd_hi64
), mkexpr(expd_lo64
),
7012 mkexpr(store_data_hi64
),
7013 mkexpr(store_data_lo64
)
7015 // Fail if the CAS failed (viz, old != expd)
7017 binop(Iop_CmpNE64
, mkexpr(old_lo64
), mkexpr(expd_lo64
)),
7018 Ijk_Boring
, nia
, OFFB_PC
7021 binop(Iop_CmpNE64
, mkexpr(old_hi64
), mkexpr(expd_hi64
)),
7022 Ijk_Boring
, nia
, OFFB_PC
7025 // Otherwise we succeeded (!)
7026 putIReg64orZR(ss
, mkU64(0));
7028 // Non-fallback implementation of SC.
7029 IRTemp res
= newTemp(Ity_I1
);
7030 IRExpr
* dataLO
= narrowFrom64(elemTy
, getIReg64orZR(tt1
));
7031 IRExpr
* dataHI
= narrowFrom64(elemTy
, getIReg64orZR(tt2
));
7032 IROp opMerge
= fullTy
== Ity_I128
? Iop_64HLto128
: Iop_32HLto64
;
7033 IRExpr
* data
= binop(opMerge
, dataHI
, dataLO
);
7034 // Assuming a little-endian guest here. Rt1 goes at the lower
7035 // address, so it must live in the least significant half of `data`.
7036 stmt(IRStmt_LLSC(Iend_LE
, res
, mkexpr(ea
), data
));
7037 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
7038 Need to set rS to 1 on failure, 0 on success. */
7039 putIReg64orZR(ss
, binop(Iop_Xor64
, unop(Iop_1Uto64
, mkexpr(res
)),
7042 DIP("st%sxp %s, %s, %s, [%s] %s\n",
7043 isAcqOrRel
? (isLD
? "a" : "l") : "",
7044 nameIRegOrZR(False
, ss
),
7045 nameIRegOrZR(elemSzB
== 8, tt1
),
7046 nameIRegOrZR(elemSzB
== 8, tt2
),
7048 abiinfo
->guest__use_fallback_LLSC
7049 ? "(fallback implementation)" : "");
7052 /* else fall through */
   /* ------------------ LDA{R,RH,RB} ------------------ */
   /* ------------------ STL{R,RH,RB} ------------------ */
   /* 31 29     23        20    14 9 4
      sz 001000 110 11111 1 11111 n t   LDAR<sz> Rt, [Xn|SP]
      sz 001000 100 11111 1 11111 n t   STLR<sz> Rt, [Xn|SP]
   */
   if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
       && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
      UInt szBlg2 = INSN(31,30);
      Bool isLD   = INSN(22,22) == 1;
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);

      vassert(szBlg2 < 4);
      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
      IRType ty  = integerIRTypeOfSize(szB);
      const HChar* suffix[4] = { "rb", "rh", "r", "r" };

      IRTemp ea = newTemp(Ity_I64);
      assign(ea, getIReg64orSP(nn));
      gen_SIGBUS_if_not_XX_aligned(ea, szB);

      if (isLD) {
         IRTemp res = newTemp(ty);
         assign(res, loadLE(ty, mkexpr(ea)));
         putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
         stmt(IRStmt_MBE(Imbe_Fence));
         DIP("lda%s %s, [%s]\n", suffix[szBlg2],
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
      } else {
         stmt(IRStmt_MBE(Imbe_Fence));
         IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
         storeLE(mkexpr(ea), data);
         DIP("stl%s %s, [%s]\n", suffix[szBlg2],
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
      }
7094 /* The PRFM cases that follow are possibly allow Rt values (the
7095 prefetch operation) which are not allowed by the documentation.
7096 This should be looked into. */
7097 /* ------------------ PRFM (immediate) ------------------ */
7099 11 111 00110 imm12 n t PRFM pfrop=Rt, [Xn|SP, #pimm]
7101 if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
7102 UInt imm12
= INSN(21,10);
7103 UInt nn
= INSN(9,5);
7104 UInt tt
= INSN(4,0);
7105 /* Generating any IR here is pointless, except for documentation
7106 purposes, as it will get optimised away later. */
7107 IRTemp ea
= newTemp(Ity_I64
);
7108 assign(ea
, binop(Iop_Add64
, getIReg64orSP(nn
), mkU64(imm12
* 8)));
7109 DIP("prfm prfop=%u, [%s, #%u]\n", tt
, nameIReg64orSP(nn
), imm12
* 8);
7113 /* ------------------ PRFM (register) ------------------ */
7114 /* 31 29 22 20 15 12 11 9 4
7115 11 1110001 01 Rm opt S 10 Rn Rt PRFM pfrop=Rt, [Xn|SP, R<m>{ext/sh}]
7117 if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
7118 && INSN(11,10) == BITS2(1,0)) {
7120 UInt tt
= INSN(4,0);
7121 IRTemp ea
= gen_indexed_EA(dis_buf
, insn
, True
/*to/from int regs*/);
7122 if (ea
!= IRTemp_INVALID
) {
7123 /* No actual code to generate. */
7124 DIP("prfm prfop=%u, %s\n", tt
, dis_buf
);
7129 /* ------------------ PRFM (unscaled offset) ------------------ */
7130 /* 31 29 22 20 11 9 4
7131 11 1110001 00 imm9 00 Rn Rt PRFM pfrop=Rt, [Xn|SP, #simm]
7133 if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
7134 && INSN(11,10) == BITS2(0,0)) {
7135 ULong imm9
= INSN(20,12);
7136 UInt nn
= INSN(9,5);
7137 UInt tt
= INSN(4,0);
7138 ULong offset
= sx_to_64(imm9
, 9);
7139 IRTemp ea
= newTemp(Ity_I64
);
7140 assign(ea
, binop(Iop_Add64
, getIReg64orSP(nn
), mkU64(offset
)));
7141 /* No actual code to generate. */
7142 DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt
, nameIReg64orSP(nn
), offset
);
7146 /* ---------------- ARMv8.1-LSE: Atomic Memory Operations ---------------- */
7147 /* 31 29 23 22 21 20 15 11 9 4
7148 sz 111000 A R 1 s 0000 00 n t LDADD{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7149 sz 111000 A R 1 s 0001 00 n t LDCLR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7150 sz 111000 A R 1 s 0010 00 n t LDEOR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7151 sz 111000 A R 1 s 0011 00 n t LDSET{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7152 sz 111000 A R 1 s 0100 00 n t LDSMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7153 sz 111000 A R 1 s 0101 00 n t LDSMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7154 sz 111000 A R 1 s 0110 00 n t LDUMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7155 sz 111000 A R 1 s 0111 00 n t LDUMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7156 sz 111000 A R 1 s 1000 00 n t SWP{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7158 if (INSN(29,24) == BITS6(1,1,1,0,0,0)
7160 && (INSN(15,12) <= BITS4(1,0,0,0))
7161 && INSN(11,10) == BITS2(0,0)) {
7162 UInt szBlg2
= INSN(31,30);
7163 Bool isAcq
= INSN(23,23) == 1;
7164 Bool isRel
= INSN(22,22) == 1;
7165 UInt ss
= INSN(20,16);
7166 UInt opc
= INSN(15,12);
7167 UInt nn
= INSN(9,5);
7168 UInt tt
= INSN(4,0);
7170 const HChar
* nm
= NULL
;
7171 const HChar
* suffix
[4] = { "b", "h", "", "" };
7173 vassert(szBlg2
< 4);
7174 UInt szB
= 1 << szBlg2
; /* 1, 2, 4 or 8 bytes*/
7175 IRType ty
= integerIRTypeOfSize(szB
);
7176 Bool is64
= szB
== 8;
7177 Bool isSigned
= (opc
== 4) || (opc
== 5) /*smax || smin*/;
7179 // IR used to emulate these atomic memory ops:
7182 // 3) widen operands and do arithmetic/logic op
7183 // 4) cas to see if target memory updated
7185 // 6) repeat from 1) if cas says target memory not updated
7186 // 7) update register
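   // Illustrative example (not part of the original comments): for
   // "ldaddal w5, w6, [x7]" this scheme loads the old value from [x7],
   // adds w5 to it, and CASes the sum back only if memory still holds
   // the old value.  If the CAS fails, the guest PC is left pointing at
   // this instruction so the whole sequence re-executes; on success the
   // old value lands in w6, with fences providing the acquire/release
   // semantics.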
7188 IRTemp ea
= newTemp(Ity_I64
);
7189 assign(ea
, getIReg64orSP(nn
));
7190 gen_SIGBUS_if_not_XX_aligned(ea
, szB
);
7192 // Insert barrier before loading for acquire and acquire-release variants:
7194 if (isAcq
&& (tt
!= 31))
7195 stmt(IRStmt_MBE(Imbe_Fence
));
7197 // Load LHS from memory, RHS from register.
7198 IRTemp orig
= newTemp(ty
);
7199 assign(orig
, loadLE(ty
, mkexpr(ea
)));
7200 IRExpr
*lhs
= mkexpr(orig
);
7201 IRExpr
*rhs
= narrowFrom64(ty
, getIReg64orZR(ss
));
7204 lhs
= isSigned
? widenSto64(ty
, lhs
) : widenUto64(ty
, lhs
);
7205 rhs
= isSigned
? widenSto64(ty
, rhs
) : widenUto64(ty
, rhs
);
7207 // Perform the operation.
7211 res
= binop(Iop_Add64
, lhs
, rhs
);
7215 res
= binop(Iop_And64
, lhs
, unop(mkNOT(Ity_I64
), rhs
));
7219 res
= binop(Iop_Xor64
, lhs
, rhs
);
7223 res
= binop(Iop_Or64
, lhs
, rhs
);
7227 res
= IRExpr_ITE(binop(Iop_CmpLT64S
, lhs
, rhs
), rhs
, lhs
);
7231 res
= IRExpr_ITE(binop(Iop_CmpLT64S
, lhs
, rhs
), lhs
, rhs
);
7235 res
= IRExpr_ITE(binop(Iop_CmpLT64U
, lhs
, rhs
), rhs
, lhs
);
7239 res
= IRExpr_ITE(binop(Iop_CmpLT64U
, lhs
, rhs
), lhs
, rhs
);
7250 // Store the result back if LHS remains unchanged in memory.
7251 IRTemp old
= newTemp(ty
);
7252 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID
, old
,
7253 Iend_LE
, mkexpr(ea
),
7254 /*expdHi*/NULL
, mkexpr(orig
),
7255 /*dataHi*/NULL
, narrowFrom64(ty
, res
))) );
7257 // Insert barrier after storing for release and acquire-release variants:
7260 stmt(IRStmt_MBE(Imbe_Fence
));
7262 // Retry if the CAS failed (i.e. when old != orig).
7263 IRConst
* nia
= IRConst_U64(guest_PC_curr_instr
);
7265 binop(Iop_CasCmpNE64
,
7266 widenUto64(ty
, mkexpr(old
)),
7267 widenUto64(ty
, mkexpr(orig
))),
7268 Ijk_Boring
, nia
, OFFB_PC
));
7269 // Otherwise we succeeded.
7270 putIReg64orZR(tt
, widenUto64(ty
, mkexpr(old
)));
7272 DIP("%s%s%s%s %s, %s, [%s]\n", nm
, isAcq
? "a" : "", isRel
? "l" : "",
7273 suffix
[szBlg2
], nameIRegOrZR(is64
, ss
), nameIRegOrZR(is64
, tt
),
7274 nameIReg64orSP(nn
));
7278 /* ------------------ ARMv8.1-LSE: Compare-and-Swap ------------------ */
7279 /* 31 29 22 21 20 15 14 9 4
7280 sz 0010001 A 1 s R 11111 n t CAS{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7282 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
7284 && INSN(14,10) == BITS5(1,1,1,1,1)) {
7285 UInt szBlg2
= INSN(31,30);
7286 Bool isAcq
= INSN(22,22) == 1;
7287 Bool isRel
= INSN(15,15) == 1;
7288 UInt ss
= INSN(20,16);
7289 UInt nn
= INSN(9,5);
7290 UInt tt
= INSN(4,0);
7292 const HChar
* suffix
[4] = { "b", "h", "", "" };
7294 UInt szB
= 1 << szBlg2
; /* 1, 2, 4 or 8 */
7295 IRType ty
= integerIRTypeOfSize(szB
);
7296 Bool is64
= szB
== 8;
7298 IRTemp ea
= newTemp(Ity_I64
);
7299 assign(ea
, getIReg64orSP(nn
));
7300 gen_SIGBUS_if_not_XX_aligned(ea
, szB
);
7302 IRExpr
*exp
= narrowFrom64(ty
, getIReg64orZR(ss
));
7303 IRExpr
*new = narrowFrom64(ty
, getIReg64orZR(tt
));
7306 stmt(IRStmt_MBE(Imbe_Fence
));
7308 // Store the result back if LHS remains unchanged in memory.
7309 IRTemp old
= newTemp(ty
);
7310 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID
, old
,
7311 Iend_LE
, mkexpr(ea
),
7312 /*expdHi*/NULL
, exp
,
7313 /*dataHi*/NULL
, new)) );
7316 stmt(IRStmt_MBE(Imbe_Fence
));
7318 putIReg64orZR(ss
, widenUto64(ty
, mkexpr(old
)));
7319 DIP("cas%s%s%s %s, %s, [%s]\n",
7320 isAcq
? "a" : "", isRel
? "l" : "", suffix
[szBlg2
],
7321 nameIRegOrZR(is64
, ss
), nameIRegOrZR(is64
, tt
), nameIReg64orSP(nn
));
7325 /* ---------------- ARMv8.1-LSE: Compare-and-Swap Pair --------------- */
7326 /* 31 30 29 22 21 20 15 14 9 4
7327 0 sz 0010000 A 1 s R 11111 n t CASP{,A}{,L} <Rs>, <Rt>, [<Xn|SP>]
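   /* Illustrative note (not from the original source): Rs and Rt each
      name an even/odd register pair, so "casp x4, x5, x6, x7, [x2]"
      compares {x4,x5} against memory and conditionally stores {x6,x7};
      an odd ss or tt is treated as undefined and falls through below. */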
7329 if (INSN(31,31) == 0
7330 && INSN(29,23) == BITS7(0,0,1,0,0,0,0)
7332 && INSN(14,10) == BITS5(1,1,1,1,1)) {
7333 UInt is64
= INSN(30,30);
7334 Bool isAcq
= INSN(22,22) == 1;
7335 Bool isRel
= INSN(15,15) == 1;
7336 UInt ss
= INSN(20,16);
7337 UInt nn
= INSN(9,5);
7338 UInt tt
= INSN(4,0);
7340 if ((ss
& 0x1) || (tt
& 0x1)) {
7341 /* undefined; fall through */
7343 IRTemp ea
= newTemp(Ity_I64
);
7344 assign(ea
, getIReg64orSP(nn
));
7345 gen_SIGBUS_if_not_XX_aligned(ea
, is64
? 16 : 8);
7347 IRExpr
*expLo
= getIRegOrZR(is64
, ss
);
7348 IRExpr
*expHi
= getIRegOrZR(is64
, ss
+ 1);
7349 IRExpr
*newLo
= getIRegOrZR(is64
, tt
);
7350 IRExpr
*newHi
= getIRegOrZR(is64
, tt
+ 1);
7351 IRTemp oldLo
= newTemp(is64
? Ity_I64
: Ity_I32
);
7352 IRTemp oldHi
= newTemp(is64
? Ity_I64
: Ity_I32
);
7355 stmt(IRStmt_MBE(Imbe_Fence
));
7357 stmt( IRStmt_CAS(mkIRCAS(oldHi
, oldLo
,
7358 Iend_LE
, mkexpr(ea
),
7363 stmt(IRStmt_MBE(Imbe_Fence
));
7365 putIRegOrZR(is64
, ss
, mkexpr(oldLo
));
7366 putIRegOrZR(is64
, ss
+1, mkexpr(oldHi
));
7367 DIP("casp%s%s %s, %s, %s, %s, [%s]\n",
7368 isAcq
? "a" : "", isRel
? "l" : "",
7369 nameIRegOrZR(is64
, ss
), nameIRegOrZR(is64
, ss
+1),
7370 nameIRegOrZR(is64
, tt
), nameIRegOrZR(is64
, tt
+1),
7371 nameIReg64orSP(nn
));
7377 vex_printf("ARM64 front end: load_store\n");
7385 /*------------------------------------------------------------*/
7386 /*--- Control flow and misc instructions ---*/
7387 /*------------------------------------------------------------*/
Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
                          const VexArchInfo* archinfo,
                          const VexAbiInfo* abiinfo, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   /* ---------------------- B cond ----------------------- */
   /* 0101010 0 imm19 0 cond */
   if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
      UInt  cond   = INSN(3,0);
      ULong uimm64 = INSN(23,5) << 2;
      Long  simm64 = (Long)sx_to_64(uimm64, 21);
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
      stmt( IRStmt_Exit(
               unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
               Ijk_Boring,
               IRConst_U64(guest_PC_curr_instr + simm64),
               OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
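      /* Worked example (illustrative): imm19 is shifted left by 2 and
         sign extended from 21 bits, so an all-ones imm19 gives
         uimm64 == 0x1FFFFC and simm64 == -4, i.e. a conditional branch
         to the instruction 4 bytes back. */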
   /* -------------------- B{L} uncond -------------------- */
   if (INSN(30,26) == BITS5(0,0,1,0,1)) {
      /* 000101 imm26  B  (PC + sxTo64(imm26 << 2))
         100101 imm26  BL (PC + sxTo64(imm26 << 2))
      */
      UInt  bLink  = INSN(31,31);
      ULong uimm64 = INSN(25,0) << 2;
      Long  simm64 = (Long)sx_to_64(uimm64, 28);
      if (bLink) {
         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
      }
      putPC(mkU64(guest_PC_curr_instr + simm64));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Call;
      DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
          guest_PC_curr_instr + simm64);
   /* --------------------- B{L} reg --------------------- */
   /* 31      24 22 20    15     9  4
      1101011 00 10 11111 000000 nn 00000  RET  Rn
      1101011 00 01 11111 000000 nn 00000  CALL Rn
      1101011 00 00 11111 000000 nn 00000  JMP  Rn
   */
   if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
       && INSN(20,16) == BITS5(1,1,1,1,1)
       && INSN(15,10) == BITS6(0,0,0,0,0,0)
       && INSN(4,0) == BITS5(0,0,0,0,0)) {
      UInt branch_type = INSN(22,21);
      UInt nn          = INSN(9,5);
      if (branch_type == BITS2(1,0) /* RET */) {
         putPC(getIReg64orZR(nn));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Ret;
         DIP("ret %s\n", nameIReg64orZR(nn));
         return True;
      }
      if (branch_type == BITS2(0,1) /* CALL */) {
         IRTemp dst = newTemp(Ity_I64);
         assign(dst, getIReg64orZR(nn));
         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
         putPC(mkexpr(dst));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Call;
         DIP("blr %s\n", nameIReg64orZR(nn));
         return True;
      }
      if (branch_type == BITS2(0,0) /* JMP */) {
         putPC(getIReg64orZR(nn));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Boring;
         DIP("jmp %s\n", nameIReg64orZR(nn));
         return True;
      }
   }
   /* -------------------- CB{N}Z -------------------- */
   /* sf 011 010 1 imm19 Rt   CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
      sf 011 010 0 imm19 Rt   CBZ  Xt|Wt, (PC + sxTo64(imm19 << 2))
   */
   if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
      Bool    is64   = INSN(31,31) == 1;
      Bool    bIfZ   = INSN(24,24) == 0;
      ULong   uimm64 = INSN(23,5) << 2;
      UInt    rT     = INSN(4,0);
      Long    simm64 = (Long)sx_to_64(uimm64, 21);
      IRExpr* cond   = NULL;
      if (is64)
         cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
                      getIReg64orZR(rT), mkU64(0));
      else
         cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
                      getIReg32orZR(rT), mkU32(0));
      stmt( IRStmt_Exit(cond,
                        Ijk_Boring,
                        IRConst_U64(guest_PC_curr_instr + simm64),
                        OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("cb%sz %s, 0x%llx\n",
          bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
          guest_PC_curr_instr + simm64);
   /* -------------------- TB{N}Z -------------------- */
   /* 31 30      24 23  18    5 4
      b5 011 011 1  b40 imm14 t  TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
      b5 011 011 0  b40 imm14 t  TBZ  Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
   */
   if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
      UInt  b5     = INSN(31,31);
      Bool  bIfZ   = INSN(24,24) == 0;
      UInt  b40    = INSN(23,19);
      UInt  imm14  = INSN(18,5);
      UInt  tt     = INSN(4,0);
      UInt  bitNo  = (b5 << 5) | b40;
      ULong uimm64 = imm14 << 2;
      Long  simm64 = sx_to_64(uimm64, 16);
      IRExpr* cond
         = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
                 binop(Iop_And64,
                       binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
                       mkU64(1)),
                 mkU64(0));
      stmt( IRStmt_Exit(cond,
                        Ijk_Boring,
                        IRConst_U64(guest_PC_curr_instr + simm64),
                        OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("tb%sz %s, #%u, 0x%llx\n",
          bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
          guest_PC_curr_instr + simm64);
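      /* Worked example (illustrative): the tested bit number is b5:b40,
         so "tbz x3, #37, <label>" encodes b5 == 1 and b40 == 5, giving
         bitNo == (1 << 5) | 5 == 37. */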
   /* -------------------- SVC -------------------- */
   /* 11010100 000 imm16 000 01
      Don't bother with anything except the imm16==0 case.
   */
   if (INSN(31,0) == 0xD4000001) {
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Sys_syscall;
7550 /* ------------------ M{SR,RS} ------------------ */
7551 /* ---- Cases for TPIDR_EL0 ----
7552 0xD51BD0 010 Rt MSR tpidr_el0, rT
7553 0xD53BD0 010 Rt MRS rT, tpidr_el0
7555 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
7556 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
7557 Bool toSys
= INSN(21,21) == 0;
7558 UInt tt
= INSN(4,0);
7560 stmt( IRStmt_Put( OFFB_TPIDR_EL0
, getIReg64orZR(tt
)) );
7561 DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt
));
7563 putIReg64orZR(tt
, IRExpr_Get( OFFB_TPIDR_EL0
, Ity_I64
));
7564 DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt
));
7568 /* ---- Cases for FPCR ----
7569 0xD51B44 000 Rt MSR fpcr, rT
7570 0xD53B44 000 Rt MSR rT, fpcr
7572 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
7573 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
7574 Bool toSys
= INSN(21,21) == 0;
7575 UInt tt
= INSN(4,0);
7577 stmt( IRStmt_Put( OFFB_FPCR
, getIReg32orZR(tt
)) );
7578 DIP("msr fpcr, %s\n", nameIReg64orZR(tt
));
7580 putIReg32orZR(tt
, IRExpr_Get(OFFB_FPCR
, Ity_I32
));
7581 DIP("mrs %s, fpcr\n", nameIReg64orZR(tt
));
7585 /* ---- Cases for FPSR ----
7586 0xD51B44 001 Rt MSR fpsr, rT
7587 0xD53B44 001 Rt MSR rT, fpsr
7588 The only part of this we model is FPSR.QC. All other bits
7589 are ignored when writing to it and RAZ when reading from it.
7591 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
7592 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
7593 Bool toSys
= INSN(21,21) == 0;
7594 UInt tt
= INSN(4,0);
7596 /* Just deal with FPSR.QC. Make up a V128 value which is
7597 zero if Xt[27] is zero and any other value if Xt[27] is
7599 IRTemp qc64
= newTemp(Ity_I64
);
7600 assign(qc64
, binop(Iop_And64
,
7601 binop(Iop_Shr64
, getIReg64orZR(tt
), mkU8(27)),
7603 IRExpr
* qcV128
= binop(Iop_64HLtoV128
, mkexpr(qc64
), mkexpr(qc64
));
7604 stmt( IRStmt_Put( OFFB_QCFLAG
, qcV128
) );
7605 DIP("msr fpsr, %s\n", nameIReg64orZR(tt
));
7607 /* Generate a value which is all zeroes except for bit 27,
7608 which must be zero if QCFLAG is all zeroes and one otherwise. */
7609 IRTemp qcV128
= newTempV128();
7610 assign(qcV128
, IRExpr_Get( OFFB_QCFLAG
, Ity_V128
));
7611 IRTemp qc64
= newTemp(Ity_I64
);
7612 assign(qc64
, binop(Iop_Or64
, unop(Iop_V128HIto64
, mkexpr(qcV128
)),
7613 unop(Iop_V128to64
, mkexpr(qcV128
))));
7614 IRExpr
* res
= binop(Iop_Shl64
,
7616 binop(Iop_CmpNE64
, mkexpr(qc64
), mkU64(0))),
7618 putIReg64orZR(tt
, res
);
7619 DIP("mrs %s, fpsr\n", nameIReg64orZR(tt
));
7623 /* ---- Cases for NZCV ----
7624 D51B42 000 Rt MSR nzcv, rT
7625 D53B42 000 Rt MRS rT, nzcv
7626 The only parts of NZCV that actually exist are bits 31:28, which
7627 are the N Z C and V bits themselves. Hence the flags thunk provides
7628 all the state we need.
7630 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
7631 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
7632 Bool toSys
= INSN(21,21) == 0;
7633 UInt tt
= INSN(4,0);
7635 IRTemp t
= newTemp(Ity_I64
);
7636 assign(t
, binop(Iop_And64
, getIReg64orZR(tt
), mkU64(0xF0000000ULL
)));
7638 DIP("msr %s, nzcv\n", nameIReg32orZR(tt
));
7640 IRTemp res
= newTemp(Ity_I64
);
7641 assign(res
, mk_arm64g_calculate_flags_nzcv());
7642 putIReg32orZR(tt
, unop(Iop_64to32
, mkexpr(res
)));
7643 DIP("mrs %s, nzcv\n", nameIReg64orZR(tt
));
7647 /* ---- Cases for DCZID_EL0 ----
7648 Don't support arbitrary reads and writes to this register. Just
7649 return the value 16, which indicates that the DC ZVA instruction
7650 is not permitted, so we don't have to emulate it.
7651 D5 3B 00 111 Rt MRS rT, dczid_el0
7653 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
7654 UInt tt
= INSN(4,0);
7655 putIReg64orZR(tt
, mkU64(1<<4));
7656 DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt
));
7659 /* ---- Cases for CTR_EL0 ----
7660 We just handle reads, and make up a value from the D and I line
7661 sizes in the VexArchInfo we are given, and patch in the following
7662 fields that the Foundation model gives ("natively"):
7663 CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
7664 D5 3B 00 001 Rt MRS rT, dczid_el0
7666 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
7667 UInt tt
= INSN(4,0);
7668 /* Need to generate a value from dMinLine_lg2_szB and
7669 dMinLine_lg2_szB. The value in the register is in 32-bit
7670 units, so need to subtract 2 from the values in the
7671 VexArchInfo. We can assume that the values here are valid --
7672 disInstr_ARM64 checks them -- so there's no need to deal with
7673 out-of-range cases. */
7674 vassert(archinfo
->arm64_dMinLine_lg2_szB
>= 2
7675 && archinfo
->arm64_dMinLine_lg2_szB
<= 17
7676 && archinfo
->arm64_iMinLine_lg2_szB
>= 2
7677 && archinfo
->arm64_iMinLine_lg2_szB
<= 17);
7679 = 0x8440c000 | ((0xF & (archinfo
->arm64_dMinLine_lg2_szB
- 2)) << 16)
7680 | ((0xF & (archinfo
->arm64_iMinLine_lg2_szB
- 2)) << 0);
7681 putIReg64orZR(tt
, mkU64(val
));
7682 DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt
));
7685 /* ---- Cases for CNTVCT_EL0 ----
7686 This is a timestamp counter of some sort. Support reads of it only
7687 by passing through to the host.
7688 D5 3B E0 010 Rt MRS Xt, cntvct_el0
7690 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7691 UInt tt
= INSN(4,0);
7692 IRTemp val
= newTemp(Ity_I64
);
7693 IRExpr
** args
= mkIRExprVec_0();
7694 IRDirty
* d
= unsafeIRDirty_1_N (
7697 "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7698 &arm64g_dirtyhelper_MRS_CNTVCT_EL0
,
7701 /* execute the dirty call, dumping the result in val. */
7702 stmt( IRStmt_Dirty(d
) );
7703 putIReg64orZR(tt
, mkexpr(val
));
7704 DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt
));
7707 /* ---- Cases for CNTFRQ_EL0 ----
7708 This is always RO at EL0, so it's safe to pass through to the host.
7709 D5 3B E0 000 Rt MRS Xt, cntfrq_el0
7711 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7712 UInt tt
= INSN(4,0);
7713 IRTemp val
= newTemp(Ity_I64
);
7714 IRExpr
** args
= mkIRExprVec_0();
7715 IRDirty
* d
= unsafeIRDirty_1_N (
7718 "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7719 &arm64g_dirtyhelper_MRS_CNTFRQ_EL0
,
7722 /* execute the dirty call, dumping the result in val. */
7723 stmt( IRStmt_Dirty(d
) );
7724 putIReg64orZR(tt
, mkexpr(val
));
7725 DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt
));
7729 /* ------------------ IC_IVAU ------------------ */
7730 /* D5 0B 75 001 Rt ic ivau, rT
7732 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7733 /* We will always be provided with a valid iMinLine value. */
7734 vassert(archinfo
->arm64_iMinLine_lg2_szB
>= 2
7735 && archinfo
->arm64_iMinLine_lg2_szB
<= 17);
7736 /* Round the requested address, in rT, down to the start of the
7737 containing block. */
7738 UInt tt
= INSN(4,0);
7739 ULong lineszB
= 1ULL << archinfo
->arm64_iMinLine_lg2_szB
;
7740 IRTemp addr
= newTemp(Ity_I64
);
7741 assign( addr
, binop( Iop_And64
,
7743 mkU64(~(lineszB
- 1))) );
7744 /* Set the invalidation range, request exit-and-invalidate, with
7745 continuation at the next instruction. */
7746 stmt(IRStmt_Put(OFFB_CMSTART
, mkexpr(addr
)));
7747 stmt(IRStmt_Put(OFFB_CMLEN
, mkU64(lineszB
)));
7748 /* be paranoid ... */
7749 stmt( IRStmt_MBE(Imbe_Fence
) );
7750 putPC(mkU64( guest_PC_curr_instr
+ 4 ));
7751 dres
->whatNext
= Dis_StopHere
;
7752 dres
->jk_StopHere
= Ijk_InvalICache
;
7753 DIP("ic ivau, %s\n", nameIReg64orZR(tt
));
7757 /* ------------------ DC_CVAU ------------------ */
7758 /* D5 0B 7B 001 Rt dc cvau, rT
7759 D5 0B 7E 001 Rt dc civac, rT
7761 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20
7762 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7E20) {
7763 /* Exactly the same scheme as for IC IVAU, except we observe the
7764 dMinLine size, and request an Ijk_FlushDCache instead of
7766 /* We will always be provided with a valid dMinLine value. */
7767 vassert(archinfo
->arm64_dMinLine_lg2_szB
>= 2
7768 && archinfo
->arm64_dMinLine_lg2_szB
<= 17);
7769 /* Round the requested address, in rT, down to the start of the
7770 containing block. */
7771 UInt tt
= INSN(4,0);
7772 ULong lineszB
= 1ULL << archinfo
->arm64_dMinLine_lg2_szB
;
7773 IRTemp addr
= newTemp(Ity_I64
);
7774 assign( addr
, binop( Iop_And64
,
7776 mkU64(~(lineszB
- 1))) );
7777 /* Set the flush range, request exit-and-flush, with
7778 continuation at the next instruction. */
7779 stmt(IRStmt_Put(OFFB_CMSTART
, mkexpr(addr
)));
7780 stmt(IRStmt_Put(OFFB_CMLEN
, mkU64(lineszB
)));
7781 /* be paranoid ... */
7782 stmt( IRStmt_MBE(Imbe_Fence
) );
7783 putPC(mkU64( guest_PC_curr_instr
+ 4 ));
7784 dres
->whatNext
= Dis_StopHere
;
7785 dres
->jk_StopHere
= Ijk_FlushDCache
;
7786 DIP("dc cvau, %s\n", nameIReg64orZR(tt
));
   /* ------------------ ISB, DMB, DSB ------------------ */
   /* 11010 10100 0 00 011 0011 CRm 1 01 11111  DMB opt
      11010 10100 0 00 011 0011 CRm 1 00 11111  DSB opt
      11010 10100 0 00 011 0011 CRm 1 10 11111  ISB opt
   */
   if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
       && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
       && INSN(7,7) == 1
       && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
      UInt opc = INSN(6,5);
      UInt CRm = INSN(11,8);
      vassert(opc <= 2 && CRm <= 15);
      stmt(IRStmt_MBE(Imbe_Fence));
      const HChar* opNames[3]
         = { "dsb", "dmb", "isb" };
      const HChar* howNames[16]
         = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
             "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
      DIP("%s %s\n", opNames[opc], howNames[CRm]);
      return True;
   }
   /* -------------------- NOP -------------------- */
   if (INSN(31,0) == 0xD503201F) {
      DIP("nop\n");
      return True;
   }

   /* -------------------- BRK -------------------- */
   /* 1101 0100 001 imm16 00000  BRK #imm16
   */
   if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
       && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
      UInt imm16 = INSN(20,5);
      /* Request SIGTRAP and then restart of this insn. */
      putPC(mkU64(guest_PC_curr_instr + 0));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_SigTRAP;
      DIP("brk #%u\n", imm16);
      return True;
   }
   /* ------------------- YIELD ------------------- */
   /* 1101 0101 0000 0011 0010 0000 0011 1111
   */
   if (INSN(31,0) == 0xD503203F) {
      /* Request yield followed by continuation at the next insn. */
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Yield;
      DIP("yield\n");
      return True;
   }
   /* -------------------- HINT ------------------- */
   /* 1101 0101 0000 0011 0010 imm7 1 1111
      Catch otherwise unhandled HINT instructions - any
      like YIELD which are explicitly handled should go
      above this case.
   */
   if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
       && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
       && INSN(15,12) == BITS4(0,0,1,0)
       && INSN(4,0) == BITS5(1,1,1,1,1)) {
      UInt imm7 = INSN(11,5);
      DIP("hint #%u\n", imm7);
      return True;
   }
   /* ------------------- CLREX ------------------ */
   /* 1101 0101 0000 0011 0011 m 0101 1111  CLREX CRm
      CRm is apparently ignored.
   */
   if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
      UInt mm = INSN(11,8);
      /* AFAICS, this simply cancels a (all?) reservations made by a
         (any?) preceding LDREX(es).  Arrange to hand it through to
         the back end. */
      if (abiinfo->guest__use_fallback_LLSC) {
         stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
      } else {
         stmt( IRStmt_MBE(Imbe_CancelReservation) );
      }
      DIP("clrex #%u\n", mm);
      return True;
   }

   vex_printf("ARM64 front end: branch_etc\n");
   return False;
#  undef INSN
}

/*------------------------------------------------------------*/
/*--- SIMD and FP instructions: helper functions            ---*/
/*------------------------------------------------------------*/

/* Some constructors for interleave/deinterleave expressions. */
static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   // returns a0 b0
   return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
}

static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   // returns a1 b1
   return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
}

static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a2 a0 b2 b0
   return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a3 a1 b3 b1
   return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a1 b1 a0 b0
   return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a3 b3 a2 b2
   return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a6 a4 a2 a0 b6 b4 b2 b0
   return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a7 a5 a3 a1 b7 b5 b3 b1
   return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a3 b3 a2 b2 a1 b1 a0 b0
   return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a7 b7 a6 b6 a5 b5 a4 b4
   return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
   return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
                                    IRTemp bFEDCBA9876543210 ) {
   // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
   return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
                                     mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
   return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
   return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}
/* Generate N copies of |bit| in the bottom of a ULong. */
static ULong Replicate ( ULong bit, Int N )
{
   vassert(bit <= 1 && N >= 1 && N < 64);
   if (bit == 0) {
      return 0;
   } else {
      /* Careful.  This won't work for N == 64. */
      return (1ULL << N) - 1;
   }
}

static ULong Replicate32x2 ( ULong bits32 )
{
   vassert(0 == (bits32 & ~0xFFFFFFFFULL));
   return (bits32 << 32) | bits32;
}

static ULong Replicate16x4 ( ULong bits16 )
{
   vassert(0 == (bits16 & ~0xFFFFULL));
   return Replicate32x2((bits16 << 16) | bits16);
}

static ULong Replicate8x8 ( ULong bits8 )
{
   vassert(0 == (bits8 & ~0xFFULL));
   return Replicate16x4((bits8 << 8) | bits8);
}
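
/* Illustrative note (not part of the original source): these helpers just
   splat a small value across a 64-bit word.  For example
   Replicate(1, 5) == 0x1F, Replicate32x2(0xDEADBEEF) ==
   0xDEADBEEFDEADBEEFULL, and Replicate8x8(0xAB) == 0xABABABABABABABABULL. */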
/* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
   |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
   is 64.  In the former case, the upper 32 bits of the returned value
   are guaranteed to be zero. */
static ULong VFPExpandImm ( ULong imm8, Int N )
{
   vassert(imm8 <= 0xFF);
   vassert(N == 32 || N == 64);
   Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
   Int F = N - E - 1;
   ULong imm8_6 = (imm8 >> 6) & 1;
   /* sign: 1 bit */
   /* exp:  E bits */
   /* frac: F bits */
   ULong sign = (imm8 >> 7) & 1;
   ULong exp  = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
   ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
   vassert(sign < (1ULL << 1));
   vassert(exp  < (1ULL << E));
   vassert(frac < (1ULL << F));
   vassert(1 + E + F == N);
   ULong res = (sign << (E+F)) | (exp << F) | frac;
   return res;
}
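
/* Illustrative note (not part of the original source): with this split of
   the fields (the low two exponent bits are counted as part of |frac|),
   VFPExpandImm(0x70, 64) gives sign=0, exp=0xFF and frac=0x30ULL<<48, which
   assembles to 0x3FF0000000000000, ie the IEEE754 double 1.0 -- the
   immediate produced by "fmov d0, #1.0".  Similarly
   VFPExpandImm(0x70, 32) == 0x3F800000 == 1.0f. */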
/* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
   This might fail, as indicated by the returned Bool.  Page 2530 of
   the manual. */
static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
                               UInt op, UInt cmode, UInt imm8 )
{
   vassert(cmode <= 15);
   vassert(imm8 <= 255);

   *res = 0; /* will overwrite iff returning True */

   ULong imm64    = 0;
   Bool  testimm8 = False;

   switch (cmode >> 1) {
      case 0:
         testimm8 = False; imm64 = Replicate32x2(imm8); break;
      case 1:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
      case 2:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
      case 3:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
      case 4:
         testimm8 = False; imm64 = Replicate16x4(imm8); break;
      case 5:
         testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
      case 6:
         testimm8 = True;
         if ((cmode & 1) == 0)
            imm64 = Replicate32x2((imm8 << 8) | 0xFF);
         else
            imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
         break;
      case 7:
         testimm8 = False;
         if ((cmode & 1) == 0 && op == 0)
            imm64 = Replicate8x8(imm8);
         if ((cmode & 1) == 0 && op == 1) {
            imm64 = 0;   imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
         }
         if ((cmode & 1) == 1 && op == 0) {
            ULong imm8_7  = (imm8 >> 7) & 1;
            ULong imm8_6  = (imm8 >> 6) & 1;
            ULong imm8_50 = imm8 & 63;
            ULong imm32 = (imm8_7                 << (1 + 5 + 6 + 19))
                          | ((imm8_6 ^ 1)         << (5 + 6 + 19))
                          | (Replicate(imm8_6, 5) << (6 + 19))
                          | (imm8_50              << 19);
            imm64 = Replicate32x2(imm32);
         }
         if ((cmode & 1) == 1 && op == 1) {
            // imm64 = imm8<7>:NOT(imm8<6>)
            //                :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
            ULong imm8_7  = (imm8 >> 7) & 1;
            ULong imm8_6  = (imm8 >> 6) & 1;
            ULong imm8_50 = imm8 & 63;
            imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
                    | (Replicate(imm8_6, 8) << 54)
                    | (imm8_50 << 48);
         }
         break;
      default:
         vassert(0);
   }

   if (testimm8 && imm8 == 0)
      return False;

   *res = imm64;
   return True;
}
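
/* Illustrative note (not part of the original source): for example, with
   op=0, cmode=0b1110 (so cmode>>1 == 7, cmode&1 == 0) and imm8=0xAB, the
   expansion is Replicate8x8(0xAB) == 0xABABABABABABABABULL, ie the pattern
   used by "movi Vd.16b, #0xAB".  The |testimm8| cases cause an all-zero
   imm8 to be rejected, so the function returns False for them. */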
/* Help a bit for decoding laneage for vector operations that can be
   of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
   and SZ bits, typically for vector floating point. */
static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI,  /*OUT*/IRType* tyF,
                               /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
                               /*OUT*/const HChar** arrSpec,
                               Bool bitQ, Bool bitSZ )
{
   vassert(bitQ == True || bitQ == False);
   vassert(bitSZ == True || bitSZ == False);
   if (bitQ && bitSZ) { // 2x64
      if (tyI)       *tyI       = Ity_I64;
      if (tyF)       *tyF       = Ity_F64;
      if (nLanes)    *nLanes    = 2;
      if (zeroUpper) *zeroUpper = False;
      if (arrSpec)   *arrSpec   = "2d";
      return True;
   }
   if (bitQ && !bitSZ) { // 4x32
      if (tyI)       *tyI       = Ity_I32;
      if (tyF)       *tyF       = Ity_F32;
      if (nLanes)    *nLanes    = 4;
      if (zeroUpper) *zeroUpper = False;
      if (arrSpec)   *arrSpec   = "4s";
      return True;
   }
   if (!bitQ && !bitSZ) { // 2x32
      if (tyI)       *tyI       = Ity_I32;
      if (tyF)       *tyF       = Ity_F32;
      if (nLanes)    *nLanes    = 2;
      if (zeroUpper) *zeroUpper = True;
      if (arrSpec)   *arrSpec   = "2s";
      return True;
   }
   // Else impliedly 1x64, which isn't allowed.
   return False;
}
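
/* Illustrative note (not part of the original source): eg bitQ=1, bitSZ=0
   describes a full-width 4x32 ("4s") operation, while bitQ=0, bitSZ=0 is
   the 2x32 ("2s") form in which the upper 64 bits of the destination must
   be zeroed -- hence zeroUpper is True only in that case. */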
/* Helper for decoding laneage for shift-style vector operations
   that involve an immediate shift amount. */
static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
                                    UInt immh, UInt immb )
{
   vassert(immh < (1<<4));
   vassert(immb < (1<<3));
   UInt immhb = (immh << 3) | immb;
   if (immh & 8) {
      if (shift)  *shift  = 128 - immhb;
      if (szBlg2) *szBlg2 = 3;
      return True;
   }
   if (immh & 4) {
      if (shift)  *shift  = 64 - immhb;
      if (szBlg2) *szBlg2 = 2;
      return True;
   }
   if (immh & 2) {
      if (shift)  *shift  = 32 - immhb;
      if (szBlg2) *szBlg2 = 1;
      return True;
   }
   if (immh & 1) {
      if (shift)  *shift  = 16 - immhb;
      if (szBlg2) *szBlg2 = 0;
      return True;
   }
   return False;
}
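
/* Illustrative note (not part of the original source): the highest set bit
   of immh selects the lane size, and the remaining immh:immb bits give the
   shift amount as (2*laneBits - immhb).  For example immh=0b0100,
   immb=0b010 gives immhb=34, hence szBlg2=2 (32-bit lanes) and
   shift = 64 - 34 = 30. */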
8179 /* Generate IR to fold all lanes of the V128 value in 'src' as
8180 characterised by the operator 'op', and return the result in the
8181 bottom bits of a V128, with all other bits set to zero. */
8182 static IRTemp
math_FOLDV ( IRTemp src
, IROp op
)
8184 /* The basic idea is to use repeated applications of Iop_CatEven*
8185 and Iop_CatOdd* operators to 'src' so as to clone each lane into
8186 a complete vector. Then fold all those vectors with 'op' and
8187 zero out all but the least significant lane. */
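   /* Illustrative note (not part of the original source): for a 16-lane
      fold, the CatOdd/CatEven cascade below first builds, for each lane i,
      a vector whose lanes are all copies of lane i (xAll0 .. xAllF); those
      16 cloned vectors are then combined pairwise with |op| in a four-level
      tree, so every lane of the final vector -- and in particular the
      bottom lane that survives the ZeroHI -- holds op applied to all 16
      original lanes. */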
8189 case Iop_Min8Sx16
: case Iop_Min8Ux16
:
8190 case Iop_Max8Sx16
: case Iop_Max8Ux16
: case Iop_Add8x16
: {
8191 /* NB: temp naming here is misleading -- the naming is for 8
8192 lanes of 16 bit, whereas what is being operated on is 16
8194 IRTemp x76543210
= src
;
8195 IRTemp x76547654
= newTempV128();
8196 IRTemp x32103210
= newTempV128();
8197 assign(x76547654
, mk_CatOddLanes64x2 (x76543210
, x76543210
));
8198 assign(x32103210
, mk_CatEvenLanes64x2(x76543210
, x76543210
));
8199 IRTemp x76767676
= newTempV128();
8200 IRTemp x54545454
= newTempV128();
8201 IRTemp x32323232
= newTempV128();
8202 IRTemp x10101010
= newTempV128();
8203 assign(x76767676
, mk_CatOddLanes32x4 (x76547654
, x76547654
));
8204 assign(x54545454
, mk_CatEvenLanes32x4(x76547654
, x76547654
));
8205 assign(x32323232
, mk_CatOddLanes32x4 (x32103210
, x32103210
));
8206 assign(x10101010
, mk_CatEvenLanes32x4(x32103210
, x32103210
));
8207 IRTemp x77777777
= newTempV128();
8208 IRTemp x66666666
= newTempV128();
8209 IRTemp x55555555
= newTempV128();
8210 IRTemp x44444444
= newTempV128();
8211 IRTemp x33333333
= newTempV128();
8212 IRTemp x22222222
= newTempV128();
8213 IRTemp x11111111
= newTempV128();
8214 IRTemp x00000000
= newTempV128();
8215 assign(x77777777
, mk_CatOddLanes16x8 (x76767676
, x76767676
));
8216 assign(x66666666
, mk_CatEvenLanes16x8(x76767676
, x76767676
));
8217 assign(x55555555
, mk_CatOddLanes16x8 (x54545454
, x54545454
));
8218 assign(x44444444
, mk_CatEvenLanes16x8(x54545454
, x54545454
));
8219 assign(x33333333
, mk_CatOddLanes16x8 (x32323232
, x32323232
));
8220 assign(x22222222
, mk_CatEvenLanes16x8(x32323232
, x32323232
));
8221 assign(x11111111
, mk_CatOddLanes16x8 (x10101010
, x10101010
));
8222 assign(x00000000
, mk_CatEvenLanes16x8(x10101010
, x10101010
));
8223 /* Naming not misleading after here. */
8224 IRTemp xAllF
= newTempV128();
8225 IRTemp xAllE
= newTempV128();
8226 IRTemp xAllD
= newTempV128();
8227 IRTemp xAllC
= newTempV128();
8228 IRTemp xAllB
= newTempV128();
8229 IRTemp xAllA
= newTempV128();
8230 IRTemp xAll9
= newTempV128();
8231 IRTemp xAll8
= newTempV128();
8232 IRTemp xAll7
= newTempV128();
8233 IRTemp xAll6
= newTempV128();
8234 IRTemp xAll5
= newTempV128();
8235 IRTemp xAll4
= newTempV128();
8236 IRTemp xAll3
= newTempV128();
8237 IRTemp xAll2
= newTempV128();
8238 IRTemp xAll1
= newTempV128();
8239 IRTemp xAll0
= newTempV128();
8240 assign(xAllF
, mk_CatOddLanes8x16 (x77777777
, x77777777
));
8241 assign(xAllE
, mk_CatEvenLanes8x16(x77777777
, x77777777
));
8242 assign(xAllD
, mk_CatOddLanes8x16 (x66666666
, x66666666
));
8243 assign(xAllC
, mk_CatEvenLanes8x16(x66666666
, x66666666
));
8244 assign(xAllB
, mk_CatOddLanes8x16 (x55555555
, x55555555
));
8245 assign(xAllA
, mk_CatEvenLanes8x16(x55555555
, x55555555
));
8246 assign(xAll9
, mk_CatOddLanes8x16 (x44444444
, x44444444
));
8247 assign(xAll8
, mk_CatEvenLanes8x16(x44444444
, x44444444
));
8248 assign(xAll7
, mk_CatOddLanes8x16 (x33333333
, x33333333
));
8249 assign(xAll6
, mk_CatEvenLanes8x16(x33333333
, x33333333
));
8250 assign(xAll5
, mk_CatOddLanes8x16 (x22222222
, x22222222
));
8251 assign(xAll4
, mk_CatEvenLanes8x16(x22222222
, x22222222
));
8252 assign(xAll3
, mk_CatOddLanes8x16 (x11111111
, x11111111
));
8253 assign(xAll2
, mk_CatEvenLanes8x16(x11111111
, x11111111
));
8254 assign(xAll1
, mk_CatOddLanes8x16 (x00000000
, x00000000
));
8255 assign(xAll0
, mk_CatEvenLanes8x16(x00000000
, x00000000
));
8256 IRTemp maxFE
= newTempV128();
8257 IRTemp maxDC
= newTempV128();
8258 IRTemp maxBA
= newTempV128();
8259 IRTemp max98
= newTempV128();
8260 IRTemp max76
= newTempV128();
8261 IRTemp max54
= newTempV128();
8262 IRTemp max32
= newTempV128();
8263 IRTemp max10
= newTempV128();
8264 assign(maxFE
, binop(op
, mkexpr(xAllF
), mkexpr(xAllE
)));
8265 assign(maxDC
, binop(op
, mkexpr(xAllD
), mkexpr(xAllC
)));
8266 assign(maxBA
, binop(op
, mkexpr(xAllB
), mkexpr(xAllA
)));
8267 assign(max98
, binop(op
, mkexpr(xAll9
), mkexpr(xAll8
)));
8268 assign(max76
, binop(op
, mkexpr(xAll7
), mkexpr(xAll6
)));
8269 assign(max54
, binop(op
, mkexpr(xAll5
), mkexpr(xAll4
)));
8270 assign(max32
, binop(op
, mkexpr(xAll3
), mkexpr(xAll2
)));
8271 assign(max10
, binop(op
, mkexpr(xAll1
), mkexpr(xAll0
)));
8272 IRTemp maxFEDC
= newTempV128();
8273 IRTemp maxBA98
= newTempV128();
8274 IRTemp max7654
= newTempV128();
8275 IRTemp max3210
= newTempV128();
8276 assign(maxFEDC
, binop(op
, mkexpr(maxFE
), mkexpr(maxDC
)));
8277 assign(maxBA98
, binop(op
, mkexpr(maxBA
), mkexpr(max98
)));
8278 assign(max7654
, binop(op
, mkexpr(max76
), mkexpr(max54
)));
8279 assign(max3210
, binop(op
, mkexpr(max32
), mkexpr(max10
)));
8280 IRTemp maxFEDCBA98
= newTempV128();
8281 IRTemp max76543210
= newTempV128();
8282 assign(maxFEDCBA98
, binop(op
, mkexpr(maxFEDC
), mkexpr(maxBA98
)));
8283 assign(max76543210
, binop(op
, mkexpr(max7654
), mkexpr(max3210
)));
8284 IRTemp maxAllLanes
= newTempV128();
8285 assign(maxAllLanes
, binop(op
, mkexpr(maxFEDCBA98
),
8286 mkexpr(max76543210
)));
8287 IRTemp res
= newTempV128();
8288 assign(res
, unop(Iop_ZeroHI120ofV128
, mkexpr(maxAllLanes
)));
8291 case Iop_Min16Sx8
: case Iop_Min16Ux8
:
8292 case Iop_Max16Sx8
: case Iop_Max16Ux8
: case Iop_Add16x8
: {
8293 IRTemp x76543210
= src
;
8294 IRTemp x76547654
= newTempV128();
8295 IRTemp x32103210
= newTempV128();
8296 assign(x76547654
, mk_CatOddLanes64x2 (x76543210
, x76543210
));
8297 assign(x32103210
, mk_CatEvenLanes64x2(x76543210
, x76543210
));
8298 IRTemp x76767676
= newTempV128();
8299 IRTemp x54545454
= newTempV128();
8300 IRTemp x32323232
= newTempV128();
8301 IRTemp x10101010
= newTempV128();
8302 assign(x76767676
, mk_CatOddLanes32x4 (x76547654
, x76547654
));
8303 assign(x54545454
, mk_CatEvenLanes32x4(x76547654
, x76547654
));
8304 assign(x32323232
, mk_CatOddLanes32x4 (x32103210
, x32103210
));
8305 assign(x10101010
, mk_CatEvenLanes32x4(x32103210
, x32103210
));
8306 IRTemp x77777777
= newTempV128();
8307 IRTemp x66666666
= newTempV128();
8308 IRTemp x55555555
= newTempV128();
8309 IRTemp x44444444
= newTempV128();
8310 IRTemp x33333333
= newTempV128();
8311 IRTemp x22222222
= newTempV128();
8312 IRTemp x11111111
= newTempV128();
8313 IRTemp x00000000
= newTempV128();
8314 assign(x77777777
, mk_CatOddLanes16x8 (x76767676
, x76767676
));
8315 assign(x66666666
, mk_CatEvenLanes16x8(x76767676
, x76767676
));
8316 assign(x55555555
, mk_CatOddLanes16x8 (x54545454
, x54545454
));
8317 assign(x44444444
, mk_CatEvenLanes16x8(x54545454
, x54545454
));
8318 assign(x33333333
, mk_CatOddLanes16x8 (x32323232
, x32323232
));
8319 assign(x22222222
, mk_CatEvenLanes16x8(x32323232
, x32323232
));
8320 assign(x11111111
, mk_CatOddLanes16x8 (x10101010
, x10101010
));
8321 assign(x00000000
, mk_CatEvenLanes16x8(x10101010
, x10101010
));
8322 IRTemp max76
= newTempV128();
8323 IRTemp max54
= newTempV128();
8324 IRTemp max32
= newTempV128();
8325 IRTemp max10
= newTempV128();
8326 assign(max76
, binop(op
, mkexpr(x77777777
), mkexpr(x66666666
)));
8327 assign(max54
, binop(op
, mkexpr(x55555555
), mkexpr(x44444444
)));
8328 assign(max32
, binop(op
, mkexpr(x33333333
), mkexpr(x22222222
)));
8329 assign(max10
, binop(op
, mkexpr(x11111111
), mkexpr(x00000000
)));
8330 IRTemp max7654
= newTempV128();
8331 IRTemp max3210
= newTempV128();
8332 assign(max7654
, binop(op
, mkexpr(max76
), mkexpr(max54
)));
8333 assign(max3210
, binop(op
, mkexpr(max32
), mkexpr(max10
)));
8334 IRTemp max76543210
= newTempV128();
8335 assign(max76543210
, binop(op
, mkexpr(max7654
), mkexpr(max3210
)));
8336 IRTemp res
= newTempV128();
8337 assign(res
, unop(Iop_ZeroHI112ofV128
, mkexpr(max76543210
)));
8340 case Iop_Max32Fx4
: case Iop_Min32Fx4
:
8341 case Iop_Min32Sx4
: case Iop_Min32Ux4
:
8342 case Iop_Max32Sx4
: case Iop_Max32Ux4
: case Iop_Add32x4
: {
8344 IRTemp x3232
= newTempV128();
8345 IRTemp x1010
= newTempV128();
8346 assign(x3232
, mk_CatOddLanes64x2 (x3210
, x3210
));
8347 assign(x1010
, mk_CatEvenLanes64x2(x3210
, x3210
));
8348 IRTemp x3333
= newTempV128();
8349 IRTemp x2222
= newTempV128();
8350 IRTemp x1111
= newTempV128();
8351 IRTemp x0000
= newTempV128();
8352 assign(x3333
, mk_CatOddLanes32x4 (x3232
, x3232
));
8353 assign(x2222
, mk_CatEvenLanes32x4(x3232
, x3232
));
8354 assign(x1111
, mk_CatOddLanes32x4 (x1010
, x1010
));
8355 assign(x0000
, mk_CatEvenLanes32x4(x1010
, x1010
));
8356 IRTemp max32
= newTempV128();
8357 IRTemp max10
= newTempV128();
8358 assign(max32
, binop(op
, mkexpr(x3333
), mkexpr(x2222
)));
8359 assign(max10
, binop(op
, mkexpr(x1111
), mkexpr(x0000
)));
8360 IRTemp max3210
= newTempV128();
8361 assign(max3210
, binop(op
, mkexpr(max32
), mkexpr(max10
)));
8362 IRTemp res
= newTempV128();
8363 assign(res
, unop(Iop_ZeroHI96ofV128
, mkexpr(max3210
)));
8368 IRTemp x00
= newTempV128();
8369 IRTemp x11
= newTempV128();
8370 assign(x11
, binop(Iop_InterleaveHI64x2
, mkexpr(x10
), mkexpr(x10
)));
8371 assign(x00
, binop(Iop_InterleaveLO64x2
, mkexpr(x10
), mkexpr(x10
)));
8372 IRTemp max10
= newTempV128();
8373 assign(max10
, binop(op
, mkexpr(x11
), mkexpr(x00
)));
8374 IRTemp res
= newTempV128();
8375 assign(res
, unop(Iop_ZeroHI64ofV128
, mkexpr(max10
)));
8384 /* Generate IR for TBL and TBX. This deals with the 128 bit case
8386 static IRTemp
math_TBL_TBX ( IRTemp tab
[4], UInt len
, IRTemp src
,
8389 vassert(len
>= 0 && len
<= 3);
8391 /* Generate some useful constants as concisely as possible. */
8392 IRTemp half15
= newTemp(Ity_I64
);
8393 assign(half15
, mkU64(0x0F0F0F0F0F0F0F0FULL
));
8394 IRTemp half16
= newTemp(Ity_I64
);
8395 assign(half16
, mkU64(0x1010101010101010ULL
));
8398 IRTemp allZero
= newTempV128();
8399 assign(allZero
, mkV128(0x0000));
8400 /* A vector containing 15 in each 8-bit lane */
8401 IRTemp all15
= newTempV128();
8402 assign(all15
, binop(Iop_64HLtoV128
, mkexpr(half15
), mkexpr(half15
)));
8403 /* A vector containing 16 in each 8-bit lane */
8404 IRTemp all16
= newTempV128();
8405 assign(all16
, binop(Iop_64HLtoV128
, mkexpr(half16
), mkexpr(half16
)));
8406 /* A vector containing 32 in each 8-bit lane */
8407 IRTemp all32
= newTempV128();
8408 assign(all32
, binop(Iop_Add8x16
, mkexpr(all16
), mkexpr(all16
)));
8409 /* A vector containing 48 in each 8-bit lane */
8410 IRTemp all48
= newTempV128();
8411 assign(all48
, binop(Iop_Add8x16
, mkexpr(all16
), mkexpr(all32
)));
8412 /* A vector containing 64 in each 8-bit lane */
8413 IRTemp all64
= newTempV128();
8414 assign(all64
, binop(Iop_Add8x16
, mkexpr(all32
), mkexpr(all32
)));
8416 /* Group the 16/32/48/64 vectors so as to be indexable. */
8417 IRTemp allXX
[4] = { all16
, all32
, all48
, all64
};
8419 /* Compute the result for each table vector, with zeroes in places
8420 where the index values are out of range, and OR them into the
8422 IRTemp running_result
= newTempV128();
8423 assign(running_result
, mkV128(0));
8426 for (tabent
= 0; tabent
<= len
; tabent
++) {
8427 vassert(tabent
>= 0 && tabent
< 4);
8428 IRTemp bias
= newTempV128();
8430 mkexpr(tabent
== 0 ? allZero
: allXX
[tabent
-1]));
8431 IRTemp biased_indices
= newTempV128();
8432 assign(biased_indices
,
8433 binop(Iop_Sub8x16
, mkexpr(src
), mkexpr(bias
)));
8434 IRTemp valid_mask
= newTempV128();
8436 binop(Iop_CmpGT8Ux16
, mkexpr(all16
), mkexpr(biased_indices
)));
8437 IRTemp safe_biased_indices
= newTempV128();
8438 assign(safe_biased_indices
,
8439 binop(Iop_AndV128
, mkexpr(biased_indices
), mkexpr(all15
)));
8440 IRTemp results_or_junk
= newTempV128();
8441 assign(results_or_junk
,
8442 binop(Iop_Perm8x16
, mkexpr(tab
[tabent
]),
8443 mkexpr(safe_biased_indices
)));
8444 IRTemp results_or_zero
= newTempV128();
8445 assign(results_or_zero
,
8446 binop(Iop_AndV128
, mkexpr(results_or_junk
), mkexpr(valid_mask
)));
8447 /* And OR that into the running result. */
8448 IRTemp tmp
= newTempV128();
8449 assign(tmp
, binop(Iop_OrV128
, mkexpr(results_or_zero
),
8450 mkexpr(running_result
)));
8451 running_result
= tmp
;
8454 /* So now running_result holds the overall result where the indices
8455 are in range, and zero in out-of-range lanes. Now we need to
8456 compute an overall validity mask and use this to copy in the
8457 lanes in the oor_values for out of range indices. This is
8458 unnecessary for TBL but will get folded out by iropt, so we lean
8459 on that and generate the same code for TBL and TBX here. */
8460 IRTemp overall_valid_mask
= newTempV128();
8461 assign(overall_valid_mask
,
8462 binop(Iop_CmpGT8Ux16
, mkexpr(allXX
[len
]), mkexpr(src
)));
8463 IRTemp result
= newTempV128();
8466 mkexpr(running_result
),
8469 unop(Iop_NotV128
, mkexpr(overall_valid_mask
)))));
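/* Illustrative note (not part of the original source): for "tbl" with two
   table registers (len=1), a source index of 17 is biased by 16 when the
   second table is processed, giving 1; that biased index passes the
   per-table range check (< 16), so the result byte is lane 1 of the second
   table register.  An index of 32 or more fails every per-table check and
   also the overall validity check, so that lane receives 0 for TBL or the
   old destination byte for TBX. */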
/* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
   an op which takes two I64s and produces a V128.  That is, a widening
   operator.  Generate IR which applies |opI64x2toV128| to either the
   lower (if |is2| is False) or upper (if |is2| is True) halves of
   |argL| and |argR|, and return the value in a new IRTemp.
*/
static
IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
                                   IRExpr* argL, IRExpr* argR )
{
   IRTemp res   = newTempV128();
   IROp   slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
   assign(res, binop(opI64x2toV128, unop(slice, argL),
                                    unop(slice, argR)));
   return res;
}
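
/* Illustrative note (not part of the original source): this is the shape
   used by the widening "long" instruction pairs -- the base form consumes
   the lower 64-bit halves of its sources and the "2" form (eg smull2
   versus smull) consumes the upper halves, so callers only need to pass
   |is2|. */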
/* Generate signed/unsigned absolute difference vector IR. */
static
IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
{
   vassert(size <= 3);
   IRTemp argL = newTempV128();
   IRTemp argR = newTempV128();
   IRTemp msk  = newTempV128();
   IRTemp res  = newTempV128();
   assign(argL, argLE);
   assign(argR, argRE);
   assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
                     mkexpr(argL), mkexpr(argR)));
   assign(res,
          binop(Iop_OrV128,
                binop(Iop_AndV128,
                      binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
                      mkexpr(msk)),
                binop(Iop_AndV128,
                      binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
                      unop(Iop_NotV128, mkexpr(msk)))));
   return res;
}
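
/* Illustrative note (not part of the original source): |msk| is all-ones in
   each lane where argL > argR, so the OrV128 of the two masked subtractions
   selects argL-argR in those lanes and argR-argL elsewhere.  Eg for
   unsigned byte lanes argL=3, argR=10 the mask lane is 0x00 and the result
   lane is 10-3 = 7. */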
8517 /* Generate IR that takes a V128 and sign- or zero-widens
8518 either the lower or upper set of lanes to twice-as-wide,
8519 resulting in a new V128 value. */
8521 IRTemp
math_WIDEN_LO_OR_HI_LANES ( Bool zWiden
, Bool fromUpperHalf
,
8522 UInt sizeNarrow
, IRExpr
* srcE
)
8524 IRTemp src
= newTempV128();
8525 IRTemp res
= newTempV128();
8527 switch (sizeNarrow
) {
8530 binop(zWiden
? Iop_ShrN64x2
: Iop_SarN64x2
,
8531 binop(fromUpperHalf
? Iop_InterleaveHI32x4
8532 : Iop_InterleaveLO32x4
,
8539 binop(zWiden
? Iop_ShrN32x4
: Iop_SarN32x4
,
8540 binop(fromUpperHalf
? Iop_InterleaveHI16x8
8541 : Iop_InterleaveLO16x8
,
8548 binop(zWiden
? Iop_ShrN16x8
: Iop_SarN16x8
,
8549 binop(fromUpperHalf
? Iop_InterleaveHI8x16
8550 : Iop_InterleaveLO8x16
,
8562 /* Generate IR that takes a V128 and sign- or zero-widens
8563 either the even or odd lanes to twice-as-wide,
8564 resulting in a new V128 value. */
8566 IRTemp
math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden
, Bool fromOdd
,
8567 UInt sizeNarrow
, IRExpr
* srcE
)
8569 IRTemp src
= newTempV128();
8570 IRTemp res
= newTempV128();
8571 IROp opSAR
= mkVecSARN(sizeNarrow
+1);
8572 IROp opSHR
= mkVecSHRN(sizeNarrow
+1);
8573 IROp opSHL
= mkVecSHLN(sizeNarrow
+1);
8574 IROp opSxR
= zWiden
? opSHR
: opSAR
;
8576 switch (sizeNarrow
) {
8577 case X10
: amt
= 32; break;
8578 case X01
: amt
= 16; break;
8579 case X00
: amt
= 8; break;
8580 default: vassert(0);
8584 assign(res
, binop(opSxR
, mkexpr(src
), mkU8(amt
)));
8586 assign(res
, binop(opSxR
, binop(opSHL
, mkexpr(src
), mkU8(amt
)),
/* Generate IR that takes two V128s and narrows (takes lower half)
   of each lane, producing a single V128 value. */
static
IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
{
   IRTemp res = newTempV128();
   assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
                     mkexpr(argHi), mkexpr(argLo)));
   return res;
}
8605 /* Return a temp which holds the vector dup of the lane of width
8606 (1 << size) obtained from src[laneNo]. */
8608 IRTemp
math_DUP_VEC_ELEM ( IRExpr
* src
, UInt size
, UInt laneNo
)
8611 /* Normalise |laneNo| so it is of the form
8612 x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
8613 This puts the bits we want to inspect at constant offsets
8614 regardless of the value of |size|.
8616 UInt ix
= laneNo
<< size
;
8618 IROp ops
[4] = { Iop_INVALID
, Iop_INVALID
, Iop_INVALID
, Iop_INVALID
};
8621 ops
[0] = (ix
& 1) ? Iop_CatOddLanes8x16
: Iop_CatEvenLanes8x16
;
8624 ops
[1] = (ix
& 2) ? Iop_CatOddLanes16x8
: Iop_CatEvenLanes16x8
;
8627 ops
[2] = (ix
& 4) ? Iop_CatOddLanes32x4
: Iop_CatEvenLanes32x4
;
8630 ops
[3] = (ix
& 8) ? Iop_InterleaveHI64x2
: Iop_InterleaveLO64x2
;
8635 IRTemp res
= newTempV128();
8638 for (i
= 3; i
>= 0; i
--) {
8639 if (ops
[i
] == Iop_INVALID
)
8641 IRTemp tmp
= newTempV128();
8642 assign(tmp
, binop(ops
[i
], mkexpr(res
), mkexpr(res
)));
8649 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
8650 selector encoded as shown below. Return a new V128 holding the
8651 selected lane from |srcV| dup'd out to V128, and also return the
8652 lane number, log2 of the lane size in bytes, and width-character via
8653 *laneNo, *laneSzLg2 and *laneCh respectively. It may be that imm5
8654 is an invalid selector, in which case return
8655 IRTemp_INVALID, 0, 0 and '?' respectively.
8657 imm5 = xxxx1 signifies .b[xxxx]
8664 IRTemp
handle_DUP_VEC_ELEM ( /*OUT*/UInt
* laneNo
,
8665 /*OUT*/UInt
* laneSzLg2
, /*OUT*/HChar
* laneCh
,
8666 IRExpr
* srcV
, UInt imm5
)
8673 *laneNo
= (imm5
>> 1) & 15;
8677 else if (imm5
& 2) {
8678 *laneNo
= (imm5
>> 2) & 7;
8682 else if (imm5
& 4) {
8683 *laneNo
= (imm5
>> 3) & 3;
8687 else if (imm5
& 8) {
8688 *laneNo
= (imm5
>> 4) & 1;
8694 return IRTemp_INVALID
;
8697 return math_DUP_VEC_ELEM(srcV
, *laneSzLg2
, *laneNo
);
8701 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8703 IRTemp
math_VEC_DUP_IMM ( UInt size
, ULong imm
)
8705 IRType ty
= Ity_INVALID
;
8706 IRTemp rcS
= IRTemp_INVALID
;
8709 vassert(imm
<= 0xFFFFULL
);
8711 rcS
= newTemp(ty
); assign(rcS
, mkU16( (UShort
)imm
));
8714 vassert(imm
<= 0xFFFFFFFFULL
);
8716 rcS
= newTemp(ty
); assign(rcS
, mkU32( (UInt
)imm
));
8720 rcS
= newTemp(ty
); assign(rcS
, mkU64(imm
)); break;
8724 IRTemp rcV
= math_DUP_TO_V128(rcS
, ty
);
/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
   and the upper can contain any value -- it is ignored.  If |is2| is False,
   generate IR to put |new64| in the lower half of vector reg |dd| and zero
   the upper half.  If |is2| is True, generate IR to put |new64| in the upper
   half of vector reg |dd| and leave the lower half unchanged.  This
   simulates the behaviour of the "foo/foo2" instructions in which the
   destination is half the width of sources, for example addhn/addhn2.
*/
static
void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
{
   if (is2) {
      /* Get the old contents of Vdd, zero the upper half, and replace
         it with |new64|. */
      IRTemp t_zero_oldLO = newTempV128();
      assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
      IRTemp t_newHI_zero = newTempV128();
      assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
                                                       mkV128(0x0000)));
      IRTemp res = newTempV128();
      assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
                                    mkexpr(t_newHI_zero)));
      putQReg128(dd, mkexpr(res));
   } else {
      /* This is simple. */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
   }
}
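
/* Illustrative note (not part of the original source): for
   "addhn v0.8b, v1.8h, v2.8h" the narrowed result lands in the low half of
   v0 and the high half is zeroed (is2=False), whereas
   "addhn2 v0.16b, v1.8h, v2.8h" writes only the high half of v0 and leaves
   the low half intact (is2=True). */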
/* Compute vector SQABS at lane size |size| for |srcE|, returning
   the q result in |*qabs| and the normal result in |*nabs|. */
static
void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
                  IRExpr* srcE, UInt size )
{
   IRTemp src, mask, maskn, nsub, qsub;
   src = mask = maskn = nsub = qsub = IRTemp_INVALID;
   newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
   assign(src,   srcE);
   assign(mask,  binop(mkVecCMPGTS(size),  mkV128(0x0000), mkexpr(src)));
   assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
   assign(nsub,  binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(qsub,  binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
   assign(*nabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
   assign(*qabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
}
/* Compute vector SQNEG at lane size |size| for |srcE|, returning
   the q result in |*qneg| and the normal result in |*nneg|. */
static
void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
                  IRExpr* srcE, UInt size )
{
   IRTemp src = IRTemp_INVALID;
   newTempsV128_3(&src, nneg, qneg);
   assign(src, srcE);
   assign(*nneg, binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
}
/* Zero all except the least significant lane of |srcE|, where |size|
   indicates the lane size in the usual way. */
static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
{
   vassert(size < 4);
   IRTemp t = newTempV128();
   assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
   return t;
}
8807 /* Generate IR to compute vector widening MULL from either the lower
8808 (is2==False) or upper (is2==True) halves of vecN and vecM. The
8809 widening multiplies are unsigned when isU==True and signed when
8810 isU==False. |size| is the narrow lane size indication. Optionally,
8811 the product may be added to or subtracted from vecD, at the wide lane
8812 size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
8813 is 'm' (only multiply) then the accumulate part does not happen, and
8814 |vecD| is expected to == IRTemp_INVALID.
8816 Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
8817 are allowed. The result is returned in a new IRTemp, which is
8818 returned in *res. */
8820 void math_MULL_ACC ( /*OUT*/IRTemp
* res
,
8821 Bool is2
, Bool isU
, UInt size
, HChar mas
,
8822 IRTemp vecN
, IRTemp vecM
, IRTemp vecD
)
8824 vassert(res
&& *res
== IRTemp_INVALID
);
8826 vassert(mas
== 'm' || mas
== 'a' || mas
== 's');
8827 if (mas
== 'm') vassert(vecD
== IRTemp_INVALID
);
8828 IROp mulOp
= isU
? mkVecMULLU(size
) : mkVecMULLS(size
);
8829 IROp accOp
= (mas
== 'a') ? mkVecADD(size
+1)
8830 : (mas
== 's' ? mkVecSUB(size
+1)
8832 IRTemp mul
= math_BINARY_WIDENING_V128(is2
, mulOp
,
8833 mkexpr(vecN
), mkexpr(vecM
));
8834 *res
= newTempV128();
8835 assign(*res
, mas
== 'm' ? mkexpr(mul
)
8836 : binop(accOp
, mkexpr(vecD
), mkexpr(mul
)));
8840 /* Same as math_MULL_ACC, except the multiply is signed widening,
8841 the multiplied value is then doubled, before being added to or
8842 subtracted from the accumulated value. And everything is
8843 saturated. In all cases, saturation residuals are returned
8844 via (sat1q, sat1n), and in the accumulate cases,
8845 via (sat2q, sat2n) too. All results are returned in new temporaries.
8846 In the no-accumulate case, *sat2q and *sat2n are never instantiated,
8847 so the caller can tell this has happened. */
8849 void math_SQDMULL_ACC ( /*OUT*/IRTemp
* res
,
8850 /*OUT*/IRTemp
* sat1q
, /*OUT*/IRTemp
* sat1n
,
8851 /*OUT*/IRTemp
* sat2q
, /*OUT*/IRTemp
* sat2n
,
8852 Bool is2
, UInt size
, HChar mas
,
8853 IRTemp vecN
, IRTemp vecM
, IRTemp vecD
)
8856 vassert(mas
== 'm' || mas
== 'a' || mas
== 's');
8858 sat1q = vecN.D[is2] *sq vecM.d[is2] *q 2
8859 sat1n = vecN.D[is2] *s vecM.d[is2] * 2
8860 IOW take either the low or high halves of vecN and vecM, signed widen,
8861 multiply, double that, and signedly saturate. Also compute the same
8862 but without saturation.
8864 vassert(sat2q
&& *sat2q
== IRTemp_INVALID
);
8865 vassert(sat2n
&& *sat2n
== IRTemp_INVALID
);
8866 newTempsV128_3(sat1q
, sat1n
, res
);
8867 IRTemp tq
= math_BINARY_WIDENING_V128(is2
, mkVecQDMULLS(size
),
8868 mkexpr(vecN
), mkexpr(vecM
));
8869 IRTemp tn
= math_BINARY_WIDENING_V128(is2
, mkVecMULLS(size
),
8870 mkexpr(vecN
), mkexpr(vecM
));
8871 assign(*sat1q
, mkexpr(tq
));
8872 assign(*sat1n
, binop(mkVecADD(size
+1), mkexpr(tn
), mkexpr(tn
)));
8874 /* If there is no accumulation, the final result is sat1q,
8875 and there's no assignment to sat2q or sat2n. */
8877 assign(*res
, mkexpr(*sat1q
));
8882 sat2q = vecD +sq/-sq sat1q
8883 sat2n = vecD +/- sat1n
8886 newTempsV128_2(sat2q
, sat2n
);
8887 assign(*sat2q
, binop(mas
== 'a' ? mkVecQADDS(size
+1) : mkVecQSUBS(size
+1),
8888 mkexpr(vecD
), mkexpr(*sat1q
)));
8889 assign(*sat2n
, binop(mas
== 'a' ? mkVecADD(size
+1) : mkVecSUB(size
+1),
8890 mkexpr(vecD
), mkexpr(*sat1n
)));
8891 assign(*res
, mkexpr(*sat2q
));
8895 /* Generate IR for widening signed vector multiplies. The operands
8896 have their lane width signedly widened, and they are then multiplied
8897 at the wider width, returning results in two new IRTemps. */
8899 void math_MULLS ( /*OUT*/IRTemp
* resHI
, /*OUT*/IRTemp
* resLO
,
8900 UInt sizeNarrow
, IRTemp argL
, IRTemp argR
)
8902 vassert(sizeNarrow
<= 2);
8903 newTempsV128_2(resHI
, resLO
);
8904 IRTemp argLhi
= newTemp(Ity_I64
);
8905 IRTemp argLlo
= newTemp(Ity_I64
);
8906 IRTemp argRhi
= newTemp(Ity_I64
);
8907 IRTemp argRlo
= newTemp(Ity_I64
);
8908 assign(argLhi
, unop(Iop_V128HIto64
, mkexpr(argL
)));
8909 assign(argLlo
, unop(Iop_V128to64
, mkexpr(argL
)));
8910 assign(argRhi
, unop(Iop_V128HIto64
, mkexpr(argR
)));
8911 assign(argRlo
, unop(Iop_V128to64
, mkexpr(argR
)));
8912 IROp opMulls
= mkVecMULLS(sizeNarrow
);
8913 assign(*resHI
, binop(opMulls
, mkexpr(argLhi
), mkexpr(argRhi
)));
8914 assign(*resLO
, binop(opMulls
, mkexpr(argLlo
), mkexpr(argRlo
)));
8918 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
8919 double that, possibly add a rounding constant (R variants), and take
8922 void math_SQDMULH ( /*OUT*/IRTemp
* res
,
8923 /*OUT*/IRTemp
* sat1q
, /*OUT*/IRTemp
* sat1n
,
8924 Bool isR
, UInt size
, IRTemp vN
, IRTemp vM
)
8926 vassert(size
== X01
|| size
== X10
); /* s or h only */
8928 newTempsV128_3(res
, sat1q
, sat1n
);
8930 IRTemp mullsHI
= IRTemp_INVALID
, mullsLO
= IRTemp_INVALID
;
8931 math_MULLS(&mullsHI
, &mullsLO
, size
, vN
, vM
);
8933 IRTemp addWide
= mkVecADD(size
+1);
8936 assign(*sat1q
, binop(mkVecQRDMULHIS(size
), mkexpr(vN
), mkexpr(vM
)));
8938 Int rcShift
= size
== X01
? 15 : 31;
8939 IRTemp roundConst
= math_VEC_DUP_IMM(size
+1, 1ULL << rcShift
);
8941 binop(mkVecCATODDLANES(size
),
8943 binop(addWide
, mkexpr(mullsHI
), mkexpr(mullsHI
)),
8944 mkexpr(roundConst
)),
8946 binop(addWide
, mkexpr(mullsLO
), mkexpr(mullsLO
)),
8947 mkexpr(roundConst
))));
8949 assign(*sat1q
, binop(mkVecQDMULHIS(size
), mkexpr(vN
), mkexpr(vM
)));
8952 binop(mkVecCATODDLANES(size
),
8953 binop(addWide
, mkexpr(mullsHI
), mkexpr(mullsHI
)),
8954 binop(addWide
, mkexpr(mullsLO
), mkexpr(mullsLO
))));
8957 assign(*res
, mkexpr(*sat1q
));
8960 /* Generate IR for SQRDMLAH and SQRDMLSH: signedly wideningly multiply,
8961 double, add a rounding constant, take the high half and accumulate. */
8963 void math_SQRDMLAH ( /*OUT*/IRTemp
* res
, /*OUT*/IRTemp
* res_nosat
, Bool isAdd
,
8964 UInt size
, IRTemp vD
, IRTemp vN
, IRTemp vM
)
8966 vassert(size
== X01
|| size
== X10
); /* s or h only */
8968 /* SQRDMLAH = SQADD(A, SQRDMULH(B, C)) */
8970 IRTemp mul
, mul_nosat
, dummy
;
8971 mul
= mul_nosat
= dummy
= IRTemp_INVALID
;
8972 math_SQDMULH(&mul
, &dummy
, &mul_nosat
, True
/*R*/, size
, vN
, vM
);
8974 IROp op
= isAdd
? mkVecADD(size
) : mkVecSUB(size
);
8975 IROp qop
= isAdd
? mkVecQADDS(size
) : mkVecQSUBS(size
);
8976 newTempsV128_2(res
, res_nosat
);
8977 assign(*res
, binop(qop
, mkexpr(vD
), mkexpr(mul
)));
8978 assign(*res_nosat
, binop(op
, mkexpr(vD
), mkexpr(mul_nosat
)));
8982 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in
8983 a new temp in *res, and the Q difference pair in new temps in
8984 *qDiff1 and *qDiff2 respectively. |nm| denotes which of the
8985 three operations it is. */
8987 void math_QSHL_IMM ( /*OUT*/IRTemp
* res
,
8988 /*OUT*/IRTemp
* qDiff1
, /*OUT*/IRTemp
* qDiff2
,
8989 IRTemp src
, UInt size
, UInt shift
, const HChar
* nm
)
8992 UInt laneBits
= 8 << size
;
8993 vassert(shift
< laneBits
);
8994 newTempsV128_3(res
, qDiff1
, qDiff2
);
8995 IRTemp z128
= newTempV128();
8996 assign(z128
, mkV128(0x0000));
8999 if (vex_streq(nm
, "uqshl")) {
9000 IROp qop
= mkVecQSHLNSATUU(size
);
9001 assign(*res
, binop(qop
, mkexpr(src
), mkU8(shift
)));
9003 /* No shift means no saturation. */
9004 assign(*qDiff1
, mkexpr(z128
));
9005 assign(*qDiff2
, mkexpr(z128
));
9007 /* Saturation has occurred if any of the shifted-out bits are
9008 nonzero. We get the shifted-out bits by right-shifting the
9010 UInt rshift
= laneBits
- shift
;
9011 vassert(rshift
>= 1 && rshift
< laneBits
);
9012 assign(*qDiff1
, binop(mkVecSHRN(size
), mkexpr(src
), mkU8(rshift
)));
9013 assign(*qDiff2
, mkexpr(z128
));
9019 if (vex_streq(nm
, "sqshl")) {
9020 IROp qop
= mkVecQSHLNSATSS(size
);
9021 assign(*res
, binop(qop
, mkexpr(src
), mkU8(shift
)));
9023 /* No shift means no saturation. */
9024 assign(*qDiff1
, mkexpr(z128
));
9025 assign(*qDiff2
, mkexpr(z128
));
9027 /* Saturation has occurred if any of the shifted-out bits are
9028 different from the top bit of the original value. */
9029 UInt rshift
= laneBits
- 1 - shift
;
9030 vassert(rshift
>= 0 && rshift
< laneBits
-1);
9031 /* qDiff1 is the shifted out bits, and the top bit of the original
9032 value, preceded by zeroes. */
9033 assign(*qDiff1
, binop(mkVecSHRN(size
), mkexpr(src
), mkU8(rshift
)));
9034 /* qDiff2 is the top bit of the original value, cloned the
9035 correct number of times. */
9036 assign(*qDiff2
, binop(mkVecSHRN(size
),
9037 binop(mkVecSARN(size
), mkexpr(src
),
9040 /* This also succeeds in comparing the top bit of the original
9041 value to itself, which is a bit stupid, but not wrong. */
9047 if (vex_streq(nm
, "sqshlu")) {
9048 IROp qop
= mkVecQSHLNSATSU(size
);
9049 assign(*res
, binop(qop
, mkexpr(src
), mkU8(shift
)));
9051 /* If there's no shift, saturation depends on the top bit
9053 assign(*qDiff1
, binop(mkVecSHRN(size
), mkexpr(src
), mkU8(laneBits
-1)));
9054 assign(*qDiff2
, mkexpr(z128
));
9056 /* Saturation has occurred if any of the shifted-out bits are
9057 nonzero. We get the shifted-out bits by right-shifting the
9059 UInt rshift
= laneBits
- shift
;
9060 vassert(rshift
>= 1 && rshift
< laneBits
);
9061 assign(*qDiff1
, binop(mkVecSHRN(size
), mkexpr(src
), mkU8(rshift
)));
9062 assign(*qDiff2
, mkexpr(z128
));
9071 /* Generate IR to do SRHADD and URHADD. */
9073 IRTemp
math_RHADD ( UInt size
, Bool isU
, IRTemp aa
, IRTemp bb
)
9076 (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
9079 IROp opSHR
= isU
? mkVecSHRN(size
) : mkVecSARN(size
);
9080 IROp opADD
= mkVecADD(size
);
9081 /* The only tricky bit is to generate the correct vector 1 constant. */
9082 const ULong ones64
[4]
9083 = { 0x0101010101010101ULL
, 0x0001000100010001ULL
,
9084 0x0000000100000001ULL
, 0x0000000000000001ULL
};
9085 IRTemp imm64
= newTemp(Ity_I64
);
9086 assign(imm64
, mkU64(ones64
[size
]));
9087 IRTemp vecOne
= newTempV128();
9088 assign(vecOne
, binop(Iop_64HLtoV128
, mkexpr(imm64
), mkexpr(imm64
)));
9089 IRTemp scaOne
= newTemp(Ity_I8
);
9090 assign(scaOne
, mkU8(1));
9091 IRTemp res
= newTempV128();
9094 binop(opSHR
, mkexpr(aa
), mkexpr(scaOne
)),
9096 binop(opSHR
, mkexpr(bb
), mkexpr(scaOne
)),
9100 binop(Iop_AndV128
, mkexpr(aa
),
9102 binop(Iop_AndV128
, mkexpr(bb
),
/* QCFLAG tracks the SIMD sticky saturation status.  Update the status
   thusly: if, after application of |opZHI| to both |qres| and |nres|,
   they have the same value, leave QCFLAG unchanged.  Otherwise, set it
   (implicitly) to 1.  |opZHI| may only be one of the Iop_ZeroHIxxofV128
   operators, or Iop_INVALID, in which case |qres| and |nres| are used
   unmodified.  The presence of |opZHI| means this function can be used to
   generate QCFLAG update code for both scalar and vector SIMD operations.
*/
static
void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
{
   IRTemp diff      = newTempV128();
   IRTemp oldQCFLAG = newTempV128();
   IRTemp newQCFLAG = newTempV128();
   if (opZHI == Iop_INVALID) {
      assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
   } else {
      vassert(opZHI == Iop_ZeroHI64ofV128
              || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
      assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
   }
   assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
   assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
   stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
}


/* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
   are used unmodified, hence suitable for QCFLAG updates for whole-vector
   operations. */
static
void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
{
   updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
}
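
/* Illustrative note (not part of the original source): callers compute each
   saturating operation twice, once with the saturating IROp (giving |qres|)
   and once with the plain wrapping IROp (giving |nres|).  If the two differ
   in any lane, the XOR is nonzero and gets ORed into QCFLAG, which is how
   the sticky FPSR.QC bit is modelled. */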
9153 /* Generate IR to rearrange two vector values in a way which is useful
9154 for doing S/D/H add-pair etc operations. There are 5 cases:
9156 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
9158 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
9160 8h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
9161 [m7 m5 n7 n5 m3 m1 n3 n1] [m6 m4 n6 n4 m2 m0 n2 n0]
9163 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
9165 4h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
9166 [ 0 0 0 0 m3 m1 n3 n1] [ 0 0 0 0 m2 m0 n2 n0]
9169 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
9170 /*OUT*/IRTemp
* rearrL
, /*OUT*/IRTemp
* rearrR
,
9171 IRTemp vecM
, IRTemp vecN
, ARM64VecESize sz
, UInt bitQ
9174 vassert(rearrL
&& *rearrL
== IRTemp_INVALID
);
9175 vassert(rearrR
&& *rearrR
== IRTemp_INVALID
);
9176 *rearrL
= newTempV128();
9177 *rearrR
= newTempV128();
9183 assign(*rearrL
, binop(Iop_InterleaveHI64x2
, mkexpr(vecM
), mkexpr(vecN
)));
9184 assign(*rearrR
, binop(Iop_InterleaveLO64x2
, mkexpr(vecM
), mkexpr(vecN
)));
9190 assign(*rearrL
, binop(Iop_CatOddLanes32x4
, mkexpr(vecM
), mkexpr(vecN
)));
9191 assign(*rearrR
, binop(Iop_CatEvenLanes32x4
, mkexpr(vecM
), mkexpr(vecN
)));
9194 IRTemp m1n1m0n0
= newTempV128();
9195 IRTemp m0n0m1n1
= newTempV128();
9196 assign(m1n1m0n0
, binop(Iop_InterleaveLO32x4
,
9197 mkexpr(vecM
), mkexpr(vecN
)));
9198 assign(m0n0m1n1
, triop(Iop_SliceV128
,
9199 mkexpr(m1n1m0n0
), mkexpr(m1n1m0n0
), mkU8(8)));
9200 assign(*rearrL
, unop(Iop_ZeroHI64ofV128
, mkexpr(m1n1m0n0
)));
9201 assign(*rearrR
, unop(Iop_ZeroHI64ofV128
, mkexpr(m0n0m1n1
)));
9208 assign(*rearrL
, binop(Iop_CatOddLanes16x8
, mkexpr(vecM
), mkexpr(vecN
)));
9209 assign(*rearrR
, binop(Iop_CatEvenLanes16x8
, mkexpr(vecM
), mkexpr(vecN
)));
9212 IRTemp m3m1n3n1
= newTempV128();
9213 IRTemp m2m0n2n0
= newTempV128();
9214 assign(m3m1n3n1
, binop(Iop_CatOddLanes16x8
, mkexpr(vecM
), mkexpr(vecN
)));
9215 assign(m2m0n2n0
, binop(Iop_CatEvenLanes16x8
, mkexpr(vecM
), mkexpr(vecN
)));
9216 assign(*rearrL
, unop(Iop_ZeroHI64ofV128
,
9217 binop(Iop_CatEvenLanes32x4
, mkexpr(m3m1n3n1
),
9218 mkexpr(m3m1n3n1
))));
9219 assign(*rearrR
, unop(Iop_ZeroHI64ofV128
,
9220 binop(Iop_CatEvenLanes32x4
, mkexpr(m2m0n2n0
),
9221 mkexpr(m2m0n2n0
))));
9225 default: vpanic("math_REARRANGE_FOR_FLOATING_PAIRWISE");
/* Returns 2.0 ^ (-n) for n in 1 .. 64 */
static Double two_to_the_minus ( Int n )
{
   if (n == 1) return 0.5;
   vassert(n >= 2 && n <= 64);
   Int half = n / 2;
   return two_to_the_minus(half) * two_to_the_minus(n - half);
}


/* Returns 2.0 ^ n for n in 1 .. 64 */
static Double two_to_the_plus ( Int n )
{
   if (n == 1) return 2.0;
   vassert(n >= 2 && n <= 64);
   Int half = n / 2;
   return two_to_the_plus(half) * two_to_the_plus(n - half);
}
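
/* Illustrative note (not part of the original source): the recursion just
   splits the exponent, eg two_to_the_plus(10) = two_to_the_plus(5) *
   two_to_the_plus(5) = 32.0 * 32.0 = 1024.0.  All results in the 1..64
   range are exact powers of two, hence exactly representable in a Double,
   so no rounding error is introduced. */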

/*------------------------------------------------------------*/
/*--- SIMD and FP instructions                              ---*/
/*------------------------------------------------------------*/
9255 Bool
dis_AdvSIMD_EXT(/*MB_OUT*/DisResult
* dres
, UInt insn
)
9257 /* 31 29 23 21 20 15 14 10 9 4
9258 0 q 101110 op2 0 m 0 imm4 0 n d
9261 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9262 if (INSN(31,31) != 0
9263 || INSN(29,24) != BITS6(1,0,1,1,1,0)
9264 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
9267 UInt bitQ
= INSN(30,30);
9268 UInt op2
= INSN(23,22);
9269 UInt mm
= INSN(20,16);
9270 UInt imm4
= INSN(14,11);
9271 UInt nn
= INSN(9,5);
9272 UInt dd
= INSN(4,0);
9274 if (op2
== BITS2(0,0)) {
9275 /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
9276 IRTemp sHi
= newTempV128();
9277 IRTemp sLo
= newTempV128();
9278 IRTemp res
= newTempV128();
9279 assign(sHi
, getQReg128(mm
));
9280 assign(sLo
, getQReg128(nn
));
9283 assign(res
, mkexpr(sLo
));
9285 vassert(imm4
>= 1 && imm4
<= 15);
9286 assign(res
, triop(Iop_SliceV128
,
9287 mkexpr(sHi
), mkexpr(sLo
), mkU8(imm4
)));
9289 putQReg128(dd
, mkexpr(res
));
9290 DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd
, nn
, mm
, imm4
);
9292 if (imm4
>= 8) return False
;
9294 assign(res
, mkexpr(sLo
));
9296 vassert(imm4
>= 1 && imm4
<= 7);
9297 IRTemp hi64lo64
= newTempV128();
9298 assign(hi64lo64
, binop(Iop_InterleaveLO64x2
,
9299 mkexpr(sHi
), mkexpr(sLo
)));
9300 assign(res
, triop(Iop_SliceV128
,
9301 mkexpr(hi64lo64
), mkexpr(hi64lo64
), mkU8(imm4
)));
9303 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
9304 DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd
, nn
, mm
, imm4
);
9315 Bool
dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult
* dres
, UInt insn
)
9317 /* 31 29 23 21 20 15 14 12 11 9 4
9318 0 q 001110 op2 0 m 0 len op 00 n d
9319 Decode fields: op2,len,op
9321 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9322 if (INSN(31,31) != 0
9323 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9326 || INSN(11,10) != BITS2(0,0)) {
9329 UInt bitQ
= INSN(30,30);
9330 UInt op2
= INSN(23,22);
9331 UInt mm
= INSN(20,16);
9332 UInt len
= INSN(14,13);
9333 UInt bitOP
= INSN(12,12);
9334 UInt nn
= INSN(9,5);
9335 UInt dd
= INSN(4,0);
9338 /* -------- 00,xx,0 TBL, xx register table -------- */
9339 /* -------- 00,xx,1 TBX, xx register table -------- */
9340 /* 31 28 20 15 14 12 9 4
9341 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9342 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9343 where Ta = 16b(q=1) or 8b(q=0)
9345 Bool isTBX
= bitOP
== 1;
9346 /* The out-of-range values to use. */
9347 IRTemp oor_values
= newTempV128();
9348 assign(oor_values
, isTBX
? getQReg128(dd
) : mkV128(0));
9350 IRTemp src
= newTempV128();
9351 assign(src
, getQReg128(mm
));
9352 /* The table values */
9355 for (i
= 0; i
<= len
; i
++) {
9357 tab
[i
] = newTempV128();
9358 assign(tab
[i
], getQReg128((nn
+ i
) % 32));
9360 IRTemp res
= math_TBL_TBX(tab
, len
, src
, oor_values
);
9361 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
9362 const HChar
* Ta
= bitQ
==1 ? "16b" : "8b";
9363 const HChar
* nm
= isTBX
? "tbx" : "tbl";
9364 DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
9365 nm
, nameQReg128(dd
), Ta
, nn
, (nn
+ len
) % 32, nameQReg128(mm
), Ta
);
9369 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9376 Bool
dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult
* dres
, UInt insn
)
9378 /* 31 29 23 21 20 15 14 11 9 4
9379 0 q 001110 size 0 m 0 opcode 10 n d
9380 Decode fields: opcode
9382 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9383 if (INSN(31,31) != 0
9384 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9385 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
9388 UInt bitQ
= INSN(30,30);
9389 UInt size
= INSN(23,22);
9390 UInt mm
= INSN(20,16);
9391 UInt opcode
= INSN(14,12);
9392 UInt nn
= INSN(9,5);
9393 UInt dd
= INSN(4,0);
9395 if (opcode
== BITS3(0,0,1) || opcode
== BITS3(1,0,1)) {
9396 /* -------- 001 UZP1 std7_std7_std7 -------- */
9397 /* -------- 101 UZP2 std7_std7_std7 -------- */
9398 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
9399 Bool isUZP1
= opcode
== BITS3(0,0,1);
9400 IROp op
= isUZP1
? mkVecCATEVENLANES(size
)
9401 : mkVecCATODDLANES(size
);
9402 IRTemp preL
= newTempV128();
9403 IRTemp preR
= newTempV128();
9404 IRTemp res
= newTempV128();
9406 assign(preL
, binop(Iop_InterleaveLO64x2
, getQReg128(mm
),
9408 assign(preR
, mkexpr(preL
));
9410 assign(preL
, getQReg128(mm
));
9411 assign(preR
, getQReg128(nn
));
9413 assign(res
, binop(op
, mkexpr(preL
), mkexpr(preR
)));
9414 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
9415 const HChar
* nm
= isUZP1
? "uzp1" : "uzp2";
9416 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
9417 DIP("%s %s.%s, %s.%s, %s.%s\n", nm
,
9418 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
), arr
);
9422 if (opcode
== BITS3(0,1,0) || opcode
== BITS3(1,1,0)) {
9423 /* -------- 010 TRN1 std7_std7_std7 -------- */
9424 /* -------- 110 TRN2 std7_std7_std7 -------- */
9425 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
9426 Bool isTRN1
= opcode
== BITS3(0,1,0);
9427 IROp op1
= isTRN1
? mkVecCATEVENLANES(size
)
9428 : mkVecCATODDLANES(size
);
9429 IROp op2
= mkVecINTERLEAVEHI(size
);
9430 IRTemp srcM
= newTempV128();
9431 IRTemp srcN
= newTempV128();
9432 IRTemp res
= newTempV128();
9433 assign(srcM
, getQReg128(mm
));
9434 assign(srcN
, getQReg128(nn
));
9435 assign(res
, binop(op2
, binop(op1
, mkexpr(srcM
), mkexpr(srcM
)),
9436 binop(op1
, mkexpr(srcN
), mkexpr(srcN
))));
9437 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
9438 const HChar
* nm
= isTRN1
? "trn1" : "trn2";
9439 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
9440 DIP("%s %s.%s, %s.%s, %s.%s\n", nm
,
9441 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
), arr
);
9445 if (opcode
== BITS3(0,1,1) || opcode
== BITS3(1,1,1)) {
9446 /* -------- 011 ZIP1 std7_std7_std7 -------- */
9447 /* -------- 111 ZIP2 std7_std7_std7 -------- */
9448 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
9449 Bool isZIP1
= opcode
== BITS3(0,1,1);
9450 IROp op
= isZIP1
? mkVecINTERLEAVELO(size
)
9451 : mkVecINTERLEAVEHI(size
);
9452 IRTemp preL
= newTempV128();
9453 IRTemp preR
= newTempV128();
9454 IRTemp res
= newTempV128();
9455 if (bitQ
== 0 && !isZIP1
) {
9456 IRTemp z128
= newTempV128();
9457 assign(z128
, mkV128(0x0000));
9458 // preL = Vm shifted left 32 bits
9459 // preR = Vn shifted left 32 bits
9460 assign(preL
, triop(Iop_SliceV128
,
9461 getQReg128(mm
), mkexpr(z128
), mkU8(12)));
9462 assign(preR
, triop(Iop_SliceV128
,
9463 getQReg128(nn
), mkexpr(z128
), mkU8(12)));
9466 assign(preL
, getQReg128(mm
));
9467 assign(preR
, getQReg128(nn
));
9469 assign(res
, binop(op
, mkexpr(preL
), mkexpr(preR
)));
9470 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
9471 const HChar
* nm
= isZIP1
? "zip1" : "zip2";
9472 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
9473 DIP("%s %s.%s, %s.%s, %s.%s\n", nm
,
9474 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
), arr
);
9484 Bool
dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult
* dres
, UInt insn
)
9486 /* 31 28 23 21 16 11 9 4
9487 0 q u 01110 size 11000 opcode 10 n d
9488 Decode fields: u,size,opcode
9490 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9491 if (INSN(31,31) != 0
9492 || INSN(28,24) != BITS5(0,1,1,1,0)
9493 || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
9496 UInt bitQ
= INSN(30,30);
9497 UInt bitU
= INSN(29,29);
9498 UInt size
= INSN(23,22);
9499 UInt opcode
= INSN(16,12);
9500 UInt nn
= INSN(9,5);
9501 UInt dd
= INSN(4,0);
9503 if (opcode
== BITS5(0,0,0,1,1)) {
9504 /* -------- 0,xx,00011 SADDLV -------- */
9505 /* -------- 1,xx,00011 UADDLV -------- */
9506 /* size is the narrow size */
9507 if (size
== X11
|| (size
== X10
&& bitQ
== 0)) return False
;
9508 Bool isU
= bitU
== 1;
9509 IRTemp src
= newTempV128();
9510 assign(src
, getQReg128(nn
));
9511 /* The basic plan is to widen the lower half, and if Q = 1,
9512 the upper half too. Add them together (if Q = 1), and in
9513 either case fold with add at twice the lane width.
9516 = mkexpr(math_WIDEN_LO_OR_HI_LANES(
9517 isU
, False
/*!fromUpperHalf*/, size
, mkexpr(src
)));
9520 = binop(mkVecADD(size
+1),
9522 mkexpr(math_WIDEN_LO_OR_HI_LANES(
9523 isU
, True
/*fromUpperHalf*/, size
, mkexpr(src
)))
9527 IRTemp tWi
= newTempV128();
9528 assign(tWi
, widened
);
9529 IRTemp res
= math_FOLDV(tWi
, mkVecADD(size
+1));
9530 putQReg128(dd
, mkexpr(res
));
9531 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
9532 const HChar ch
= "bhsd"[size
];
9533 DIP("%s %s.%c, %s.%s\n", isU
? "uaddlv" : "saddlv",
9534 nameQReg128(dd
), ch
, nameQReg128(nn
), arr
);
   UInt ix = 0; /*INVALID*/
   /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
   else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
   else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
   if (ix != 0) {
      /* -------- 0,xx,01010: SMAXV -------- (1) */
      /* -------- 1,xx,01010: UMAXV -------- (2) */
      /* -------- 0,xx,11010: SMINV -------- (3) */
      /* -------- 1,xx,11010: UMINV -------- (4) */
      /* -------- 0,xx,11011: ADDV  -------- (5) */
      vassert(ix >= 1 && ix <= 5);
      if (size == X11) return False; // 1d,2d cases not allowed
      if (size == X10 && bitQ == 0) return False; // 2s case not allowed
      const IROp opMAXS[3]
         = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
      const IROp opMAXU[3]
         = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
      const IROp opMINS[3]
         = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
      const IROp opMINU[3]
         = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
      const IROp opADD[3]
         = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 };
      IROp op = Iop_INVALID;
      const HChar* nm = NULL;
      switch (ix) {
         case 1: op = opMAXS[size]; nm = "smaxv"; break;
         case 2: op = opMAXU[size]; nm = "umaxv"; break;
         case 3: op = opMINS[size]; nm = "sminv"; break;
         case 4: op = opMINU[size]; nm = "uminv"; break;
         case 5: op = opADD[size];  nm = "addv";  break;
         default: vassert(0);
      }
      vassert(op != Iop_INVALID && nm != NULL);
      IRTemp tN1 = newTempV128();
      assign(tN1, getQReg128(nn));
      /* If Q == 0, we're just folding lanes in the lower half of
         the value.  In which case, copy the lower half of the
         source into the upper half, so we can then treat it the
         same as the full width case.  Except for the addition case,
         in which we have to zero out the upper half. */
      IRTemp tN2 = newTempV128();
      assign(tN2, bitQ == 0
                     ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
                                : mk_CatEvenLanes64x2(tN1,tN1))
                     : mkexpr(tN1));
      IRTemp res = math_FOLDV(tN2, op);
      if (res == IRTemp_INVALID)
         return False; /* means math_FOLDV
                          doesn't handle this case yet */
      putQReg128(dd, mkexpr(res));
      const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
      IRType laneTy = tys[size];
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s, %s.%s\n", nm,
          nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
      return True;
   }
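      /* Rationale for the Q=0 handling above: duplicating the lower
         64 bits into the upper half leaves max/min folds unchanged,
         since every lane value simply appears twice.  That would,
         however, double the result of ADDV, so for ix == 5 the upper
         half is zeroed instead -- zero being the identity for
         addition. */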
   if ((size == X00 || size == X10)
       && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
      /* -------- 0,00,01100: FMAXNMV s_4s -------- */
      /* -------- 0,10,01100: FMINNMV s_4s -------- */
      /* -------- 1,00,01111: FMAXV   s_4s -------- */
      /* -------- 1,10,01111: FMINV   s_4s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      if (bitQ == 0) return False; // Only 4s is allowed
      Bool   isMIN = (size & 2) == 2;
      Bool   isNM  = opcode == BITS5(0,1,1,0,0);
      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
      IRTemp src   = newTempV128();
      assign(src, getQReg128(nn));
      IRTemp res = math_FOLDV(src, opMXX);
      putQReg128(dd, mkexpr(res));
      DIP("%s%sv s%u, %u.4s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
      return True;
   }
   return False;
#  undef INSN
}

Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31     28       20   15 14   10 9 4
      0 q op 01110000 imm5 0  imm4 1  n d
      Decode fields: q,op,imm4
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
       || INSN(15,15) != 0 || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ  = INSN(30,30);
   UInt bitOP = INSN(29,29);
   UInt imm5  = INSN(20,16);
   UInt imm4  = INSN(14,11);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   /* -------- x,0,0000: DUP (element, vector) -------- */
   /*
      0q0 01110000 imm5 000001 n d  DUP Vd.T, Vn.Ts[index]
   */
   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
      UInt  laneNo    = 16; /* invalid */
      UInt  laneSzLg2 = 5;  /* invalid */
      HChar laneCh    = '?';
      IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
                                       getQReg128(nn), imm5);
      if (res == IRTemp_INVALID)
         return False;
      if (bitQ == 0 && laneSzLg2 == X11)
         return False; /* .1d case */
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
      DIP("dup %s.%s, %s.%c[%u]\n",
          nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
      return True;
   }
   /* -------- x,0,0001: DUP (general, vector) -------- */
   /*
      0q0 01110000 imm5 0 0001 1 n d  DUP Vd.T, Rn
      Q=0 writes 64, Q=1 writes 128
      imm5: xxxx1  8B(q=0)      or 16b(q=1),     R=W
            xxx10  4H(q=0)      or 8H(q=1),      R=W
            xx100  2S(q=0)      or 4S(q=1),      R=W
            x1000  Invalid(q=0) or 2D(q=1),      R=X
            x0000  Invalid(q=0) or Invalid(q=1)
      Require op=0, imm4=0001
   */
   if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
      Bool   isQ = bitQ == 1;
      IRTemp w0  = newTemp(Ity_I64);
      const HChar* arT = "??";
      IRType laneTy = Ity_INVALID;
      if (imm5 & 1) {
         arT    = isQ ? "16b" : "8b";
         laneTy = Ity_I8;
         assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
      }
      else if (imm5 & 2) {
         arT    = isQ ? "8h" : "4h";
         laneTy = Ity_I16;
         assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
      }
      else if (imm5 & 4) {
         arT    = isQ ? "4s" : "2s";
         laneTy = Ity_I32;
         assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
      }
      else if ((imm5 & 8) && isQ) {
         arT    = "2d";
         laneTy = Ity_I64;
         assign(w0, getIReg64orZR(nn));
      }
      else {
         /* invalid; leave laneTy unchanged. */
      }
      if (laneTy != Ity_INVALID) {
         IRTemp w1 = math_DUP_TO_64(w0, laneTy);
         putQReg128(dd, binop(Iop_64HLtoV128,
                              isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
         DIP("dup %s.%s, %s\n",
             nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
         return True;
      }
   }
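      /* Worked example, assuming DUP v7.4h, w3 with w3 = 0x11112222:
         w0 becomes 0x0000000000002222, math_DUP_TO_64 replicates the
         16-bit lane to 0x2222222222222222, and since Q=0 the final
         128-bit write pairs that with a zero upper half, leaving the
         upper 64 bits of v7 zero and the lower 64 bits equal to
         0x2222222222222222. */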
   /* -------- 1,0,0011: INS (general) -------- */
   /*
      010 01110000 imm5 000111 n d  INS Vd.Ts[ix], Rn
      where Ts,ix = case imm5 of xxxx1 -> B, xxxx
                                 xxx10 -> H, xxx
                                 xx100 -> S, xx
                                 x1000 -> D, x
   */
   if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
      UInt    laneNo = 16;  /* invalid */
      HChar   ts     = '?'; /* invalid */
      IRExpr* src    = NULL;
      if (imm5 & 1) {
         src    = unop(Iop_64to8, getIReg64orZR(nn));
         laneNo = (imm5 >> 1) & 15;
         ts     = 'b';
      }
      else if (imm5 & 2) {
         src    = unop(Iop_64to16, getIReg64orZR(nn));
         laneNo = (imm5 >> 2) & 7;
         ts     = 'h';
      }
      else if (imm5 & 4) {
         src    = unop(Iop_64to32, getIReg64orZR(nn));
         laneNo = (imm5 >> 3) & 3;
         ts     = 's';
      }
      else if (imm5 & 8) {
         src    = getIReg64orZR(nn);
         laneNo = (imm5 >> 4) & 1;
         ts     = 'd';
      }
      vassert(laneNo < 16);
      putQRegLane(dd, laneNo, src);
      DIP("ins %s.%c[%u], %s\n",
          nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
      return True;
9762 /* -------- x,0,0101: SMOV -------- */
9763 /* -------- x,0,0111: UMOV -------- */
9765 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index]
9766 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index]
9767 dest is Xd when q==1, Wd when q==0
9769 Ts,index,ops = case q:imm5 of
9770 0:xxxx1 -> B, xxxx, 8Uto64
9772 0:xxx10 -> H, xxx, 16Uto64
9774 0:xx100 -> S, xx, 32Uto64
9776 1:x1000 -> D, x, copy64
9779 Ts,index,ops = case q:imm5 of
9780 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
9781 1:xxxx1 -> B, xxxx, 8Sto64
9782 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32)
9783 1:xxx10 -> H, xxx, 16Sto64
9785 1:xx100 -> S, xx, 32Sto64
9789 if (bitOP
== 0 && (imm4
== BITS4(0,1,0,1) || imm4
== BITS4(0,1,1,1))) {
9790 Bool isU
= (imm4
& 2) == 2;
9791 const HChar
* arTs
= "??";
9792 UInt laneNo
= 16; /* invalid */
9793 // Setting 'res' to non-NULL determines valid/invalid
9795 if (!bitQ
&& (imm5
& 1)) { // 0:xxxx1
9796 laneNo
= (imm5
>> 1) & 15;
9797 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I8
);
9798 res
= isU
? unop(Iop_8Uto64
, lane
)
9799 : unop(Iop_32Uto64
, unop(Iop_8Sto32
, lane
));
9802 else if (bitQ
&& (imm5
& 1)) { // 1:xxxx1
9803 laneNo
= (imm5
>> 1) & 15;
9804 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I8
);
9806 : unop(Iop_8Sto64
, lane
);
9809 else if (!bitQ
&& (imm5
& 2)) { // 0:xxx10
9810 laneNo
= (imm5
>> 2) & 7;
9811 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I16
);
9812 res
= isU
? unop(Iop_16Uto64
, lane
)
9813 : unop(Iop_32Uto64
, unop(Iop_16Sto32
, lane
));
9816 else if (bitQ
&& (imm5
& 2)) { // 1:xxx10
9817 laneNo
= (imm5
>> 2) & 7;
9818 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I16
);
9820 : unop(Iop_16Sto64
, lane
);
9823 else if (!bitQ
&& (imm5
& 4)) { // 0:xx100
9824 laneNo
= (imm5
>> 3) & 3;
9825 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I32
);
9826 res
= isU
? unop(Iop_32Uto64
, lane
)
9830 else if (bitQ
&& (imm5
& 4)) { // 1:xxx10
9831 laneNo
= (imm5
>> 3) & 3;
9832 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I32
);
9834 : unop(Iop_32Sto64
, lane
);
9837 else if (bitQ
&& (imm5
& 8)) { // 1:x1000
9838 laneNo
= (imm5
>> 4) & 1;
9839 IRExpr
* lane
= getQRegLane(nn
, laneNo
, Ity_I64
);
9846 vassert(laneNo
< 16);
9847 putIReg64orZR(dd
, res
);
9848 DIP("%cmov %s, %s.%s[%u]\n", isU
? 'u' : 's',
9849 nameIRegOrZR(bitQ
== 1, dd
),
9850 nameQReg128(nn
), arTs
, laneNo
);
9857 /* -------- 1,1,xxxx: INS (element) -------- */
9859 011 01110000 imm5 0 imm4 n d INS Vd.Ts[ix1], Vn.Ts[ix2]
9861 = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
9862 xxx10 -> H, xxx, imm4[3:1]
9863 xx100 -> S, xx, imm4[3:2]
9864 x1000 -> D, x, imm4[3:3]
9866 if (bitQ
== 1 && bitOP
== 1) {
9868 IRType ity
= Ity_INVALID
;
9874 ix1
= (imm5
>> 1) & 15;
9875 ix2
= (imm4
>> 0) & 15;
9877 else if (imm5
& 2) {
9880 ix1
= (imm5
>> 2) & 7;
9881 ix2
= (imm4
>> 1) & 7;
9883 else if (imm5
& 4) {
9886 ix1
= (imm5
>> 3) & 3;
9887 ix2
= (imm4
>> 2) & 3;
9889 else if (imm5
& 8) {
9892 ix1
= (imm5
>> 4) & 1;
9893 ix2
= (imm4
>> 3) & 1;
9896 if (ity
!= Ity_INVALID
) {
9899 putQRegLane(dd
, ix1
, getQRegLane(nn
, ix2
, ity
));
9900 DIP("ins %s.%c[%u], %s.%c[%u]\n",
9901 nameQReg128(dd
), ts
, ix1
, nameQReg128(nn
), ts
, ix2
);
Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28          18  15    11 9     4
      0q op 01111 00000 abc cmode 01 defgh d
      Decode fields: q,op,cmode
      Bit 11 is really "o2", but it is always zero.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
       || INSN(11,10) != BITS2(0,1)) {
      return False;
   }
   UInt bitQ     = INSN(30,30);
   UInt bitOP    = INSN(29,29);
   UInt cmode    = INSN(15,12);
   UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
   UInt dd       = INSN(4,0);

   ULong imm64lo  = 0;
   UInt  op_cmode = (bitOP << 4) | cmode;
   Bool  ok       = False;
   Bool  isORR    = False;
   Bool  isBIC    = False;
   Bool  isMOV    = False;
   Bool  isMVN    = False;
   Bool  isFMOV   = False;
   switch (op_cmode) {
9942 /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
9943 /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
9944 /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
9945 /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
9946 case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
9947 case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
9948 ok
= True
; isMOV
= True
; break;
9950 /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
9951 /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
9952 /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
9953 /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
9954 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
9955 case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
9956 ok
= True
; isORR
= True
; break;
9958 /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
9959 /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
9960 case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
9961 ok
= True
; isMOV
= True
; break;
9963 /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
9964 /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
9965 case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
9966 ok
= True
; isORR
= True
; break;
9968 /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
9969 /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
9970 case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
9971 ok
= True
; isMOV
= True
; break;
9973 /* -------- x,0,1110 MOVI 8-bit -------- */
9974 case BITS5(0,1,1,1,0):
9975 ok
= True
; isMOV
= True
; break;
9977 /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
9978 case BITS5(0,1,1,1,1): // 0:1111
9979 ok
= True
; isFMOV
= True
; break;
9981 /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
9982 /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
9983 /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
9984 /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
9985 case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
9986 case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
9987 ok
= True
; isMVN
= True
; break;
9989 /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
9990 /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
9991 /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
9992 /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
9993 case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
9994 case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
9995 ok
= True
; isBIC
= True
; break;
9997 /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
9998 /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
9999 case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
10000 ok
= True
; isMVN
= True
; break;
10002 /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
10003 /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
10004 case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
10005 ok
= True
; isBIC
= True
; break;
10007 /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
10008 /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
10009 case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
10010 ok
= True
; isMVN
= True
; break;
10012 /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
10013 /* -------- 1,1,1110 MOVI 64-bit vector -------- */
10014 case BITS5(1,1,1,1,0):
10015 ok
= True
; isMOV
= True
; break;
10017 /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
10018 case BITS5(1,1,1,1,1): // 1:1111
10019 ok
= bitQ
== 1; isFMOV
= True
; break;
10025 vassert(1 == (isMOV
? 1 : 0) + (isMVN
? 1 : 0)
10026 + (isORR
? 1 : 0) + (isBIC
? 1 : 0) + (isFMOV
? 1 : 0));
10027 ok
= AdvSIMDExpandImm(&imm64lo
, bitOP
, cmode
, abcdefgh
);
10030 if (isORR
|| isBIC
) {
10032 = isORR
? 0ULL : ~0ULL;
10034 = binop(Iop_64HLtoV128
, mkU64(inv
^ imm64lo
), mkU64(inv
^ imm64lo
));
10036 = binop(isORR
? Iop_OrV128
: Iop_AndV128
, getQReg128(dd
), immV128
);
10037 const HChar
* nm
= isORR
? "orr" : "bic";
10039 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, res
));
10040 DIP("%s %s.1d, %016llx\n", nm
, nameQReg128(dd
), imm64lo
);
10042 putQReg128(dd
, res
);
10043 DIP("%s %s.2d, #0x%016llx'%016llx\n", nm
,
10044 nameQReg128(dd
), imm64lo
, imm64lo
);
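      /* The single XOR mask above folds both cases into one path: with
         inv = 0 the operation computed is  Vd | imm,  and with inv = ~0
         it is  Vd & ~imm,  which is exactly BIC.  For example, assuming
         imm64lo = 0x00000000000000FF, BIC ANDs the register with
         0xFFFFFFFFFFFFFF00 and so clears just the low byte of each
         64-bit half. */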
10047 else if (isMOV
|| isMVN
|| isFMOV
) {
10048 if (isMVN
) imm64lo
= ~imm64lo
;
10049 ULong imm64hi
= bitQ
== 0 ? 0 : imm64lo
;
10050 IRExpr
* immV128
= binop(Iop_64HLtoV128
, mkU64(imm64hi
),
10052 putQReg128(dd
, immV128
);
10053 DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd
), imm64hi
, imm64lo
);
10057 /* else fall through */

Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28       20   15 14   10 9 4
      01 op 11110000 imm5 0  imm4 1  n d
      Decode fields: op,imm4
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
       || INSN(15,15) != 0 || INSN(10,10) != 1) {
      return False;
   }
   UInt bitOP = INSN(29,29);
   UInt imm5  = INSN(20,16);
   UInt imm4  = INSN(14,11);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
      /* -------- 0,0000 DUP (element, scalar) -------- */
      IRTemp w0     = newTemp(Ity_I64);
      const HChar* arTs = "??";
      IRType laneTy = Ity_INVALID;
      UInt   laneNo = 16; /* invalid */
      if (imm5 & 1) {
         laneNo = (imm5 >> 1) & 15;
         laneTy = Ity_I8;  arTs = "b";
         assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 2) {
         laneNo = (imm5 >> 2) & 7;
         laneTy = Ity_I16;  arTs = "h";
         assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 4) {
         laneNo = (imm5 >> 3) & 3;
         laneTy = Ity_I32;  arTs = "s";
         assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 8) {
         laneNo = (imm5 >> 4) & 1;
         laneTy = Ity_I64;  arTs = "d";
         assign(w0, getQRegLane(nn, laneNo, laneTy));
      }
      else {
         /* invalid; leave laneTy unchanged. */
      }
      if (laneTy != Ity_INVALID) {
         vassert(laneNo < 16);
         putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
         DIP("dup %s, %s.%s[%u]\n",
             nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
         return True;
      }
      /* else fall through */
Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn,
                                 const VexArchInfo* archinfo)
{
   /* 31   28    23 21    16     11 9 4
      01 u 11110 sz 11000 opcode 10 n d
      Decode fields: u,sz,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,1,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt sz     = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
10153 if (bitU
== 0 && sz
== X11
&& opcode
== BITS5(1,1,0,1,1)) {
10154 /* -------- 0,11,11011 ADDP d_2d -------- */
10155 IRTemp xy
= newTempV128();
10156 IRTemp xx
= newTempV128();
10157 assign(xy
, getQReg128(nn
));
10158 assign(xx
, binop(Iop_InterleaveHI64x2
, mkexpr(xy
), mkexpr(xy
)));
10159 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
,
10160 binop(Iop_Add64x2
, mkexpr(xy
), mkexpr(xx
))));
10161 DIP("addp d%u, %s.2d\n", dd
, nameQReg128(nn
));
10165 if (bitU
== 1 && sz
<= X01
&& opcode
== BITS5(0,1,1,0,1)) {
10166 /* -------- 1,00,01101 ADDP s_2s -------- */
10167 /* -------- 1,01,01101 ADDP d_2d -------- */
10168 Bool isD
= sz
== X01
;
10169 IROp opZHI
= mkVecZEROHIxxOFV128(isD
? 3 : 2);
10170 IROp opADD
= mkVecADDF(isD
? 3 : 2);
10171 IRTemp src
= newTempV128();
10172 IRTemp argL
= newTempV128();
10173 IRTemp argR
= newTempV128();
10174 assign(src
, getQReg128(nn
));
10175 assign(argL
, unop(opZHI
, mkexpr(src
)));
10176 assign(argR
, unop(opZHI
, triop(Iop_SliceV128
, mkexpr(src
), mkexpr(src
),
10177 mkU8(isD
? 8 : 4))));
10178 putQReg128(dd
, unop(opZHI
,
10179 triop(opADD
, mkexpr(mk_get_IR_rounding_mode()),
10180 mkexpr(argL
), mkexpr(argR
))));
10181 DIP(isD
? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd
, nn
);
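      /* The Iop_SliceV128 of the source with itself above acts as a
         rotate: with an 8-byte (or 4-byte) slice amount, lane 1 of the
         pair lands in lane 0 of argR, while argL keeps lane 0.  Zeroing
         the upper lanes of both and adding therefore leaves
         lane0 + lane1 in the lowest lane, which is the pairwise sum
         FADDP wants. */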
10185 /* Half-precision floating point ADDP (v8.2). */
10186 if (bitU
== 0 && sz
<= X00
&& opcode
== BITS5(0,1,1,0,1)) {
10187 /* -------- 0,00,01101 ADDP h_2h -------- */
10188 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
10190 IROp opZHI
= mkVecZEROHIxxOFV128(1);
10191 IROp opADD
= mkVecADDF(1);
10192 IRTemp src
= newTempV128();
10193 IRTemp argL
= newTempV128();
10194 IRTemp argR
= newTempV128();
10195 assign(src
, getQReg128(nn
));
10196 assign(argL
, unop(opZHI
, mkexpr(src
)));
10197 assign(argR
, unop(opZHI
, triop(Iop_SliceV128
, mkexpr(src
), mkexpr(src
),
10199 putQReg128(dd
, unop(opZHI
,
10200 triop(opADD
, mkexpr(mk_get_IR_rounding_mode()),
10201 mkexpr(argL
), mkexpr(argR
))));
10202 DIP("faddp h%u, v%u.2h\n", dd
, nn
);
10207 && (opcode
== BITS5(0,1,1,0,0) || opcode
== BITS5(0,1,1,1,1))) {
10208 /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
10209 /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
10210 /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
10211 /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
10212 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
10213 Bool isD
= (sz
& 1) == 1;
10214 Bool isMIN
= (sz
& 2) == 2;
10215 Bool isNM
= opcode
== BITS5(0,1,1,0,0);
10216 IROp opZHI
= mkVecZEROHIxxOFV128(isD
? 3 : 2);
10217 IROp opMXX
= (isMIN
? mkVecMINF
: mkVecMAXF
)(isD
? 3 : 2);
10218 IRTemp src
= newTempV128();
10219 IRTemp argL
= newTempV128();
10220 IRTemp argR
= newTempV128();
10221 assign(src
, getQReg128(nn
));
10222 assign(argL
, unop(opZHI
, mkexpr(src
)));
10223 assign(argR
, unop(opZHI
, triop(Iop_SliceV128
, mkexpr(src
), mkexpr(src
),
10224 mkU8(isD
? 8 : 4))));
10225 putQReg128(dd
, unop(opZHI
,
10226 binop(opMXX
, mkexpr(argL
), mkexpr(argR
))));
10227 HChar c
= isD
? 'd' : 's';
10228 DIP("%s%sp %c%u, v%u.2%c\n",
10229 isMIN
? "fmin" : "fmax", isNM
? "nm" : "", c
, dd
, nn
, c
);
Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28     22   18   15     10 9 4
      01 u 111110 immh immb opcode 1  n d
      Decode fields: u,immh,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt immh   = INSN(22,19);
   UInt immb   = INSN(18,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   UInt immhb  = (immh << 3) | immb;
10258 if ((immh
& 8) == 8
10259 && (opcode
== BITS5(0,0,0,0,0) || opcode
== BITS5(0,0,0,1,0))) {
10260 /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
10261 /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
10262 /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
10263 /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
10264 Bool isU
= bitU
== 1;
10265 Bool isAcc
= opcode
== BITS5(0,0,0,1,0);
10266 UInt sh
= 128 - immhb
;
10267 vassert(sh
>= 1 && sh
<= 64);
10268 IROp op
= isU
? Iop_ShrN64x2
: Iop_SarN64x2
;
10269 IRExpr
* src
= getQReg128(nn
);
10270 IRTemp shf
= newTempV128();
10271 IRTemp res
= newTempV128();
10272 if (sh
== 64 && isU
) {
10273 assign(shf
, mkV128(0x0000));
10280 assign(shf
, binop(op
, src
, mkU8(sh
- nudge
)));
10282 assign(res
, isAcc
? binop(Iop_Add64x2
, getQReg128(dd
), mkexpr(shf
))
10284 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10285 const HChar
* nm
= isAcc
? (isU
? "usra" : "ssra")
10286 : (isU
? "ushr" : "sshr");
10287 DIP("%s d%u, d%u, #%u\n", nm
, dd
, nn
, sh
);
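      /* Note on the sh == 64 corner case above: a 64-bit lane cannot be
         shifted right by 64 directly, since the IR shift amount must be
         0..63.  USHR by 64 therefore just produces zero, and the
         'nudge' adjustment presumably handles the signed case by
         shifting by 63 instead, which gives the same result as an
         arithmetic shift by 64 would. */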
10291 if ((immh
& 8) == 8
10292 && (opcode
== BITS5(0,0,1,0,0) || opcode
== BITS5(0,0,1,1,0))) {
10293 /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
10294 /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
10295 /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
10296 /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
10297 Bool isU
= bitU
== 1;
10298 Bool isAcc
= opcode
== BITS5(0,0,1,1,0);
10299 UInt sh
= 128 - immhb
;
10300 vassert(sh
>= 1 && sh
<= 64);
10301 IROp op
= isU
? Iop_Rsh64Ux2
: Iop_Rsh64Sx2
;
10302 vassert(sh
>= 1 && sh
<= 64);
10303 IRExpr
* src
= getQReg128(nn
);
10304 IRTemp imm8
= newTemp(Ity_I8
);
10305 assign(imm8
, mkU8((UChar
)(-sh
)));
10306 IRExpr
* amt
= mkexpr(math_DUP_TO_V128(imm8
, Ity_I8
));
10307 IRTemp shf
= newTempV128();
10308 IRTemp res
= newTempV128();
10309 assign(shf
, binop(op
, src
, amt
));
10310 assign(res
, isAcc
? binop(Iop_Add64x2
, getQReg128(dd
), mkexpr(shf
))
10312 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10313 const HChar
* nm
= isAcc
? (isU
? "ursra" : "srsra")
10314 : (isU
? "urshr" : "srshr");
10315 DIP("%s d%u, d%u, #%u\n", nm
, dd
, nn
, sh
);
10319 if (bitU
== 1 && (immh
& 8) == 8 && opcode
== BITS5(0,1,0,0,0)) {
10320 /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
10321 UInt sh
= 128 - immhb
;
10322 vassert(sh
>= 1 && sh
<= 64);
10324 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, getQReg128(dd
)));
10326 /* sh is in range 1 .. 63 */
10327 ULong nmask
= (ULong
)(((Long
)0x8000000000000000ULL
) >> (sh
-1));
10328 IRExpr
* nmaskV
= binop(Iop_64HLtoV128
, mkU64(nmask
), mkU64(nmask
));
10329 IRTemp res
= newTempV128();
10330 assign(res
, binop(Iop_OrV128
,
10331 binop(Iop_AndV128
, getQReg128(dd
), nmaskV
),
10332 binop(Iop_ShrN64x2
, getQReg128(nn
), mkU8(sh
))));
10333 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10335 DIP("sri d%u, d%u, #%u\n", dd
, nn
, sh
);
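      /* Worked example of the nmask above, assuming sh = 8: arithmetic
         right shift of 0x8000000000000000 by 7 gives
         0xFF00000000000000, i.e. the top 8 bits set.  SRI then keeps
         those top 8 bits of Vd and fills the remaining 56 bits with
         Vn >> 8, which is exactly "shift right and insert". */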
10339 if (bitU
== 0 && (immh
& 8) == 8 && opcode
== BITS5(0,1,0,1,0)) {
10340 /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
10341 UInt sh
= immhb
- 64;
10342 vassert(sh
>= 0 && sh
< 64);
10344 unop(Iop_ZeroHI64ofV128
,
10345 sh
== 0 ? getQReg128(nn
)
10346 : binop(Iop_ShlN64x2
, getQReg128(nn
), mkU8(sh
))));
10347 DIP("shl d%u, d%u, #%u\n", dd
, nn
, sh
);
10351 if (bitU
== 1 && (immh
& 8) == 8 && opcode
== BITS5(0,1,0,1,0)) {
10352 /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
10353 UInt sh
= immhb
- 64;
10354 vassert(sh
>= 0 && sh
< 64);
10356 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, getQReg128(nn
)));
10358 /* sh is in range 1 .. 63 */
10359 ULong nmask
= (1ULL << sh
) - 1;
10360 IRExpr
* nmaskV
= binop(Iop_64HLtoV128
, mkU64(nmask
), mkU64(nmask
));
10361 IRTemp res
= newTempV128();
10362 assign(res
, binop(Iop_OrV128
,
10363 binop(Iop_AndV128
, getQReg128(dd
), nmaskV
),
10364 binop(Iop_ShlN64x2
, getQReg128(nn
), mkU8(sh
))));
10365 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10367 DIP("sli d%u, d%u, #%u\n", dd
, nn
, sh
);
10371 if (opcode
== BITS5(0,1,1,1,0)
10372 || (bitU
== 1 && opcode
== BITS5(0,1,1,0,0))) {
10373 /* -------- 0,01110 SQSHL #imm -------- */
10374 /* -------- 1,01110 UQSHL #imm -------- */
10375 /* -------- 1,01100 SQSHLU #imm -------- */
10378 Bool ok
= getLaneInfo_IMMH_IMMB(&shift
, &size
, immh
, immb
);
10379 if (!ok
) return False
;
10380 vassert(size
>= 0 && size
<= 3);
10381 /* The shift encoding has opposite sign for the leftwards case.
10382 Adjust shift to compensate. */
10383 UInt lanebits
= 8 << size
;
10384 shift
= lanebits
- shift
;
10385 vassert(shift
>= 0 && shift
< lanebits
);
10386 const HChar
* nm
= NULL
;
10387 /**/ if (bitU
== 0 && opcode
== BITS5(0,1,1,1,0)) nm
= "sqshl";
10388 else if (bitU
== 1 && opcode
== BITS5(0,1,1,1,0)) nm
= "uqshl";
10389 else if (bitU
== 1 && opcode
== BITS5(0,1,1,0,0)) nm
= "sqshlu";
10391 IRTemp qDiff1
= IRTemp_INVALID
;
10392 IRTemp qDiff2
= IRTemp_INVALID
;
10393 IRTemp res
= IRTemp_INVALID
;
10394 IRTemp src
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, getQReg128(nn
));
10395 /* This relies on the fact that the zeroed out lanes generate zeroed
10396 result lanes and don't saturate, so there's no point in trimming
10397 the resulting res, qDiff1 or qDiff2 values. */
10398 math_QSHL_IMM(&res
, &qDiff1
, &qDiff2
, src
, size
, shift
, nm
);
10399 putQReg128(dd
, mkexpr(res
));
10400 updateQCFLAGwithDifference(qDiff1
, qDiff2
);
10401 const HChar arr
= "bhsd"[size
];
10402 DIP("%s %c%u, %c%u, #%u\n", nm
, arr
, dd
, arr
, nn
, shift
);
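      /* Worked example of the shift re-encoding above, assuming
         SQSHL d0, d1, #3: immh:immb encodes 64+3 = 67, getLaneInfo
         returns the rightward-shift convention value 128-67 = 61, and
         lanebits - 61 = 3 then recovers the intended left shift
         amount. */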
10406 if (opcode
== BITS5(1,0,0,1,0) || opcode
== BITS5(1,0,0,1,1)
10408 && (opcode
== BITS5(1,0,0,0,0) || opcode
== BITS5(1,0,0,0,1)))) {
10409 /* -------- 0,10010 SQSHRN #imm -------- */
10410 /* -------- 1,10010 UQSHRN #imm -------- */
10411 /* -------- 0,10011 SQRSHRN #imm -------- */
10412 /* -------- 1,10011 UQRSHRN #imm -------- */
10413 /* -------- 1,10000 SQSHRUN #imm -------- */
10414 /* -------- 1,10001 SQRSHRUN #imm -------- */
10417 Bool ok
= getLaneInfo_IMMH_IMMB(&shift
, &size
, immh
, immb
);
10418 if (!ok
|| size
== X11
) return False
;
10419 vassert(size
>= X00
&& size
<= X10
);
10420 vassert(shift
>= 1 && shift
<= (8 << size
));
10421 const HChar
* nm
= "??";
10422 IROp op
= Iop_INVALID
;
10423 /* Decide on the name and the operation. */
10424 /**/ if (bitU
== 0 && opcode
== BITS5(1,0,0,1,0)) {
10425 nm
= "sqshrn"; op
= mkVecQANDqsarNNARROWSS(size
);
10427 else if (bitU
== 1 && opcode
== BITS5(1,0,0,1,0)) {
10428 nm
= "uqshrn"; op
= mkVecQANDqshrNNARROWUU(size
);
10430 else if (bitU
== 0 && opcode
== BITS5(1,0,0,1,1)) {
10431 nm
= "sqrshrn"; op
= mkVecQANDqrsarNNARROWSS(size
);
10433 else if (bitU
== 1 && opcode
== BITS5(1,0,0,1,1)) {
10434 nm
= "uqrshrn"; op
= mkVecQANDqrshrNNARROWUU(size
);
10436 else if (bitU
== 1 && opcode
== BITS5(1,0,0,0,0)) {
10437 nm
= "sqshrun"; op
= mkVecQANDqsarNNARROWSU(size
);
10439 else if (bitU
== 1 && opcode
== BITS5(1,0,0,0,1)) {
10440 nm
= "sqrshrun"; op
= mkVecQANDqrsarNNARROWSU(size
);
10443 /* Compute the result (Q, shifted value) pair. */
10444 IRTemp src128
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
+1, getQReg128(nn
));
10445 IRTemp pair
= newTempV128();
10446 assign(pair
, binop(op
, mkexpr(src128
), mkU8(shift
)));
10447 /* Update the result reg */
10448 IRTemp res64in128
= newTempV128();
10449 assign(res64in128
, unop(Iop_ZeroHI64ofV128
, mkexpr(pair
)));
10450 putQReg128(dd
, mkexpr(res64in128
));
10451 /* Update the Q flag. */
10452 IRTemp q64q64
= newTempV128();
10453 assign(q64q64
, binop(Iop_InterleaveHI64x2
, mkexpr(pair
), mkexpr(pair
)));
10454 IRTemp z128
= newTempV128();
10455 assign(z128
, mkV128(0x0000));
10456 updateQCFLAGwithDifference(q64q64
, z128
);
10458 const HChar arrNarrow
= "bhsd"[size
];
10459 const HChar arrWide
= "bhsd"[size
+1];
10460 DIP("%s %c%u, %c%u, #%u\n", nm
, arrNarrow
, dd
, arrWide
, nn
, shift
);
10464 if (immh
>= BITS4(0,1,0,0) && opcode
== BITS5(1,1,1,0,0)) {
10465 /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
10466 /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
10469 Bool ok
= getLaneInfo_IMMH_IMMB(&fbits
, &size
, immh
, immb
);
10470 /* The following holds because immh is never zero. */
10472 /* The following holds because immh >= 0100. */
10473 vassert(size
== X10
|| size
== X11
);
10474 Bool isD
= size
== X11
;
10475 Bool isU
= bitU
== 1;
10476 vassert(fbits
>= 1 && fbits
<= (isD
? 64 : 32));
10477 Double scale
= two_to_the_minus(fbits
);
10478 IRExpr
* scaleE
= isD
? IRExpr_Const(IRConst_F64(scale
))
10479 : IRExpr_Const(IRConst_F32( (Float
)scale
));
10480 IROp opMUL
= isD
? Iop_MulF64
: Iop_MulF32
;
10481 IROp opCVT
= isU
? (isD
? Iop_I64UtoF64
: Iop_I32UtoF32
)
10482 : (isD
? Iop_I64StoF64
: Iop_I32StoF32
);
10483 IRType tyF
= isD
? Ity_F64
: Ity_F32
;
10484 IRType tyI
= isD
? Ity_I64
: Ity_I32
;
10485 IRTemp src
= newTemp(tyI
);
10486 IRTemp res
= newTemp(tyF
);
10487 IRTemp rm
= mk_get_IR_rounding_mode();
10488 assign(src
, getQRegLane(nn
, 0, tyI
));
10489 assign(res
, triop(opMUL
, mkexpr(rm
),
10490 binop(opCVT
, mkexpr(rm
), mkexpr(src
)), scaleE
));
10491 putQRegLane(dd
, 0, mkexpr(res
));
10493 putQRegLane(dd
, 1, mkU32(0));
10495 putQRegLane(dd
, 1, mkU64(0));
10496 const HChar ch
= isD
? 'd' : 's';
10497 DIP("%s %c%u, %c%u, #%u\n", isU
? "ucvtf" : "scvtf",
10498 ch
, dd
, ch
, nn
, fbits
);
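      /* Worked example, assuming scvtf s0, s1, #8 with the source lane
         holding the integer 640: the lane is first converted to 640.0
         and then multiplied by two_to_the_minus(8) = 1/256, giving 2.5,
         i.e. the value of a fixed-point number with 8 fraction bits. */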
10502 if (immh
>= BITS4(0,1,0,0) && opcode
== BITS5(1,1,1,1,1)) {
10503 /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
10504 /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
10507 Bool ok
= getLaneInfo_IMMH_IMMB(&fbits
, &size
, immh
, immb
);
10508 /* The following holds because immh is never zero. */
10510 /* The following holds because immh >= 0100. */
10511 vassert(size
== X10
|| size
== X11
);
10512 Bool isD
= size
== X11
;
10513 Bool isU
= bitU
== 1;
10514 vassert(fbits
>= 1 && fbits
<= (isD
? 64 : 32));
10515 Double scale
= two_to_the_plus(fbits
);
10516 IRExpr
* scaleE
= isD
? IRExpr_Const(IRConst_F64(scale
))
10517 : IRExpr_Const(IRConst_F32( (Float
)scale
));
10518 IROp opMUL
= isD
? Iop_MulF64
: Iop_MulF32
;
10519 IROp opCVT
= isU
? (isD
? Iop_F64toI64U
: Iop_F32toI32U
)
10520 : (isD
? Iop_F64toI64S
: Iop_F32toI32S
);
10521 IRType tyF
= isD
? Ity_F64
: Ity_F32
;
10522 IRType tyI
= isD
? Ity_I64
: Ity_I32
;
10523 IRTemp src
= newTemp(tyF
);
10524 IRTemp res
= newTemp(tyI
);
10525 IRTemp rm
= newTemp(Ity_I32
);
10526 assign(src
, getQRegLane(nn
, 0, tyF
));
10527 assign(rm
, mkU32(Irrm_ZERO
));
10528 assign(res
, binop(opCVT
, mkexpr(rm
),
10529 triop(opMUL
, mkexpr(rm
), mkexpr(src
), scaleE
)));
10530 putQRegLane(dd
, 0, mkexpr(res
));
10532 putQRegLane(dd
, 1, mkU32(0));
10534 putQRegLane(dd
, 1, mkU64(0));
10535 const HChar ch
= isD
? 'd' : 's';
10536 DIP("%s %c%u, %c%u, #%u\n", isU
? "fcvtzu" : "fcvtzs",
10537 ch
, dd
, ch
, nn
, fbits
);
   return False;
#  undef INSN
}

Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   29 28    23   21 20 15     11 9 4
      01   U  11110 size 1  m  opcode 00 n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
10570 && (opcode
== BITS4(1,1,0,1)
10571 || opcode
== BITS4(1,0,0,1) || opcode
== BITS4(1,0,1,1))) {
10572 /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
10573 /* -------- 0,1001 SQDMLAL -------- */ // 1
10574 /* -------- 0,1011 SQDMLSL -------- */ // 2
10575 /* Widens, and size refers to the narrowed lanes. */
10578 case BITS4(1,1,0,1): ks
= 0; break;
10579 case BITS4(1,0,0,1): ks
= 1; break;
10580 case BITS4(1,0,1,1): ks
= 2; break;
10581 default: vassert(0);
10583 vassert(ks
>= 0 && ks
<= 2);
10584 if (size
== X00
|| size
== X11
) return False
;
10585 vassert(size
<= 2);
10586 IRTemp vecN
, vecM
, vecD
, res
, sat1q
, sat1n
, sat2q
, sat2n
;
10587 vecN
= vecM
= vecD
= res
= sat1q
= sat1n
= sat2q
= sat2n
= IRTemp_INVALID
;
10588 newTempsV128_3(&vecN
, &vecM
, &vecD
);
10589 assign(vecN
, getQReg128(nn
));
10590 assign(vecM
, getQReg128(mm
));
10591 assign(vecD
, getQReg128(dd
));
10592 math_SQDMULL_ACC(&res
, &sat1q
, &sat1n
, &sat2q
, &sat2n
,
10593 False
/*!is2*/, size
, "mas"[ks
],
10594 vecN
, vecM
, ks
== 0 ? IRTemp_INVALID
: vecD
);
10595 IROp opZHI
= mkVecZEROHIxxOFV128(size
+1);
10596 putQReg128(dd
, unop(opZHI
, mkexpr(res
)));
10597 vassert(sat1q
!= IRTemp_INVALID
&& sat1n
!= IRTemp_INVALID
);
10598 updateQCFLAGwithDifferenceZHI(sat1q
, sat1n
, opZHI
);
10599 if (sat2q
!= IRTemp_INVALID
|| sat2n
!= IRTemp_INVALID
) {
10600 updateQCFLAGwithDifferenceZHI(sat2q
, sat2n
, opZHI
);
10602 const HChar
* nm
= ks
== 0 ? "sqdmull"
10603 : (ks
== 1 ? "sqdmlal" : "sqdmlsl");
10604 const HChar arrNarrow
= "bhsd"[size
];
10605 const HChar arrWide
= "bhsd"[size
+1];
10606 DIP("%s %c%u, %c%u, %c%u\n",
10607 nm
, arrWide
, dd
, arrNarrow
, nn
, arrNarrow
, mm
);

Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   29 28    23   21 20 15     10 9 4
      01   U  11110 size 1  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
10638 if (opcode
== BITS5(0,0,0,0,1) || opcode
== BITS5(0,0,1,0,1)) {
10639 /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
10640 /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
10641 /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
10642 /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
10643 Bool isADD
= opcode
== BITS5(0,0,0,0,1);
10644 Bool isU
= bitU
== 1;
10645 IROp qop
= Iop_INVALID
;
10646 IROp nop
= Iop_INVALID
;
10648 qop
= isU
? mkVecQADDU(size
) : mkVecQADDS(size
);
10649 nop
= mkVecADD(size
);
10651 qop
= isU
? mkVecQSUBU(size
) : mkVecQSUBS(size
);
10652 nop
= mkVecSUB(size
);
10654 IRTemp argL
= newTempV128();
10655 IRTemp argR
= newTempV128();
10656 IRTemp qres
= newTempV128();
10657 IRTemp nres
= newTempV128();
10658 assign(argL
, getQReg128(nn
));
10659 assign(argR
, getQReg128(mm
));
10660 assign(qres
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10661 size
, binop(qop
, mkexpr(argL
), mkexpr(argR
)))));
10662 assign(nres
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10663 size
, binop(nop
, mkexpr(argL
), mkexpr(argR
)))));
10664 putQReg128(dd
, mkexpr(qres
));
10665 updateQCFLAGwithDifference(qres
, nres
);
10666 const HChar
* nm
= isADD
? (isU
? "uqadd" : "sqadd")
10667 : (isU
? "uqsub" : "sqsub");
10668 const HChar arr
= "bhsd"[size
];
10669 DIP("%s %c%u, %c%u, %c%u\n", nm
, arr
, dd
, arr
, nn
, arr
, mm
);
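      /* The QC update scheme above is the one used throughout this
         file: compute both the saturating result (qres) and the plain
         wrapping result (nres), and set the sticky QC flag if they
         differ in any lane.  For example, assuming UQADD b0, b1, b2
         with both source bytes 0xFF: qres is 0xFF but nres wraps to
         0xFE, so the difference is nonzero and QC gets set. */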
10673 if (size
== X11
&& opcode
== BITS5(0,0,1,1,0)) {
10674 /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
10675 /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
10676 Bool isGT
= bitU
== 0;
10677 IRExpr
* argL
= getQReg128(nn
);
10678 IRExpr
* argR
= getQReg128(mm
);
10679 IRTemp res
= newTempV128();
10681 isGT
? binop(Iop_CmpGT64Sx2
, argL
, argR
)
10682 : binop(Iop_CmpGT64Ux2
, argL
, argR
));
10683 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10684 DIP("%s %s, %s, %s\n",isGT
? "cmgt" : "cmhi",
10685 nameQRegLO(dd
, Ity_I64
),
10686 nameQRegLO(nn
, Ity_I64
), nameQRegLO(mm
, Ity_I64
));
10690 if (size
== X11
&& opcode
== BITS5(0,0,1,1,1)) {
10691 /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
10692 /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
10693 Bool isGE
= bitU
== 0;
10694 IRExpr
* argL
= getQReg128(nn
);
10695 IRExpr
* argR
= getQReg128(mm
);
10696 IRTemp res
= newTempV128();
10698 isGE
? unop(Iop_NotV128
, binop(Iop_CmpGT64Sx2
, argR
, argL
))
10699 : unop(Iop_NotV128
, binop(Iop_CmpGT64Ux2
, argR
, argL
)));
10700 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10701 DIP("%s %s, %s, %s\n", isGE
? "cmge" : "cmhs",
10702 nameQRegLO(dd
, Ity_I64
),
10703 nameQRegLO(nn
, Ity_I64
), nameQRegLO(mm
, Ity_I64
));
10707 if (size
== X11
&& (opcode
== BITS5(0,1,0,0,0)
10708 || opcode
== BITS5(0,1,0,1,0))) {
10709 /* -------- 0,xx,01000 SSHL d_d_d -------- */
10710 /* -------- 0,xx,01010 SRSHL d_d_d -------- */
10711 /* -------- 1,xx,01000 USHL d_d_d -------- */
10712 /* -------- 1,xx,01010 URSHL d_d_d -------- */
10713 Bool isU
= bitU
== 1;
10714 Bool isR
= opcode
== BITS5(0,1,0,1,0);
10715 IROp op
= isR
? (isU
? mkVecRSHU(size
) : mkVecRSHS(size
))
10716 : (isU
? mkVecSHU(size
) : mkVecSHS(size
));
10717 IRTemp res
= newTempV128();
10718 assign(res
, binop(op
, getQReg128(nn
), getQReg128(mm
)));
10719 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10720 const HChar
* nm
= isR
? (isU
? "urshl" : "srshl")
10721 : (isU
? "ushl" : "sshl");
10722 DIP("%s %s, %s, %s\n", nm
,
10723 nameQRegLO(dd
, Ity_I64
),
10724 nameQRegLO(nn
, Ity_I64
), nameQRegLO(mm
, Ity_I64
));
10728 if (opcode
== BITS5(0,1,0,0,1) || opcode
== BITS5(0,1,0,1,1)) {
10729 /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
10730 /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
10731 /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
10732 /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
10733 Bool isU
= bitU
== 1;
10734 Bool isR
= opcode
== BITS5(0,1,0,1,1);
10735 IROp op
= isR
? (isU
? mkVecQANDUQRSH(size
) : mkVecQANDSQRSH(size
))
10736 : (isU
? mkVecQANDUQSH(size
) : mkVecQANDSQSH(size
));
10737 /* This is a bit tricky. Since we're only interested in the lowest
10738 lane of the result, we zero out all the rest in the operands, so
10739 as to ensure that other lanes don't pollute the returned Q value.
10740 This works because it means, for the lanes we don't care about, we
10741 are shifting zero by zero, which can never saturate. */
10742 IRTemp res256
= newTemp(Ity_V256
);
10743 IRTemp resSH
= newTempV128();
10744 IRTemp resQ
= newTempV128();
10745 IRTemp zero
= newTempV128();
10749 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, getQReg128(nn
))),
10750 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, getQReg128(mm
)))));
10751 assign(resSH
, unop(Iop_V256toV128_0
, mkexpr(res256
)));
10752 assign(resQ
, unop(Iop_V256toV128_1
, mkexpr(res256
)));
10753 assign(zero
, mkV128(0x0000));
10754 putQReg128(dd
, mkexpr(resSH
));
10755 updateQCFLAGwithDifference(resQ
, zero
);
10756 const HChar
* nm
= isR
? (isU
? "uqrshl" : "sqrshl")
10757 : (isU
? "uqshl" : "sqshl");
10758 const HChar arr
= "bhsd"[size
];
10759 DIP("%s %c%u, %c%u, %c%u\n", nm
, arr
, dd
, arr
, nn
, arr
, mm
);
10763 if (size
== X11
&& opcode
== BITS5(1,0,0,0,0)) {
10764 /* -------- 0,11,10000 ADD d_d_d -------- */
10765 /* -------- 1,11,10000 SUB d_d_d -------- */
10766 Bool isSUB
= bitU
== 1;
10767 IRTemp res
= newTemp(Ity_I64
);
10768 assign(res
, binop(isSUB
? Iop_Sub64
: Iop_Add64
,
10769 getQRegLane(nn
, 0, Ity_I64
),
10770 getQRegLane(mm
, 0, Ity_I64
)));
10771 putQRegLane(dd
, 0, mkexpr(res
));
10772 putQRegLane(dd
, 1, mkU64(0));
10773 DIP("%s %s, %s, %s\n", isSUB
? "sub" : "add",
10774 nameQRegLO(dd
, Ity_I64
),
10775 nameQRegLO(nn
, Ity_I64
), nameQRegLO(mm
, Ity_I64
));
10779 if (size
== X11
&& opcode
== BITS5(1,0,0,0,1)) {
10780 /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
10781 /* -------- 1,11,10001 CMEQ d_d_d -------- */ // ==
10782 Bool isEQ
= bitU
== 1;
10783 IRExpr
* argL
= getQReg128(nn
);
10784 IRExpr
* argR
= getQReg128(mm
);
10785 IRTemp res
= newTempV128();
10787 isEQ
? binop(Iop_CmpEQ64x2
, argL
, argR
)
10788 : unop(Iop_NotV128
, binop(Iop_CmpEQ64x2
,
10789 binop(Iop_AndV128
, argL
, argR
),
10791 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
10792 DIP("%s %s, %s, %s\n", isEQ
? "cmeq" : "cmtst",
10793 nameQRegLO(dd
, Ity_I64
),
10794 nameQRegLO(nn
, Ity_I64
), nameQRegLO(mm
, Ity_I64
));
10798 if (opcode
== BITS5(1,0,1,1,0)) {
10799 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
10800 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
10801 if (size
== X00
|| size
== X11
) return False
;
10802 Bool isR
= bitU
== 1;
10803 IRTemp res
, sat1q
, sat1n
, vN
, vM
;
10804 res
= sat1q
= sat1n
= vN
= vM
= IRTemp_INVALID
;
10805 newTempsV128_2(&vN
, &vM
);
10806 assign(vN
, getQReg128(nn
));
10807 assign(vM
, getQReg128(mm
));
10808 math_SQDMULH(&res
, &sat1q
, &sat1n
, isR
, size
, vN
, vM
);
10810 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(res
))));
10811 updateQCFLAGwithDifference(
10812 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(sat1q
)),
10813 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(sat1n
)));
10814 const HChar arr
= "bhsd"[size
];
10815 const HChar
* nm
= isR
? "sqrdmulh" : "sqdmulh";
10816 DIP("%s %c%u, %c%u, %c%u\n", nm
, arr
, dd
, arr
, nn
, arr
, mm
);
10820 if (bitU
== 1 && size
>= X10
&& opcode
== BITS5(1,1,0,1,0)) {
10821 /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
10822 IRType ity
= size
== X11
? Ity_F64
: Ity_F32
;
10823 IRTemp res
= newTemp(ity
);
10824 assign(res
, unop(mkABSF(ity
),
10826 mkexpr(mk_get_IR_rounding_mode()),
10827 getQRegLO(nn
,ity
), getQRegLO(mm
,ity
))));
10828 putQReg128(dd
, mkV128(0x0000));
10829 putQRegLO(dd
, mkexpr(res
));
10830 DIP("fabd %s, %s, %s\n",
10831 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));
10835 if (bitU
== 0 && size
<= X01
&& opcode
== BITS5(1,1,0,1,1)) {
10836 /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
10837 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10838 IRType ity
= size
== X01
? Ity_F64
: Ity_F32
;
10839 IRTemp res
= newTemp(ity
);
10840 assign(res
, triop(mkMULF(ity
),
10841 mkexpr(mk_get_IR_rounding_mode()),
10842 getQRegLO(nn
,ity
), getQRegLO(mm
,ity
)));
10843 putQReg128(dd
, mkV128(0x0000));
10844 putQRegLO(dd
, mkexpr(res
));
10845 DIP("fmulx %s, %s, %s\n",
10846 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));
10850 if (size
<= X01
&& opcode
== BITS5(1,1,1,0,0)) {
10851 /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
10852 /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
10853 Bool isD
= size
== X01
;
10854 IRType ity
= isD
? Ity_F64
: Ity_F32
;
10855 Bool isGE
= bitU
== 1;
10856 IROp opCMP
= isGE
? (isD
? Iop_CmpLE64Fx2
: Iop_CmpLE32Fx4
)
10857 : (isD
? Iop_CmpEQ64Fx2
: Iop_CmpEQ32Fx4
);
10858 IRTemp res
= newTempV128();
10859 assign(res
, isGE
? binop(opCMP
, getQReg128(mm
), getQReg128(nn
)) // swapd
10860 : binop(opCMP
, getQReg128(nn
), getQReg128(mm
)));
10861 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD
? X11
: X10
,
10863 DIP("%s %s, %s, %s\n", isGE
? "fcmge" : "fcmeq",
10864 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));
10868 if (bitU
== 1 && size
>= X10
&& opcode
== BITS5(1,1,1,0,0)) {
10869 /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
10870 Bool isD
= size
== X11
;
10871 IRType ity
= isD
? Ity_F64
: Ity_F32
;
10872 IROp opCMP
= isD
? Iop_CmpLT64Fx2
: Iop_CmpLT32Fx4
;
10873 IRTemp res
= newTempV128();
10874 assign(res
, binop(opCMP
, getQReg128(mm
), getQReg128(nn
))); // swapd
10875 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD
? X11
: X10
,
10877 DIP("%s %s, %s, %s\n", "fcmgt",
10878 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));
10882 if (bitU
== 1 && opcode
== BITS5(1,1,1,0,1)) {
10883 /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
10884 /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
10885 Bool isD
= (size
& 1) == 1;
10886 IRType ity
= isD
? Ity_F64
: Ity_F32
;
10887 Bool isGT
= (size
& 2) == 2;
10888 IROp opCMP
= isGT
? (isD
? Iop_CmpLT64Fx2
: Iop_CmpLT32Fx4
)
10889 : (isD
? Iop_CmpLE64Fx2
: Iop_CmpLE32Fx4
);
10890 IROp opABS
= isD
? Iop_Abs64Fx2
: Iop_Abs32Fx4
;
10891 IRTemp res
= newTempV128();
10892 assign(res
, binop(opCMP
, unop(opABS
, getQReg128(mm
)),
10893 unop(opABS
, getQReg128(nn
)))); // swapd
10894 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD
? X11
: X10
,
10896 DIP("%s %s, %s, %s\n", isGT
? "facgt" : "facge",
10897 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));
10901 if (bitU
== 0 && opcode
== BITS5(1,1,1,1,1)) {
10902 /* -------- 0,0x,11111: FRECPS d_d_d, s_s_s -------- */
10903 /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
10904 Bool isSQRT
= (size
& 2) == 2;
10905 Bool isD
= (size
& 1) == 1;
10906 IROp op
= isSQRT
? (isD
? Iop_RSqrtStep64Fx2
: Iop_RSqrtStep32Fx4
)
10907 : (isD
? Iop_RecipStep64Fx2
: Iop_RecipStep32Fx4
);
10908 IRTemp res
= newTempV128();
10909 assign(res
, binop(op
, getQReg128(nn
), getQReg128(mm
)));
10910 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD
? X11
: X10
,
10912 HChar c
= isD
? 'd' : 's';
10913 DIP("%s %c%u, %c%u, %c%u\n", isSQRT
? "frsqrts" : "frecps",
10914 c
, dd
, c
, nn
, c
, mm
);

Bool dis_AdvSIMD_scalar_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn,
                                         const VexArchInfo* archinfo)
{
   /* 31   29 28    23   21 20 15     10 9 4
      01   U  11110 size 0  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   vassert(mm < 32 && nn < 32 && dd < 32);
10946 if (bitU
== 1 && (opcode
== BITS5(1,0,0,0,0) || opcode
== BITS5(1,0,0,0,1))) {
10947 /* -------- xx,10000 SQRDMLAH s and h variants only -------- */
10948 /* -------- xx,10001 SQRDMLSH s and h variants only -------- */
10949 if (size
== X00
|| size
== X11
) return False
;
10950 Bool isAdd
= opcode
== BITS5(1,0,0,0,0);
10952 IRTemp res
, res_nosat
, vD
, vN
, vM
;
10953 res
= res_nosat
= vD
= vN
= vM
= IRTemp_INVALID
;
10954 newTempsV128_3(&vD
, &vN
, &vM
);
10955 assign(vD
, getQReg128(dd
));
10956 assign(vN
, getQReg128(nn
));
10957 assign(vM
, getQReg128(mm
));
10959 math_SQRDMLAH(&res
, &res_nosat
, isAdd
, size
, vD
, vN
, vM
);
10961 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(res
))));
10962 updateQCFLAGwithDifference(
10963 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(res
)),
10964 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(res_nosat
)));
10966 const HChar arr
= "hs"[size
];
10967 const HChar
* nm
= isAdd
? "sqrdmlah" : "sqrdmlsh";
10968 DIP("%s %c%u, %c%u, %c%u\n", nm
, arr
, dd
, arr
, nn
, arr
, mm
);
10972 if (bitU
== 1 && size
== X11
&& opcode
== BITS5(0,0,0,1,0)) {
10973 /* -------- 1,11,00010 FABD h_h_h -------- */
10974 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
10976 IRTemp res
= newTemp(Ity_F16
);
10977 assign(res
, unop(mkABSF(Ity_F16
),
10978 triop(mkSUBF(Ity_F16
),
10979 mkexpr(mk_get_IR_rounding_mode()),
10980 getQRegLO(nn
,Ity_F16
), getQRegLO(mm
,Ity_F16
))));
10981 putQReg128(dd
, mkV128(0x0000));
10982 putQRegLO(dd
, mkexpr(res
));
10983 DIP("fabd %s, %s, %s\n",
10984 nameQRegLO(dd
, Ity_F16
), nameQRegLO(nn
, Ity_F16
), nameQRegLO(mm
, Ity_F16
));
10988 if (size
== X01
&& opcode
== BITS5(0,0,1,0,0)) {
10989 /* -------- 0,01,00100 FCMEQ h_h_h -------- */
10990 /* -------- 1,01,00100 FCMGE h_h_h -------- */
10991 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
10993 Bool isGE
= bitU
== 1;
10994 IROp opCMP
= isGE
? Iop_CmpLE16Fx8
: Iop_CmpEQ16Fx8
;
10995 IRTemp res
= newTempV128();
10996 /* Swap source and destination in order to use existing LE IR op for GE. */
10997 assign(res
, isGE
? binop(opCMP
, getQReg128(mm
), getQReg128(nn
))
10998 : binop(opCMP
, getQReg128(nn
), getQReg128(mm
)));
10999 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01
, mkexpr(res
))));
11000 DIP("%s %s, %s, %s\n", isGE
? "fcmge" : "fcmeq",
11001 nameQRegLO(dd
, Ity_F16
), nameQRegLO(nn
, Ity_F16
), nameQRegLO(mm
, Ity_F16
));
11005 if (bitU
== 1 && size
== X11
&& opcode
== BITS5(0,0,1,0,0)) {
11006 /* -------- 1,11,00100 FCMGT h_h_h -------- */
11007 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
11009 IRTemp res
= newTempV128();
11010 /* Swap source and destination in order to use existing LT IR op for GT. */
11011 assign(res
, binop(Iop_CmpLT16Fx8
, getQReg128(mm
), getQReg128(nn
)));
11012 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01
, mkexpr(res
))));
11013 DIP("%s %s, %s, %s\n", "fcmgt",
11014 nameQRegLO(dd
, Ity_F16
), nameQRegLO(nn
, Ity_F16
), nameQRegLO(mm
, Ity_F16
));
11018 if (bitU
== 1 && opcode
== BITS5(0,0,1,0,1)) {
11019 /* -------- 1,01,00101 FACGE h_h_h -------- */
11020 /* -------- 1,01,00101 FACGT h_h_h -------- */
11021 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
11023 IRType ity
= Ity_F16
;
11024 Bool isGT
= (size
& 2) == 2;
11025 IROp opCMP
= isGT
? Iop_CmpLT16Fx8
: Iop_CmpLE16Fx8
;
11026 IROp opABS
= Iop_Abs16Fx8
;
11027 IRTemp res
= newTempV128();
11028 assign(res
, binop(opCMP
, unop(opABS
, getQReg128(mm
)),
11029 unop(opABS
, getQReg128(nn
))));
11030 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01
,
11032 DIP("%s %s, %s, %s\n", isGT
? "facgt" : "facge",
11033 nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
), nameQRegLO(mm
, ity
));

Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   29 28    23   21    16     11 9 4
      01   U  11110 size 10000 opcode 10 n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
11063 if (opcode
== BITS5(0,0,0,1,1)) {
11064 /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
11065 /* -------- 1,xx,00011: USQADD std4_std4 -------- */
11066 /* These are a bit tricky (to say the least). See comments on
11067 the vector variants (in dis_AdvSIMD_two_reg_misc) below for
11069 Bool isUSQADD
= bitU
== 1;
11070 IROp qop
= isUSQADD
? mkVecQADDEXTSUSATUU(size
)
11071 : mkVecQADDEXTUSSATSS(size
);
11072 IROp nop
= mkVecADD(size
);
11073 IRTemp argL
= newTempV128();
11074 IRTemp argR
= newTempV128();
11075 assign(argL
, getQReg128(nn
));
11076 assign(argR
, getQReg128(dd
));
11077 IRTemp qres
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11078 size
, binop(qop
, mkexpr(argL
), mkexpr(argR
)));
11079 IRTemp nres
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11080 size
, binop(nop
, mkexpr(argL
), mkexpr(argR
)));
11081 putQReg128(dd
, mkexpr(qres
));
11082 updateQCFLAGwithDifference(qres
, nres
);
11083 const HChar arr
= "bhsd"[size
];
11084 DIP("%s %c%u, %c%u\n", isUSQADD
? "usqadd" : "suqadd", arr
, dd
, arr
, nn
);
11088 if (opcode
== BITS5(0,0,1,1,1)) {
11089 /* -------- 0,xx,00111 SQABS std4_std4 -------- */
11090 /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
11091 Bool isNEG
= bitU
== 1;
11092 IRTemp qresFW
= IRTemp_INVALID
, nresFW
= IRTemp_INVALID
;
11093 (isNEG
? math_SQNEG
: math_SQABS
)( &qresFW
, &nresFW
,
11094 getQReg128(nn
), size
);
11095 IRTemp qres
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(qresFW
));
11096 IRTemp nres
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(size
, mkexpr(nresFW
));
11097 putQReg128(dd
, mkexpr(qres
));
11098 updateQCFLAGwithDifference(qres
, nres
);
11099 const HChar arr
= "bhsd"[size
];
11100 DIP("%s %c%u, %c%u\n", isNEG
? "sqneg" : "sqabs", arr
, dd
, arr
, nn
);
11104 if (size
== X11
&& opcode
== BITS5(0,1,0,0,0)) {
11105 /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
11106 /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
11107 Bool isGT
= bitU
== 0;
11108 IRExpr
* argL
= getQReg128(nn
);
11109 IRExpr
* argR
= mkV128(0x0000);
11110 IRTemp res
= newTempV128();
11111 assign(res
, isGT
? binop(Iop_CmpGT64Sx2
, argL
, argR
)
11112 : unop(Iop_NotV128
, binop(Iop_CmpGT64Sx2
, argR
, argL
)));
11113 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
11114 DIP("cm%s d%u, d%u, #0\n", isGT
? "gt" : "ge", dd
, nn
);
11118 if (size
== X11
&& opcode
== BITS5(0,1,0,0,1)) {
11119 /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
11120 /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
11121 Bool isEQ
= bitU
== 0;
11122 IRExpr
* argL
= getQReg128(nn
);
11123 IRExpr
* argR
= mkV128(0x0000);
11124 IRTemp res
= newTempV128();
11125 assign(res
, isEQ
? binop(Iop_CmpEQ64x2
, argL
, argR
)
11126 : unop(Iop_NotV128
,
11127 binop(Iop_CmpGT64Sx2
, argL
, argR
)));
11128 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
, mkexpr(res
)));
11129 DIP("cm%s d%u, d%u, #0\n", isEQ
? "eq" : "le", dd
, nn
);
11133 if (bitU
== 0 && size
== X11
&& opcode
== BITS5(0,1,0,1,0)) {
11134 /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
11135 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
,
11136 binop(Iop_CmpGT64Sx2
, mkV128(0x0000),
11138 DIP("cm%s d%u, d%u, #0\n", "lt", dd
, nn
);
11142 if (bitU
== 0 && size
== X11
&& opcode
== BITS5(0,1,0,1,1)) {
11143 /* -------- 0,11,01011 ABS d_d -------- */
11144 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
,
11145 unop(Iop_Abs64x2
, getQReg128(nn
))));
11146 DIP("abs d%u, d%u\n", dd
, nn
);
11150 if (bitU
== 1 && size
== X11
&& opcode
== BITS5(0,1,0,1,1)) {
11151 /* -------- 1,11,01011 NEG d_d -------- */
11152 putQReg128(dd
, unop(Iop_ZeroHI64ofV128
,
11153 binop(Iop_Sub64x2
, mkV128(0x0000), getQReg128(nn
))));
11154 DIP("neg d%u, d%u\n", dd
, nn
);
11158 UInt ix
= 0; /*INVALID*/
11161 case BITS5(0,1,1,0,0): ix
= (bitU
== 1) ? 4 : 1; break;
11162 case BITS5(0,1,1,0,1): ix
= (bitU
== 1) ? 5 : 2; break;
11163 case BITS5(0,1,1,1,0): if (bitU
== 0) ix
= 3; break;
11168 /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
11169 /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
11170 /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
11171 /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
11172 /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
11173 Bool isD
= size
== X11
;
11174 IRType ity
= isD
? Ity_F64
: Ity_F32
;
11175 IROp opCmpEQ
= isD
? Iop_CmpEQ64Fx2
: Iop_CmpEQ32Fx4
;
11176 IROp opCmpLE
= isD
? Iop_CmpLE64Fx2
: Iop_CmpLE32Fx4
;
11177 IROp opCmpLT
= isD
? Iop_CmpLT64Fx2
: Iop_CmpLT32Fx4
;
11178 IROp opCmp
= Iop_INVALID
;
11180 const HChar
* nm
= "??";
11182 case 1: nm
= "fcmgt"; opCmp
= opCmpLT
; swap
= True
; break;
11183 case 2: nm
= "fcmeq"; opCmp
= opCmpEQ
; break;
11184 case 3: nm
= "fcmlt"; opCmp
= opCmpLT
; break;
11185 case 4: nm
= "fcmge"; opCmp
= opCmpLE
; swap
= True
; break;
11186 case 5: nm
= "fcmle"; opCmp
= opCmpLE
; break;
11187 default: vassert(0);
11189 IRExpr
* zero
= mkV128(0x0000);
11190 IRTemp res
= newTempV128();
11191 assign(res
, swap
? binop(opCmp
, zero
, getQReg128(nn
))
11192 : binop(opCmp
, getQReg128(nn
), zero
));
11193 putQReg128(dd
, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD
? X11
: X10
,
11196 DIP("%s %s, %s, #0.0\n", nm
, nameQRegLO(dd
, ity
), nameQRegLO(nn
, ity
));
11200 if (opcode
== BITS5(1,0,1,0,0)
11201 || (bitU
== 1 && opcode
== BITS5(1,0,0,1,0))) {
11202 /* -------- 0,xx,10100: SQXTN -------- */
11203 /* -------- 1,xx,10100: UQXTN -------- */
11204 /* -------- 1,xx,10010: SQXTUN -------- */
11205 if (size
== X11
) return False
;
11207 IROp opN
= Iop_INVALID
;
11208 Bool zWiden
= True
;
11209 const HChar
* nm
= "??";
11210 /**/ if (bitU
== 0 && opcode
== BITS5(1,0,1,0,0)) {
11211 opN
= mkVecQNARROWUNSS(size
); nm
= "sqxtn"; zWiden
= False
;
11213 else if (bitU
== 1 && opcode
== BITS5(1,0,1,0,0)) {
11214 opN
= mkVecQNARROWUNUU(size
); nm
= "uqxtn";
11216 else if (bitU
== 1 && opcode
== BITS5(1,0,0,1,0)) {
11217 opN
= mkVecQNARROWUNSU(size
); nm
= "sqxtun";
11220 IRTemp src
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11221 size
+1, getQReg128(nn
));
11222 IRTemp resN
= math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11223 size
, unop(Iop_64UtoV128
, unop(opN
, mkexpr(src
))));
11224 putQReg128(dd
, mkexpr(resN
));
11225 /* This widens zero lanes to zero, and compares it against zero, so all
11226 of the non-participating lanes make no contribution to the
11228 IRTemp resW
= math_WIDEN_LO_OR_HI_LANES(zWiden
, False
/*!fromUpperHalf*/,
11229 size
, mkexpr(resN
));
11230 updateQCFLAGwithDifference(src
, resW
);
11231 const HChar arrNarrow
= "bhsd"[size
];
11232 const HChar arrWide
= "bhsd"[size
+1];
11233 DIP("%s %c%u, %c%u\n", nm
, arrNarrow
, dd
, arrWide
, nn
);
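      /* The QC check above works by re-widening the narrowed result
         (with the matching signedness) and comparing it with the
         source lane: if narrowing saturated, the round trip cannot
         reproduce the original value.  For example, assuming
         SQXTN b0, h1 with the source halfword 0x0200: the result
         saturates to 0x7F, which widens back to 0x007F != 0x0200, so
         QC is set; a source of 0x0012 narrows to 0x12, widens back to
         0x0012, and leaves QC alone. */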
   if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
      /* -------- 1,01,10110 FCVTXN s_d -------- */
      /* Using Irrm_NEAREST here isn't right.  The docs say "round to
         odd" but I don't know what that really means. */
      putQRegLO(dd,
                binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
                      getQRegLO(nn, Ity_F64)));
      putQRegLane(dd, 1, mkU32(0));
      putQRegLane(dd, 1, mkU64(0));
      DIP("fcvtxn s%u, d%u\n", dd, nn);
      return True;
   }
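   /* (Unverified note:) "round to odd" apparently means: truncate, then force
      the result's least significant mantissa bit to 1 whenever the conversion
      was inexact, so that a later further narrowing cannot suffer double
      rounding.  There is currently no IR rounding mode that expresses it. */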
   ix = 0; /*INVALID*/
   switch (opcode) {
      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
      default: break;
   }
   if (ix > 0) {
      /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
      /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
      /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
      /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
      /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
      /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
      /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
      /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
      /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
      /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
      Bool   isD = (size & 1) == 1;
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      switch (ix) {
         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
         default: vassert(0);
      }
      IROp cvt = Iop_INVALID;
      if (bitU == 1) {
         cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
      } else {
         cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
      }
      IRTemp src = newTemp(tyF);
      IRTemp res = newTemp(tyI);
      assign(src, getQRegLane(nn, 0, tyF));
      assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
      putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
      HChar sOrD = isD ? 'd' : 's';
      DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
          sOrD, dd, sOrD, nn);
      return True;
   }
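   /* The ix==3 (FCVTA*) mapping to Irrm_NEAREST is only an approximation:
      the architectural rounding mode is round-to-nearest with ties away from
      zero, whereas Irrm_NEAREST is ties-to-even, hence the "kludge?" above. */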
   if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
      /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
      Bool   isU = bitU == 1;
      Bool   isD = (size & 1) == 1;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IROp   iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                       : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRTemp rm  = mk_get_IR_rounding_mode();
      putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
      HChar c = isD ? 'd' : 's';
      DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
      return True;
   }
   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
      /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
      Bool isSQRT = bitU == 1;
      Bool isD    = (size & 1) == 1;
      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
      IRTemp resV = newTempV128();
      assign(resV, unop(op, getQReg128(nn)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(resV))));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
      return True;
   }
   if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
      Bool   isD = (size & 1) == 1;
      IRType ty  = isD ? Ity_F64 : Ity_F32;
      IROp   op  = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
      IRTemp res = newTemp(ty);
      IRTemp rm  = mk_get_IR_rounding_mode();
      assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLane(dd, 0, mkexpr(res));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_scalar_two_reg_misc_fp16(/*MB_OUT*/DisResult* dres, UInt insn,
                                          const VexArchInfo* archinfo)
{
   /* This decode function only handles instructions with half-precision
      floating-point (fp16) operands. */
   if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
      return False;

   /* 31   29 28    23   21    16     11 9 4
      01 U 11110 size 11100 opcode 10 n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,1,1,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size == 3);

   /* Decoding FCM<condition> based on opcode and bitU. ix is used to select
      the right <condition>. */
   UInt ix = 0; // Invalid <condition>
   switch (opcode) {
      case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 4 : 1; break; // FCMLE=4,FCMEQ=1
      case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 5 : 2; break; // FCMGE=5,FCMGT=2
      case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;    // FCMLT=3
      default: break;
   }
   if (ix > 0) {
      /* -------- 0,01101 FCMEQ h_h_#0.0 (ix 1) -------- */
      /* -------- 0,01100 FCMGT h_h_#0.0 (ix 2) -------- */
      /* -------- 0,01110 FCMLT h_h_#0.0 (ix 3) -------- */
      /* -------- 1,01101 FCMLE h_h_#0.0 (ix 4) -------- */
      /* -------- 1,01100 FCMGE h_h_#0.0 (ix 5) -------- */
      IRType ity   = Ity_F16;
      IROp   opCmp = Iop_INVALID;
      Bool   swap  = False;
      const HChar* nm = "??";
      switch (ix) {
         case 1: nm = "fcmeq"; opCmp = Iop_CmpEQ16Fx8; break;
         case 2: nm = "fcmgt"; opCmp = Iop_CmpLT16Fx8; swap = True; break;
         case 3: nm = "fcmlt"; opCmp = Iop_CmpLT16Fx8; break;
         case 4: nm = "fcmle"; opCmp = Iop_CmpLE16Fx8; break;
         case 5: nm = "fcmge"; opCmp = Iop_CmpLE16Fx8; swap = True; break;
         default: vassert(0);
      }
      IRExpr* zero = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
                       : binop(opCmp, getQReg128(nn), zero));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01, mkexpr(res))));

      DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28    23   21 20 19 15     11   9 4
      01 U 11111 size L  M  m  opcode H 0 n d
      Decode fields are: u,size,opcode
      M is really part of the mm register number.  Individual
      cases need to inspect L and H though.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt bitL   = INSN(21,21);
   UInt bitM   = INSN(20,20);
   UInt mmLO4  = INSN(19,16);
   UInt opcode = INSN(15,12);
   UInt bitH   = INSN(11,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   vassert(bitH < 2 && bitM < 2 && bitL < 2);

   if (bitU == 0 && size >= X10
       && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
      /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
      /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = opcode == BITS4(0,1,0,1);
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      IRTemp t2    = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t2))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }
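   /* Splitting the fused multiply-add into a separate multiply and add means
      the intermediate product is rounded once and the sum a second time, so
      the least significant mantissa bit of the result can differ from that of
      a true fused FMLA/FMLS. */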
   if (size >= X10 && opcode == BITS4(1,0,0,1)) {
      /* -------- 0,1x,1001 FMUL  d_d_d[], s_s_s[] -------- */
      /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 1;
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t1))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS4(1,0,1,1)
           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,1): ks = 0; break;
         case BITS4(0,0,1,1): ks = 1; break;
         case BITS4(0,1,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_2(&vecN, &vecD);
      assign(vecN, getQReg128(nn));
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       False/*!is2*/, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar  arrNarrow = "bhsd"[size];
      const HChar  arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
          nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
      return True;
   }
   if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
      /* -------- 0,xx,1100 SQDMULH  s and h variants only -------- */
      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isR = opcode == BITS4(1,1,0,1);
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      vN = newTempV128();
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      IROp opZHI = mkVecZEROHIxxOFV128(size);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
      HChar ch        = size == X01 ? 'h' : 's';
      DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
      return True;
   }
   if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
      /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isAdd = opcode == BITS4(1,1,0,1);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vD, &vN);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);

      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = mkVecZEROHIxxOFV128(size);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);

      const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
      HChar ch        = size == X01 ? 'h' : 's';
      DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28     22   18   15     10 9 4
      0 q u 011110 immh immb opcode 1  n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt immh   = INSN(22,19);
   UInt immb   = INSN(18,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,00000 SSHR std7_std7_#imm -------- */
      /* -------- 1,00000 USHR std7_std7_#imm -------- */
      /* -------- 0,00010 SSRA std7_std7_#imm -------- */
      /* -------- 1,00010 USRA std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,0,1,0);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  shf = newTempV128();
      IRTemp  res = newTempV128();
      if (shift == lanebits && isU) {
         assign(shf, mkV128(0x0000));
      } else {
         UInt nudge = 0;
         if (shift == lanebits) {
            /* Only possible for the signed case.  An arithmetic shift right
               by the full lane width gives the same result as a shift by one
               less, which the IR shift ops can express. */
            nudge = 1;
         }
         assign(shf, binop(op, src, mkU8(shift - nudge)));
      }
      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
                              : (isU ? "ushr" : "sshr");
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
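   /* Worked example of the immh:immb decode above: immh:immb = 0001:011
      selects B lanes with shift = 8 - 3 = 5, i.e. "sshr Vd.8b, Vn.8b, #5"
      (or .16b when Q == 1). */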
   if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
      /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
      /* -------- 1,00100 URSHR std7_std7_#imm -------- */
      /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
      /* -------- 1,00110 URSRA std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,1,1,0);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IROp    op   = isU ? mkVecRSHU(size) : mkVecRSHS(size);
      IRExpr* src  = getQReg128(nn);
      IRTemp  imm8 = newTemp(Ity_I8);
      assign(imm8, mkU8((UChar)(-shift)));
      IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
      IRTemp  shf  = newTempV128();
      IRTemp  res  = newTempV128();
      assign(shf, binop(op, src, amt));
      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
                              : (isU ? "urshr" : "srshr");
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
   if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 1,01000 SRI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == lanebits) {
         assign(res, getQReg128(dd));
      } else {
         assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
         IRExpr* nmask = binop(mkVecSHLN(size),
                               mkV128(0xFFFF), mkU8(lanebits - shift));
         IRTemp  tmp = newTempV128();
         assign(tmp, binop(Iop_OrV128,
                           mkexpr(res),
                           binop(Iop_AndV128, getQReg128(dd), nmask)));
         res = tmp;
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
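   /* For SRI, nmask (all-ones shifted left by lanebits-shift) selects the top
      'shift' bits of each destination lane, which SRI must preserve; the
      right-shifted source supplies the remaining low bits. */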
   if (opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,01010 SHL std7_std7_#imm -------- */
      /* -------- 1,01010 SLI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, xxx
                         001x:xxx -> H, xxxx
                         01xx:xxx -> S, xxxxx
                         1xxx:xxx -> D, xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isSLI = bitU == 1;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      IROp    op  = mkVecSHLN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == 0) {
         assign(res, src);
      } else {
         assign(res, binop(op, src, mkU8(shift)));
         if (isSLI) {
            IRExpr* nmask = binop(mkVecSHRN(size),
                                  mkV128(0xFFFF), mkU8(lanebits - shift));
            IRTemp  tmp = newTempV128();
            assign(tmp, binop(Iop_OrV128,
                              mkexpr(res),
                              binop(Iop_AndV128, getQReg128(dd), nmask)));
            res = tmp;
         }
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isSLI ? "sli" : "shl";
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
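   /* SLI is the mirror image of SRI: nmask (all-ones shifted right by
      lanebits-shift) selects the low 'shift' bits of each destination lane,
      which the left-shifted source must leave undisturbed. */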
   if (opcode == BITS5(0,1,1,1,0)
       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
      /* -------- 0,01110 SQSHL  std7_std7_#imm -------- */
      /* -------- 1,01110 UQSHL  std7_std7_#imm -------- */
      /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      const HChar* nm = NULL;
      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
      else vassert(0);
      IRTemp qDiff1 = IRTemp_INVALID;
      IRTemp qDiff2 = IRTemp_INVALID;
      IRTemp res    = IRTemp_INVALID;
      IRTemp src    = newTempV128();
      assign(src, getQReg128(nn));
      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
                                    isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
      /* -------- 0,10000 SHRN{,2}  #imm -------- */
      /* -------- 0,10001 RSHRN{,2} #imm -------- */
      /* Narrows, and size is the narrow size. */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool isR   = opcode == BITS5(1,0,0,0,1);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1);
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      IRTemp t3 = newTempV128();
      assign(t1, getQReg128(nn));
      assign(t2, isR ? binop(mkVecADD(size+1),
                             mkexpr(t1),
                             mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
                     : mkexpr(t1));
      assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
      IRTemp t4 = math_NARROW_LANES(t3, t3, size);
      putLO64andZUorPutHI64(is2, dd, t4);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }
   if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
       || (bitU == 1
           && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
      /* -------- 0,10010 SQSHRN{,2}   #imm -------- */
      /* -------- 1,10010 UQSHRN{,2}   #imm -------- */
      /* -------- 0,10011 SQRSHRN{,2}  #imm -------- */
      /* -------- 1,10011 UQRSHRN{,2}  #imm -------- */
      /* -------- 1,10000 SQSHRUN{,2}  #imm -------- */
      /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1 && shift <= (8 << size));
      const HChar* nm = "??";
      IROp op = Iop_INVALID;
      /* Decide on the name and the operation. */
      /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
         nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
      }
      else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
         nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
         nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
         nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
         nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
      }
      else vassert(0);
      /* Compute the result (Q, shifted value) pair. */
      IRTemp src128 = newTempV128();
      assign(src128, getQReg128(nn));
      IRTemp pair = newTempV128();
      assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
      /* Update the result reg */
      IRTemp res64in128 = newTempV128();
      assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
      putLO64andZUorPutHI64(is2, dd, res64in128);
      /* Update the Q flag. */
      IRTemp q64q64 = newTempV128();
      assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
      IRTemp z128 = newTempV128();
      assign(z128, mkV128(0x0000));
      updateQCFLAGwithDifference(q64q64, z128);

      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }
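   /* The mkVecQANDq...NARROW ops used here produce a pair packed into one
      V128: the narrowed lanes in the lower 64 bits and saturation information
      in the upper 64 bits, which is why the Q flag update above just checks
      whether the upper half is nonzero. */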
   if (opcode == BITS5(1,0,1,0,0)) {
      /* -------- 0,10100 SSHLL{,2} #imm -------- */
      /* -------- 1,10100 USHLL{,2} #imm -------- */
      /* 31  28     22   18   15     9 4
         0q0 011110 immh immb 101001 n d  SSHLL Vd.Ta, Vn.Tb, #sh
         0q1 011110 immh immb 101001 n d  USHLL Vd.Ta, Vn.Tb, #sh
         where Ta, Tb, sh
           = case immh of 1xxx -> invalid
                          01xx -> 2d, 2s(q0)/4s(q1),  immh:immb - 32 (0..31)
                          001x -> 4s, 4h(q0)/8h(q1),  immh:immb - 16 (0..15)
                          0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8  (0..7)
                          0000 -> AdvSIMD modified immediate (???)
      */
      Bool    isQ   = bitQ == 1;
      Bool    isU   = bitU == 1;
      UInt    immhb = (immh << 3) | immb;
      IRTemp  src   = newTempV128();
      IRTemp  zero  = newTempV128();
      IRExpr* res   = NULL;
      UInt    sh    = 0;
      const HChar* ta = "??";
      const HChar* tb = "??";
      assign(src, getQReg128(nn));
      assign(zero, mkV128(0x0000));
      if (immh & 8) {
         /* invalid; don't assign to res */
      }
      else if (immh & 4) {
         sh = immhb - 32;
         vassert(sh < 32); /* so 32-sh is 1..32 */
         ta = "2d";
         tb = isQ ? "4s" : "2s";
         IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
                           : mk_InterleaveLO32x4(src, zero);
         res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
      }
      else if (immh & 2) {
         sh = immhb - 16;
         vassert(sh < 16); /* so 16-sh is 1..16 */
         ta = "4s";
         tb = isQ ? "8h" : "4h";
         IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
                           : mk_InterleaveLO16x8(src, zero);
         res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
      }
      else if (immh & 1) {
         sh = immhb - 8;
         vassert(sh < 8); /* so 8-sh is 1..8 */
         ta = "8h";
         tb = isQ ? "16b" : "8b";
         IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
                           : mk_InterleaveLO8x16(src, zero);
         res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
      } else {
         vassert(immh == 0);
         /* invalid; don't assign to res */
      }
      if (res == NULL) return False;
      putQReg128(dd, res);
      DIP("%cshll%s %s.%s, %s.%s, #%u\n",
          isU ? 'u' : 's', isQ ? "2" : "",
          nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
      return True;
   }
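   /* The widening trick above: interleaving the source with zero puts each
      narrow lane into the top half of a double-width lane; a subsequent
      arithmetic (signed) or logical (unsigned) right shift by lanebits-sh
      then yields the sign/zero-extended value already shifted left by sh. */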
   if (opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,11100 SCVTF {2d_2d, 4s_4s, 2s_2s}_imm -------- */
      /* -------- 1,11100 UCVTF {2d_2d, 4s_4s, 2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_minus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp   opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp   opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                         : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyI);
         IRTemp res = newTemp(tyF);
         IRTemp rm  = mk_get_IR_rounding_mode();
         assign(src, getQRegLane(nn, i, tyI));
         assign(res, triop(opMUL, mkexpr(rm),
                           binop(opCVT, mkexpr(rm), mkexpr(src)),
                           scaleE));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }
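   /* That is, each lane is converted to FP and then multiplied by 2^-fbits,
      which is how a fixed-point value with 'fbits' fraction bits is scaled:
      e.g. with fbits = 8, the integer 300 becomes 300/256 = 1.171875. */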
   if (opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,11111 FCVTZS {2d_2d, 4s_4s, 2s_2s}_imm -------- */
      /* -------- 1,11111 FCVTZU {2d_2d, 4s_4s, 2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_plus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp   opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp   opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
                         : (isD ? Iop_F64toI64S : Iop_F32toI32S);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyF);
         IRTemp res = newTemp(tyI);
         IRTemp rm  = newTemp(Ity_I32);
         assign(src, getQRegLane(nn, i, tyF));
         assign(rm,  mkU32(Irrm_ZERO));
         assign(res, binop(opCVT, mkexpr(rm),
                           triop(opMUL, mkexpr(rm),
                                 mkexpr(src), scaleE)));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15     11 9 4
      0  Q  U  01110 size 1  m  opcode 00 n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   Bool is2 = bitQ == 1;

   if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
      /* -------- 0,0000 SADDL{2} -------- */
      /* -------- 1,0000 UADDL{2} -------- */
      /* -------- 0,0010 SSUBL{2} -------- */
      /* -------- 1,0010 USUBL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool   isU   = bitU == 1;
      Bool   isADD = opcode == BITS4(0,0,0,0);
      IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp res   = newTempV128();
      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                        mkexpr(argL), mkexpr(argR)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
                              : (isU ? "usubl" : "ssubl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
      /* -------- 0,0001 SADDW{2} -------- */
      /* -------- 1,0001 UADDW{2} -------- */
      /* -------- 0,0011 SSUBW{2} -------- */
      /* -------- 1,0011 USUBW{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool   isU   = bitU == 1;
      Bool   isADD = opcode == BITS4(0,0,0,1);
      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp res   = newTempV128();
      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                        getQReg128(nn), mkexpr(argR)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
                              : (isU ? "usubw" : "ssubw");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
      /* -------- 0,0100  ADDHN{2} -------- */
      /* -------- 1,0100 RADDHN{2} -------- */
      /* -------- 0,0110  SUBHN{2} -------- */
      /* -------- 1,0110 RSUBHN{2} -------- */
      /* Narrows, and size refers to the narrowed lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      const UInt shift[3] = { 8, 16, 32 };
      Bool isADD = opcode == BITS4(0,1,0,0);
      Bool isR   = bitU == 1;
      /* Combined elements in wide lanes */
      IRTemp  wide  = newTempV128();
      IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                            getQReg128(nn), getQReg128(mm));
      if (isR) {
         wideE = binop(mkVecADD(size+1),
                       wideE,
                       mkexpr(math_VEC_DUP_IMM(size+1,
                                               1ULL << (shift[size]-1))));
      }
      assign(wide, wideE);
      /* Top halves of elements, still in wide lanes */
      IRTemp shrd = newTempV128();
      assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
      /* Elements now compacted into lower 64 bits */
      IRTemp new64 = newTempV128();
      assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
      putLO64andZUorPutHI64(is2, dd, new64);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
                              : (isR ? "rsubhn" : "subhn");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrNarrow,
          nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
      return True;
   }
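   /* For the rounding (R) variants, adding 1 << (shift[size]-1), i.e. half
      the weight of the discarded low half, before taking the top half rounds
      the narrowed result to nearest rather than truncating it. */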
   if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
      /* -------- 0,0101 SABAL{2} -------- */
      /* -------- 1,0101 UABAL{2} -------- */
      /* -------- 0,0111 SABDL{2} -------- */
      /* -------- 1,0111 UABDL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool   isU   = bitU == 1;
      Bool   isACC = opcode == BITS4(0,1,0,1);
      IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp abd   = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
      IRTemp res   = newTempV128();
      assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
                        : mkexpr(abd));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
                              : (isU ? "uabdl" : "sabdl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (opcode == BITS4(1,1,0,0)
       || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
      /* -------- 0,1100  SMULL{2} -------- */ // 0 (ks)
      /* -------- 1,1100  UMULL{2} -------- */ // 0
      /* -------- 0,1000  SMLAL{2} -------- */ // 1
      /* -------- 1,1000  UMLAL{2} -------- */ // 1
      /* -------- 0,1010  SMLSL{2} -------- */ // 2
      /* -------- 1,1010  UMLSL{2} -------- */ // 2
      /* Widens, and size refers to the narrow lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,1,0,0): ks = 0; break;
         case BITS4(1,0,0,0): ks = 1; break;
         case BITS4(1,0,1,0): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      if (size == X11) return False;
      vassert(size <= 2);
      Bool   isU  = bitU == 1;
      IRTemp vecN = newTempV128();
      IRTemp vecM = newTempV128();
      IRTemp vecD = newTempV128();
      assign(vecN, getQReg128(nn));
      assign(vecM, getQReg128(mm));
      assign(vecD, getQReg128(dd));
      IRTemp res = IRTemp_INVALID;
      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
      DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS4(1,1,0,1)
           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
      /* -------- 0,1101  SQDMULL{2} -------- */ // 0 (ks)
      /* -------- 0,1001  SQDMLAL{2} -------- */ // 1
      /* -------- 0,1011  SQDMLSL{2} -------- */ // 2
      /* Widens, and size refers to the narrow lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,1,0,1): ks = 0; break;
         case BITS4(1,0,0,1): ks = 1; break;
         case BITS4(1,0,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      if (size == X00 || size == X11) return False;
      vassert(size <= 2);
      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_3(&vecN, &vecM, &vecD);
      assign(vecN, getQReg128(nn));
      assign(vecM, getQReg128(mm));
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       is2, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifference(sat1q, sat1n);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifference(sat2q, sat2n);
      }
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1, size+1);
      const HChar* nm = ks == 0 ? "sqdmull"
                                : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
      /* -------- 0,1110  PMULL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size != X00 && size != X11) return False;
      IRTemp  res  = IRTemp_INVALID;
      IRExpr* srcN = getQReg128(nn);
      IRExpr* srcM = getQReg128(mm);
      const HChar* arrNarrow = NULL;
      const HChar* arrWide   = NULL;
      if (size == X00) {
         res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
                                         srcN, srcM);
         arrNarrow = nameArr_Q_SZ(bitQ, size);
         arrWide   = nameArr_Q_SZ(1, size+1);
      } else {
         /* The same thing as the X00 case, except we have to call
            a helper to do it. */
         vassert(size == X11);
         res = newTemp(Ity_V128);
         IROp slice
            = is2 ? Iop_V128HIto64 : Iop_V128to64;
         IRExpr** args
            = mkIRExprVec_3( IRExpr_VECRET(),
                             unop(slice, srcN), unop(slice, srcM));
         IRDirty* di
            = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                 "arm64g_dirtyhelper_PMULLQ",
                                 &arm64g_dirtyhelper_PMULLQ, args);
         stmt(IRStmt_Dirty(di));
         /* We can't use nameArr_Q_SZ for this because it can't deal with
            Q-sized (128 bit) results.  Hence do it by hand. */
         arrNarrow = bitQ == 0 ? "1d" : "2d";
         arrWide   = "1q";
      }
      putQReg128(dd, mkexpr(res));
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15     10 9 4
      0  Q  U  01110 size 1  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
      /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
      /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
      /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
      /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
      if (size == X11) return False;
      Bool isADD = opcode == BITS5(0,0,0,0,0);
      Bool isU   = bitU == 1;
      /* Widen both args out, do the math, narrow to final result. */
      IRTemp argL   = newTempV128();
      IRTemp argLhi = IRTemp_INVALID;
      IRTemp argLlo = IRTemp_INVALID;
      IRTemp argR   = newTempV128();
      IRTemp argRhi = IRTemp_INVALID;
      IRTemp argRlo = IRTemp_INVALID;
      IRTemp resHi  = newTempV128();
      IRTemp resLo  = newTempV128();
      IRTemp res    = IRTemp_INVALID;
      assign(argL, getQReg128(nn));
      argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
      argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argL));
      assign(argR, getQReg128(mm));
      argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
      argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argR));
      IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
      IROp opSxR    = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
      assign(resHi, binop(opSxR,
                          binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
                          mkU8(1)));
      assign(resLo, binop(opSxR,
                          binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
                          mkU8(1)));
      res = math_NARROW_LANES ( resHi, resLo, size );
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isADD ? (isU ? "uhadd" : "shadd")
                               : (isU ? "uhsub" : "shsub");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
      /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool   isU  = bitU == 1;
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(mm));
      IRTemp res = math_RHADD(size, isU, argL, argR);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
      /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
      /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
      /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
      /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isADD = opcode == BITS5(0,0,0,0,1);
      Bool isU   = bitU == 1;
      IROp qop   = Iop_INVALID;
      IROp nop   = Iop_INVALID;
      if (isADD) {
         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
         nop = mkVecADD(size);
      } else {
         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
         nop = mkVecSUB(size);
      }
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      IRTemp qres = newTempV128();
      IRTemp nres = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(mm));
      assign(qres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
      assign(nres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
                               : (isU ? "uqsub" : "sqsub");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
      Bool   isORx  = (size & 2) == 2;
      Bool   invert = (size & 1) == 1;
      IRTemp res    = newTempV128();
      assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
                        getQReg128(nn),
                        invert ? unop(Iop_NotV128, getQReg128(mm))
                               : getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* names[4] = { "and", "bic", "orr", "orn" };
      const HChar* ar = bitQ == 1 ? "16b" : "8b";
      DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
          nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
      return True;
   }
== 1 && opcode
== BITS5(0,0,0,1,1)) {
12553 /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
12554 /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
12555 /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
12556 /* -------- 1,10,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
12557 IRTemp argD
= newTempV128();
12558 IRTemp argN
= newTempV128();
12559 IRTemp argM
= newTempV128();
12560 assign(argD
, getQReg128(dd
));
12561 assign(argN
, getQReg128(nn
));
12562 assign(argM
, getQReg128(mm
));
12563 const IROp opXOR
= Iop_XorV128
;
12564 const IROp opAND
= Iop_AndV128
;
12565 const IROp opNOT
= Iop_NotV128
;
12566 IRTemp res
= newTempV128();
12568 case BITS2(0,0): /* EOR */
12569 assign(res
, binop(opXOR
, mkexpr(argM
), mkexpr(argN
)));
12571 case BITS2(0,1): /* BSL */
12572 assign(res
, binop(opXOR
, mkexpr(argM
),
12574 binop(opXOR
, mkexpr(argM
), mkexpr(argN
)),
12577 case BITS2(1,0): /* BIT */
12578 assign(res
, binop(opXOR
, mkexpr(argD
),
12580 binop(opXOR
, mkexpr(argD
), mkexpr(argN
)),
12583 case BITS2(1,1): /* BIF */
12584 assign(res
, binop(opXOR
, mkexpr(argD
),
12586 binop(opXOR
, mkexpr(argD
), mkexpr(argN
)),
12587 unop(opNOT
, mkexpr(argM
)))));
12592 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
12593 const HChar
* nms
[4] = { "eor", "bsl", "bit", "bif" };
12594 const HChar
* arr
= bitQ
== 1 ? "16b" : "8b";
12595 DIP("%s %s.%s, %s.%s, %s.%s\n", nms
[size
],
12596 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
), arr
);
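   /* The three bitwise-select forms above all use the identity
      x ^ ((x ^ y) & m)  ==  (y & m) | (x & ~m): BSL selects between M and N
      under control of D, BIT inserts N into D under M, and BIF inserts N
      into D under the complement of M. */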
   if (opcode == BITS5(0,0,1,1,0)) {
      /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
      /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isGT = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGT ? binop(mkVecCMPGTS(size), argL, argR)
                  : binop(mkVecCMPGTU(size), argL, argR));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isGT ? "cmgt" : "cmhi";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
      /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isGE = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
                  : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isGE ? "cmge" : "cmhs";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,xx,01000 SSHL  std7_std7_std7 -------- */
      /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
      /* -------- 1,xx,01000 USHL  std7_std7_std7 -------- */
      /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,0);
      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
                     : (isU ? mkVecSHU(size) : mkVecSHS(size));
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
                             : (isU ? "ushl" : "sshl");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,xx,01001 SQSHL  std7_std7_std7 -------- */
      /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
      /* -------- 1,xx,01001 UQSHL  std7_std7_std7 -------- */
      /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,1);
      IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
                     : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
      /* This is a bit tricky.  If we're only interested in the lowest 64 bits
         of the result (viz, bitQ == 0), then we must adjust the operands to
         ensure that the upper part of the result, that we don't care about,
         doesn't pollute the returned Q value.  To do this, zero out the upper
         operand halves beforehand.  This works because it means, for the
         lanes we don't care about, we are shifting zero by zero, which can
         never saturate. */
      IRTemp res256 = newTemp(Ity_V256);
      IRTemp resSH  = newTempV128();
      IRTemp resQ   = newTempV128();
      IRTemp zero   = newTempV128();
      assign(res256, binop(op,
                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
      assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
      assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
      assign(zero,  mkV128(0x0000));
      putQReg128(dd, mkexpr(resSH));
      updateQCFLAGwithDifference(resQ, zero);
      const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
                             : (isU ? "uqshl" : "sqshl");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
      /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
      /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
      /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
      /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU   = bitU == 1;
      Bool isMAX = (opcode & 1) == 0;
      IROp op    = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
                         : (isU ? mkVecMINU(size) : mkVecMINS(size));
      IRTemp t = newTempV128();
      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
      const HChar* nm  = isMAX ? (isU ? "umax" : "smax")
                               : (isU ? "umin" : "smin");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
      /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
      /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
      /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
      /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
      if (size == X11) return False; // 1d/2d cases not allowed
      Bool isU   = bitU == 1;
      Bool isACC = opcode == BITS5(0,1,1,1,1);
      vassert(size <= 2);
      IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
      IRTemp t2 = newTempV128();
      assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
                       : mkexpr(t1));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* nm  = isACC ? (isU ? "uaba" : "saba")
                               : (isU ? "uabd" : "sabd");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,0,0)) {
      /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
      /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isSUB = bitU == 1;
      IROp op    = isSUB ? mkVecSUB(size) : mkVecADD(size);
      IRTemp t   = newTempV128();
      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
      const HChar* nm  = isSUB ? "sub" : "add";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,0,1)) {
      /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
      /* -------- 1,xx,10001 CMEQ  std7_std7_std7 -------- */ // ==
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isEQ = bitU == 1;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isEQ ? binop(mkVecCMPEQ(size), argL, argR)
                  : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
                                            binop(Iop_AndV128, argL, argR),
                                            mkV128(0x0000))));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isEQ ? "cmeq" : "cmtst";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,1,0)) {
      /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
      /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isMLS    = bitU == 1;
      IROp opMUL    = mkVecMUL(size);
      IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
      IRTemp res = newTempV128();
      if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
         assign(res, binop(opADDSUB,
                           getQReg128(dd),
                           binop(opMUL, getQReg128(nn), getQReg128(mm))));
         putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
         const HChar* arr = nameArr_Q_SZ(bitQ, size);
         DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
         return True;
      }
      return False;
   }
12797 if (opcode
== BITS5(1,0,0,1,1)) {
12798 /* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
12799 /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
12800 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
12801 Bool isPMUL
= bitU
== 1;
12802 const IROp opsPMUL
[4]
12803 = { Iop_PolynomialMul8x16
, Iop_INVALID
, Iop_INVALID
, Iop_INVALID
};
12804 IROp opMUL
= isPMUL
? opsPMUL
[size
] : mkVecMUL(size
);
12805 IRTemp res
= newTempV128();
12806 if (opMUL
!= Iop_INVALID
) {
12807 assign(res
, binop(opMUL
, getQReg128(nn
), getQReg128(mm
)));
12808 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
12809 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
12810 DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL
? "pmul" : "mul",
12811 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
), arr
);
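
   /* Note on the integer pairwise cases (SMAXP/UMAXP/SMINP/UMINP, and ADDP
      further below): there is no dedicated pairwise primop.  Instead the
      even lanes and the odd lanes of the M:N concatenation are gathered
      into two full-width vectors (CATEVENLANES/CATODDLANES) and the plain
      lanewise op is applied to those; corresponding lanes of the gathered
      vectors are exactly the adjacent pairs of the original elements. */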
   if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
      /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
      /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
      /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
      /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
      if (size == X11) return False;
      Bool isU   = bitU == 1;
      Bool isMAX = opcode == BITS5(1,0,1,0,0);
      IRTemp vN  = newTempV128();
      IRTemp vM  = newTempV128();
      IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
                      : (isU ? mkVecMINU(size) : mkVecMINS(size));
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      IRTemp res128 = newTempV128();
      assign(res128,
             binop(op,
                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEL32x4 to extract the half-width
         result from the full-width result. */
      IRExpr* res
         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
                                                        mkexpr(res128)))
                     : mkexpr(res128);
      putQReg128(dd, res);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isMAX ? (isU ? "umaxp" : "smaxp")
                               : (isU ? "uminp" : "sminp");
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
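
   /* Note for the saturating cases that follow: the math_* helpers return
      both a saturated result and the result computed without saturation.
      updateQCFLAGwithDifferenceZHI then sets the QC (cumulative saturation)
      flag if the two differ in the lanes actually written back. */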
   if (opcode == BITS5(1,0,1,1,0)) {
      /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isR = bitU == 1;
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vN, &vM);
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
      /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      IRTemp vN = newTempV128();
      IRTemp vM = newTempV128();
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      IRTemp res128 = newTempV128();
      assign(res128,
             binop(mkVecADD(size),
                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEL32x4 to extract the half-width
         result from the full-width result. */
      IRExpr* res
         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
                                                        mkexpr(res128)))
                     : mkexpr(res128);
      putQReg128(dd, res);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("addp %s.%s, %s.%s, %s.%s\n",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
      /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,0x,11110 FMAX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11110 FMIN 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool   isMIN = (size & 2) == 2;
      Bool   isNM  = opcode == BITS5(1,1,0,0,0);
      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
      IRTemp res   = newTempV128();
      assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
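
   /* Note for the FMLA/FMLS case below: the fused multiply-add is split
      into a separate multiply and add/sub (see the FIXME), so the product
      is rounded before the accumulate and the result can differ from a
      true fused operation in the least significant mantissa bit.  The
      rounding mode comes from FPCR via mk_get_IR_rounding_mode(). */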
   if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
      /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL,
                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      const IROp ops[4]
         = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
      IROp   op = ops[size];
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
      putQReg128(dd, mkexpr(t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      // FIXME: use Abd primop instead?
      assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, unop(opABS, mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("fabd %s.%s, %s.%s, %s.%s\n",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
      /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11011 FMUL 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 0;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
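
   /* Note for the FP compare cases below: only EQ/LE/LT vector compare
      primops are available, so GE and GT are implemented by swapping the
      operands and using LE/LT instead (marked "swapd"). */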
   if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool   isGE  = bitU == 1;
      IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
                          : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
      IRTemp t1    = newTempV128();
      assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
                      : binop(opCMP, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IRTemp t1    = newTempV128();
      assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD  = (size & 1) == 1;
      Bool isGT = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
                          : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
      IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp t1    = newTempV128();
      assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
                              unop(opABS, getQReg128(nn)))); // swapd
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
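
   /* Note for the FP pairwise cases below: math_REARRANGE_FOR_FLOATING_PAIRWISE
      is assumed to split the M:N concatenation into two vectors (preL, preR)
      whose corresponding lanes hold the two members of each adjacent pair, so
      that a plain lanewise op on preL/preR yields the pairwise result. */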
   if (bitU == 1
       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
      /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11110 FMAXP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11110 FMINP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool   isMIN = (size & 2) == 2;
      Bool   isNM  = opcode == BITS5(1,1,0,0,0);
      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
      IRTemp srcN  = newTempV128();
      IRTemp srcM  = newTempV128();
      IRTemp preL  = IRTemp_INVALID;
      IRTemp preR  = IRTemp_INVALID;
      assign(srcN, getQReg128(nn));
      assign(srcM, getQReg128(mm));
      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
                                           isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
      putQReg128(
         dd, math_MAYBE_ZERO_HI64_fromE(
                bitQ,
                binop(opMXX, mkexpr(preL), mkexpr(preR))));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = size == X01;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp srcN = newTempV128();
      IRTemp srcM = newTempV128();
      IRTemp preL = IRTemp_INVALID;
      IRTemp preR = IRTemp_INVALID;
      assign(srcN, getQReg128(nn));
      assign(srcM, getQReg128(mm));
      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
                                           isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
      putQReg128(
         dd, math_MAYBE_ZERO_HI64_fromE(
                bitQ,
                triop(mkVecADDF(isD ? 3 : 2),
                      mkexpr(mk_get_IR_rounding_mode()),
                      mkexpr(preL), mkexpr(preR))));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      vassert(size <= 1);
      const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
      IROp   op = ops[size];
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
      putQReg128(dd, mkexpr(t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,0x,11111: FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isSQRT = (size & 2) == 2;
      Bool isD    = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
                       : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15 14     10 9 4
      0  Q  U  01110 size 0  m  1  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(15,15) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(14,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   vassert(mm < 32 && nn < 32 && dd < 32);

   if (bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,0,1))) {
      /* -------- 0,xx,10110 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,10110 SQRDMLSH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isAdd = opcode == BITS4(0,0,0,0);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_3(&vD, &vN, &vM);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));

      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));

      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isAdd ? "sqrdmlah" : "sqrdmlsh";
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_three_same_fp16(/*MB_OUT*/DisResult* dres, UInt insn,
                                 const VexArchInfo* archinfo)
{
   /* This decode function only handles instructions with half-precision
      floating-point (fp16) operands.
   */
   if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
      return False;

   /* 31 30 29 28    23   21 20 15     10 9 4
      0  Q  U  01110 size 0  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   vassert(mm < 32 && nn < 32 && dd < 32);

   if (bitU == 1 && size == X01 && opcode == BITS5(0,0,0,1,0)) {
      /* -------- 1,01,00010 FADDP 4h_4h_4h, 8h_8h_8h -------- */
      IROp opADD = mkVecADDF(1); //bitQ == 0 ? 0 : 1);
      IRTemp srcN = newTempV128();
      IRTemp srcM = newTempV128();
      IRTemp preL = IRTemp_INVALID;
      IRTemp preR = IRTemp_INVALID;
      assign(srcN, getQReg128(nn));
      assign(srcM, getQReg128(mm));
      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
                                           ARM64VSizeH, bitQ);
      putQReg128(
         dd, math_MAYBE_ZERO_HI64_fromE(
                bitQ,
                triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
                      mkexpr(preL), mkexpr(preR))));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size == X11 && opcode == BITS5(0,0,0,1,0)) {
      /* -------- 1,11,00010 FABD 4h_4h_4h, 8h_8h_8h -------- */
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(Iop_Sub16Fx8, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, unop(Iop_Abs16Fx8, mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fabd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (size == X01 && opcode == BITS5(0,0,1,0,0)) {
      /* -------- 0,01,00100 FCMEQ 4h_4h_4h, 8h_8h_8h -------- */
      /* -------- 1,01,00100 FCMGE 4h_4h_4h, 8h_8h_8h -------- */
      Bool isGE = bitU == 1;
      IRTemp t1 = newTempV128();
      /* Swap source and destination in order to use existing LE IR op for GE. */
      assign(t1, isGE ? binop(Iop_CmpLE16Fx8, getQReg128(mm), getQReg128(nn))
                      : binop(Iop_CmpEQ16Fx8, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (size == X11 && opcode == BITS5(0,0,1,0,0)) {
      /* -------- 1,11,00100 FCMGT 4h_4h_4h, 8h_8h_8h -------- */
      IRTemp t1 = newTempV128();
      /* Swap source and destination in order to use existing LT IR op for GT. */
      assign(t1, binop(Iop_CmpLT16Fx8, getQReg128(mm), getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 1 && opcode == BITS5(0,0,1,0,1)) {
      /* -------- 1,01,00101 FACGE 4h_4h_4h 8h_8h_8h -------- */
      /* -------- 1,11,00101 FACGT 4h_4h_4h 8h_8h_8h -------- */
      Bool isGT  = (size & 3) == 3;
      IROp opCMP = isGT ? Iop_CmpLT16Fx8 : Iop_CmpLE16Fx8;
      IROp opABS = Iop_Abs16Fx8;
      IRTemp t1  = newTempV128();
      assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
                              unop(opABS, getQReg128(nn))));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (bitU == 0 && size == X01 && opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,01,00010 FADD 4h_4h_4h, 8h_8h_8h -------- */
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(Iop_Add16Fx8, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
      putQReg128(dd, mkexpr(t2));
      const HChar* arr = bitQ == 0 ? "4h" : "8h";
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   return False;
#  undef INSN
}

13334 Bool
dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult
* dres
, UInt insn
)
13336 /* 31 30 29 28 23 21 16 11 9 4
13337 0 Q U 01110 size 10000 opcode 10 n d
13338 Decode fields: U,size,opcode
13340 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13341 if (INSN(31,31) != 0
13342 || INSN(28,24) != BITS5(0,1,1,1,0)
13343 || INSN(21,17) != BITS5(1,0,0,0,0)
13344 || INSN(11,10) != BITS2(1,0)) {
13347 UInt bitQ
= INSN(30,30);
13348 UInt bitU
= INSN(29,29);
13349 UInt size
= INSN(23,22);
13350 UInt opcode
= INSN(16,12);
13351 UInt nn
= INSN(9,5);
13352 UInt dd
= INSN(4,0);
13355 if (bitU
== 0 && size
<= X10
&& opcode
== BITS5(0,0,0,0,0)) {
13356 /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
13357 /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
13358 /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
13359 const IROp iops
[3] = { Iop_Reverse8sIn64_x2
,
13360 Iop_Reverse16sIn64_x2
, Iop_Reverse32sIn64_x2
};
13361 vassert(size
<= 2);
13362 IRTemp res
= newTempV128();
13363 assign(res
, unop(iops
[size
], getQReg128(nn
)));
13364 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13365 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13366 DIP("%s %s.%s, %s.%s\n", "rev64",
13367 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13371 if (bitU
== 1 && size
<= X01
&& opcode
== BITS5(0,0,0,0,0)) {
13372 /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
13373 /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
13374 Bool isH
= size
== X01
;
13375 IRTemp res
= newTempV128();
13376 IROp iop
= isH
? Iop_Reverse16sIn32_x4
: Iop_Reverse8sIn32_x4
;
13377 assign(res
, unop(iop
, getQReg128(nn
)));
13378 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13379 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13380 DIP("%s %s.%s, %s.%s\n", "rev32",
13381 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13385 if (bitU
== 0 && size
== X00
&& opcode
== BITS5(0,0,0,0,1)) {
13386 /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
13387 IRTemp res
= newTempV128();
13388 assign(res
, unop(Iop_Reverse8sIn16_x8
, getQReg128(nn
)));
13389 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13390 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13391 DIP("%s %s.%s, %s.%s\n", "rev16",
13392 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13396 if (opcode
== BITS5(0,0,0,1,0) || opcode
== BITS5(0,0,1,1,0)) {
13397 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
13398 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
13399 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
13400 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
13401 /* Widens, and size refers to the narrow size. */
13402 if (size
== X11
) return False
; // no 1d or 2d cases
13403 Bool isU
= bitU
== 1;
13404 Bool isACC
= opcode
== BITS5(0,0,1,1,0);
13405 IRTemp src
= newTempV128();
13406 IRTemp sum
= newTempV128();
13407 IRTemp res
= newTempV128();
13408 assign(src
, getQReg128(nn
));
13410 binop(mkVecADD(size
+1),
13411 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
13412 isU
, True
/*fromOdd*/, size
, mkexpr(src
))),
13413 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
13414 isU
, False
/*!fromOdd*/, size
, mkexpr(src
)))));
13415 assign(res
, isACC
? binop(mkVecADD(size
+1), mkexpr(sum
), getQReg128(dd
))
13417 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13418 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, size
);
13419 const HChar
* arrWide
= nameArr_Q_SZ(bitQ
, size
+1);
13420 DIP("%s %s.%s, %s.%s\n", isACC
? (isU
? "uadalp" : "sadalp")
13421 : (isU
? "uaddlp" : "saddlp"),
13422 nameQReg128(dd
), arrWide
, nameQReg128(nn
), arrNarrow
);
13426 if (opcode
== BITS5(0,0,0,1,1)) {
13427 /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
13428 /* -------- 1,xx,00011: USQADD std7_std7 -------- */
13429 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13430 Bool isUSQADD
= bitU
== 1;
13431 /* This is switched (in the US vs SU sense) deliberately.
13432 SUQADD corresponds to the ExtUSsatSS variants and
13433 USQADD corresponds to the ExtSUsatUU variants.
13434 See libvex_ir for more details. */
13435 IROp qop
= isUSQADD
? mkVecQADDEXTSUSATUU(size
)
13436 : mkVecQADDEXTUSSATSS(size
);
13437 IROp nop
= mkVecADD(size
);
13438 IRTemp argL
= newTempV128();
13439 IRTemp argR
= newTempV128();
13440 IRTemp qres
= newTempV128();
13441 IRTemp nres
= newTempV128();
13442 /* Because the two arguments to the addition are implicitly
13443 extended differently (one signedly, the other unsignedly) it is
13444 important to present them to the primop in the correct order. */
13445 assign(argL
, getQReg128(nn
));
13446 assign(argR
, getQReg128(dd
));
13447 assign(qres
, math_MAYBE_ZERO_HI64_fromE(
13448 bitQ
, binop(qop
, mkexpr(argL
), mkexpr(argR
))));
13449 assign(nres
, math_MAYBE_ZERO_HI64_fromE(
13450 bitQ
, binop(nop
, mkexpr(argL
), mkexpr(argR
))));
13451 putQReg128(dd
, mkexpr(qres
));
13452 updateQCFLAGwithDifference(qres
, nres
);
13453 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13454 DIP("%s %s.%s, %s.%s\n", isUSQADD
? "usqadd" : "suqadd",
13455 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13459 if (opcode
== BITS5(0,0,1,0,0)) {
13460 /* -------- 0,xx,00100: CLS std6_std6 -------- */
13461 /* -------- 1,xx,00100: CLZ std6_std6 -------- */
13462 if (size
== X11
) return False
; // no 1d or 2d cases
13463 const IROp opsCLS
[3] = { Iop_Cls8x16
, Iop_Cls16x8
, Iop_Cls32x4
};
13464 const IROp opsCLZ
[3] = { Iop_Clz8x16
, Iop_Clz16x8
, Iop_Clz32x4
};
13465 Bool isCLZ
= bitU
== 1;
13466 IRTemp res
= newTempV128();
13467 vassert(size
<= 2);
13468 assign(res
, unop(isCLZ
? opsCLZ
[size
] : opsCLS
[size
], getQReg128(nn
)));
13469 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13470 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13471 DIP("%s %s.%s, %s.%s\n", isCLZ
? "clz" : "cls",
13472 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13476 if (size
== X00
&& opcode
== BITS5(0,0,1,0,1)) {
13477 /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
13478 /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
13479 IRTemp res
= newTempV128();
13480 assign(res
, unop(bitU
== 0 ? Iop_Cnt8x16
: Iop_NotV128
, getQReg128(nn
)));
13481 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13482 const HChar
* arr
= nameArr_Q_SZ(bitQ
, 0);
13483 DIP("%s %s.%s, %s.%s\n", bitU
== 0 ? "cnt" : "not",
13484 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13488 if (bitU
== 1 && size
== X01
&& opcode
== BITS5(0,0,1,0,1)) {
13489 /* -------- 1,01,00101 RBIT 16b_16b, 8b_8b -------- */
13490 IRTemp res
= newTempV128();
13491 assign(res
, unop(Iop_Reverse1sIn8_x16
, getQReg128(nn
)));
13492 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13493 const HChar
* arr
= nameArr_Q_SZ(bitQ
, 0);
13494 DIP("%s %s.%s, %s.%s\n", "rbit",
13495 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13499 if (opcode
== BITS5(0,0,1,1,1)) {
13500 /* -------- 0,xx,00111 SQABS std7_std7 -------- */
13501 /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
13502 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13503 Bool isNEG
= bitU
== 1;
13504 IRTemp qresFW
= IRTemp_INVALID
, nresFW
= IRTemp_INVALID
;
13505 (isNEG
? math_SQNEG
: math_SQABS
)( &qresFW
, &nresFW
,
13506 getQReg128(nn
), size
);
13507 IRTemp qres
= newTempV128(), nres
= newTempV128();
13508 assign(qres
, math_MAYBE_ZERO_HI64(bitQ
, qresFW
));
13509 assign(nres
, math_MAYBE_ZERO_HI64(bitQ
, nresFW
));
13510 putQReg128(dd
, mkexpr(qres
));
13511 updateQCFLAGwithDifference(qres
, nres
);
13512 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13513 DIP("%s %s.%s, %s.%s\n", isNEG
? "sqneg" : "sqabs",
13514 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13518 if (opcode
== BITS5(0,1,0,0,0)) {
13519 /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
13520 /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
13521 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13522 Bool isGT
= bitU
== 0;
13523 IRExpr
* argL
= getQReg128(nn
);
13524 IRExpr
* argR
= mkV128(0x0000);
13525 IRTemp res
= newTempV128();
13526 IROp opGTS
= mkVecCMPGTS(size
);
13527 assign(res
, isGT
? binop(opGTS
, argL
, argR
)
13528 : unop(Iop_NotV128
, binop(opGTS
, argR
, argL
)));
13529 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13530 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13531 DIP("cm%s %s.%s, %s.%s, #0\n", isGT
? "gt" : "ge",
13532 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13536 if (opcode
== BITS5(0,1,0,0,1)) {
13537 /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
13538 /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
13539 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13540 Bool isEQ
= bitU
== 0;
13541 IRExpr
* argL
= getQReg128(nn
);
13542 IRExpr
* argR
= mkV128(0x0000);
13543 IRTemp res
= newTempV128();
13544 assign(res
, isEQ
? binop(mkVecCMPEQ(size
), argL
, argR
)
13545 : unop(Iop_NotV128
,
13546 binop(mkVecCMPGTS(size
), argL
, argR
)));
13547 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13548 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13549 DIP("cm%s %s.%s, %s.%s, #0\n", isEQ
? "eq" : "le",
13550 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13554 if (bitU
== 0 && opcode
== BITS5(0,1,0,1,0)) {
13555 /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
13556 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13557 IRExpr
* argL
= getQReg128(nn
);
13558 IRExpr
* argR
= mkV128(0x0000);
13559 IRTemp res
= newTempV128();
13560 assign(res
, binop(mkVecCMPGTS(size
), argR
, argL
));
13561 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13562 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13563 DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
13564 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13568 if (bitU
== 0 && opcode
== BITS5(0,1,0,1,1)) {
13569 /* -------- 0,xx,01011: ABS std7_std7 -------- */
13570 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13571 IRTemp res
= newTempV128();
13572 assign(res
, unop(mkVecABS(size
), getQReg128(nn
)));
13573 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13574 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13575 DIP("abs %s.%s, %s.%s\n", nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13579 if (bitU
== 1 && opcode
== BITS5(0,1,0,1,1)) {
13580 /* -------- 1,xx,01011: NEG std7_std7 -------- */
13581 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13582 IRTemp res
= newTempV128();
13583 assign(res
, binop(mkVecSUB(size
), mkV128(0x0000), getQReg128(nn
)));
13584 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13585 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13586 DIP("neg %s.%s, %s.%s\n", nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13590 UInt ix
= 0; /*INVALID*/
13593 case BITS5(0,1,1,0,0): ix
= (bitU
== 1) ? 4 : 1; break;
13594 case BITS5(0,1,1,0,1): ix
= (bitU
== 1) ? 5 : 2; break;
13595 case BITS5(0,1,1,1,0): if (bitU
== 0) ix
= 3; break;
13600 /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
13601 /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
13602 /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
13603 /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
13604 /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
13605 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13606 Bool isD
= size
== X11
;
13607 IROp opCmpEQ
= isD
? Iop_CmpEQ64Fx2
: Iop_CmpEQ32Fx4
;
13608 IROp opCmpLE
= isD
? Iop_CmpLE64Fx2
: Iop_CmpLE32Fx4
;
13609 IROp opCmpLT
= isD
? Iop_CmpLT64Fx2
: Iop_CmpLT32Fx4
;
13610 IROp opCmp
= Iop_INVALID
;
13612 const HChar
* nm
= "??";
13614 case 1: nm
= "fcmgt"; opCmp
= opCmpLT
; swap
= True
; break;
13615 case 2: nm
= "fcmeq"; opCmp
= opCmpEQ
; break;
13616 case 3: nm
= "fcmlt"; opCmp
= opCmpLT
; break;
13617 case 4: nm
= "fcmge"; opCmp
= opCmpLE
; swap
= True
; break;
13618 case 5: nm
= "fcmle"; opCmp
= opCmpLE
; break;
13619 default: vassert(0);
13621 IRExpr
* zero
= mkV128(0x0000);
13622 IRTemp res
= newTempV128();
13623 assign(res
, swap
? binop(opCmp
, zero
, getQReg128(nn
))
13624 : binop(opCmp
, getQReg128(nn
), zero
));
13625 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13626 const HChar
* arr
= bitQ
== 0 ? "2s" : (size
== X11
? "2d" : "4s");
13627 DIP("%s %s.%s, %s.%s, #0.0\n", nm
,
13628 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13632 if (size
>= X10
&& opcode
== BITS5(0,1,1,1,1)) {
13633 /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
13634 /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
13635 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
13636 Bool isFNEG
= bitU
== 1;
13637 IROp op
= isFNEG
? (size
== X10
? Iop_Neg32Fx4
: Iop_Neg64Fx2
)
13638 : (size
== X10
? Iop_Abs32Fx4
: Iop_Abs64Fx2
);
13639 IRTemp res
= newTempV128();
13640 assign(res
, unop(op
, getQReg128(nn
)));
13641 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13642 const HChar
* arr
= bitQ
== 0 ? "2s" : (size
== X11
? "2d" : "4s");
13643 DIP("%s %s.%s, %s.%s\n", isFNEG
? "fneg" : "fabs",
13644 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13648 if (bitU
== 0 && opcode
== BITS5(1,0,0,1,0)) {
13649 /* -------- 0,xx,10010: XTN{,2} -------- */
13650 if (size
== X11
) return False
;
13652 Bool is2
= bitQ
== 1;
13653 IROp opN
= mkVecNARROWUN(size
);
13654 IRTemp resN
= newTempV128();
13655 assign(resN
, unop(Iop_64UtoV128
, unop(opN
, getQReg128(nn
))));
13656 putLO64andZUorPutHI64(is2
, dd
, resN
);
13657 const HChar
* nm
= "xtn";
13658 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, size
);
13659 const HChar
* arrWide
= nameArr_Q_SZ(1, size
+1);
13660 DIP("%s%s %s.%s, %s.%s\n", is2
? "2" : "", nm
,
13661 nameQReg128(dd
), arrNarrow
, nameQReg128(nn
), arrWide
);
13665 if (opcode
== BITS5(1,0,1,0,0)
13666 || (bitU
== 1 && opcode
== BITS5(1,0,0,1,0))) {
13667 /* -------- 0,xx,10100: SQXTN{,2} -------- */
13668 /* -------- 1,xx,10100: UQXTN{,2} -------- */
13669 /* -------- 1,xx,10010: SQXTUN{,2} -------- */
13670 if (size
== X11
) return False
;
13672 Bool is2
= bitQ
== 1;
13673 IROp opN
= Iop_INVALID
;
13674 Bool zWiden
= True
;
13675 const HChar
* nm
= "??";
13676 /**/ if (bitU
== 0 && opcode
== BITS5(1,0,1,0,0)) {
13677 opN
= mkVecQNARROWUNSS(size
); nm
= "sqxtn"; zWiden
= False
;
13679 else if (bitU
== 1 && opcode
== BITS5(1,0,1,0,0)) {
13680 opN
= mkVecQNARROWUNUU(size
); nm
= "uqxtn";
13682 else if (bitU
== 1 && opcode
== BITS5(1,0,0,1,0)) {
13683 opN
= mkVecQNARROWUNSU(size
); nm
= "sqxtun";
13686 IRTemp src
= newTempV128();
13687 assign(src
, getQReg128(nn
));
13688 IRTemp resN
= newTempV128();
13689 assign(resN
, unop(Iop_64UtoV128
, unop(opN
, mkexpr(src
))));
13690 putLO64andZUorPutHI64(is2
, dd
, resN
);
13691 IRTemp resW
= math_WIDEN_LO_OR_HI_LANES(zWiden
, False
/*!fromUpperHalf*/,
13692 size
, mkexpr(resN
));
13693 updateQCFLAGwithDifference(src
, resW
);
13694 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, size
);
13695 const HChar
* arrWide
= nameArr_Q_SZ(1, size
+1);
13696 DIP("%s%s %s.%s, %s.%s\n", is2
? "2" : "", nm
,
13697 nameQReg128(dd
), arrNarrow
, nameQReg128(nn
), arrWide
);
13701 if (bitU
== 1 && opcode
== BITS5(1,0,0,1,1)) {
13702 /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
13703 /* Widens, and size is the narrow size. */
13704 if (size
== X11
) return False
;
13705 Bool is2
= bitQ
== 1;
13706 IROp opINT
= is2
? mkVecINTERLEAVEHI(size
) : mkVecINTERLEAVELO(size
);
13707 IROp opSHL
= mkVecSHLN(size
+1);
13708 IRTemp src
= newTempV128();
13709 IRTemp res
= newTempV128();
13710 assign(src
, getQReg128(nn
));
13711 assign(res
, binop(opSHL
, binop(opINT
, mkexpr(src
), mkexpr(src
)),
13713 putQReg128(dd
, mkexpr(res
));
13714 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, size
);
13715 const HChar
* arrWide
= nameArr_Q_SZ(1, size
+1);
13716 DIP("shll%s %s.%s, %s.%s, #%d\n", is2
? "2" : "",
13717 nameQReg128(dd
), arrWide
, nameQReg128(nn
), arrNarrow
, 8 << size
);
13721 if (bitU
== 0 && size
<= X01
&& opcode
== BITS5(1,0,1,1,0)) {
13722 /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
13723 UInt nLanes
= size
== X00
? 4 : 2;
13724 IRType srcTy
= size
== X00
? Ity_F32
: Ity_F64
;
13725 IROp opCvt
= size
== X00
? Iop_F32toF16
: Iop_F64toF32
;
13726 IRTemp rm
= mk_get_IR_rounding_mode();
13727 IRTemp src
[nLanes
];
13728 for (UInt i
= 0; i
< nLanes
; i
++) {
13729 src
[i
] = newTemp(srcTy
);
13730 assign(src
[i
], getQRegLane(nn
, i
, srcTy
));
13732 for (UInt i
= 0; i
< nLanes
; i
++) {
13733 putQRegLane(dd
, nLanes
* bitQ
+ i
,
13734 binop(opCvt
, mkexpr(rm
), mkexpr(src
[i
])));
13737 putQRegLane(dd
, 1, mkU64(0));
13739 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, 1+size
);
13740 const HChar
* arrWide
= nameArr_Q_SZ(1, 1+size
+1);
13741 DIP("fcvtn%s %s.%s, %s.%s\n", bitQ
? "2" : "",
13742 nameQReg128(dd
), arrNarrow
, nameQReg128(nn
), arrWide
);
13746 if (bitU
== 1 && size
== X01
&& opcode
== BITS5(1,0,1,1,0)) {
13747 /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
13748 /* Using Irrm_NEAREST here isn't right. The docs say "round to
13749 odd" but I don't know what that really means. */
13750 IRType srcTy
= Ity_F64
;
13751 IROp opCvt
= Iop_F64toF32
;
13753 for (UInt i
= 0; i
< 2; i
++) {
13754 src
[i
] = newTemp(srcTy
);
13755 assign(src
[i
], getQRegLane(nn
, i
, srcTy
));
13757 for (UInt i
= 0; i
< 2; i
++) {
13758 putQRegLane(dd
, 2 * bitQ
+ i
,
13759 binop(opCvt
, mkU32(Irrm_NEAREST
), mkexpr(src
[i
])));
13762 putQRegLane(dd
, 1, mkU64(0));
13764 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, 1+size
);
13765 const HChar
* arrWide
= nameArr_Q_SZ(1, 1+size
+1);
13766 DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ
? "2" : "",
13767 nameQReg128(dd
), arrNarrow
, nameQReg128(nn
), arrWide
);
13771 if (bitU
== 0 && size
<= X01
&& opcode
== BITS5(1,0,1,1,1)) {
13772 /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
13773 UInt nLanes
= size
== X00
? 4 : 2;
13774 IRType srcTy
= size
== X00
? Ity_F16
: Ity_F32
;
13775 IROp opCvt
= size
== X00
? Iop_F16toF32
: Iop_F32toF64
;
13776 IRTemp src
[nLanes
];
13777 for (UInt i
= 0; i
< nLanes
; i
++) {
13778 src
[i
] = newTemp(srcTy
);
13779 assign(src
[i
], getQRegLane(nn
, nLanes
* bitQ
+ i
, srcTy
));
13781 for (UInt i
= 0; i
< nLanes
; i
++) {
13782 putQRegLane(dd
, i
, unop(opCvt
, mkexpr(src
[i
])));
13784 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, 1+size
);
13785 const HChar
* arrWide
= nameArr_Q_SZ(1, 1+size
+1);
13786 DIP("fcvtl%s %s.%s, %s.%s\n", bitQ
? "2" : "",
13787 nameQReg128(dd
), arrWide
, nameQReg128(nn
), arrNarrow
);
13792 if (opcode
== BITS5(1,1,0,0,0) || opcode
== BITS5(1,1,0,0,1)) {
13793 ix
= 1 + ((((bitU
& 1) << 2) | ((size
& 2) << 0)) | ((opcode
& 1) << 0));
13794 // = 1 + bitU[0]:size[1]:opcode[0]
13795 vassert(ix
>= 1 && ix
<= 8);
13796 if (ix
== 7) ix
= 0;
13799 /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
13800 /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
13801 /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
13802 /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
13803 /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
13804 /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
13805 /* -------- 1,1x,11000 (apparently unassigned) (7) -------- */
13806 /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
13808 FRINTN: tieeven -- !! FIXME KLUDGED !!
13812 FRINTA: tieaway -- !! FIXME KLUDGED !!
13813 FRINTX: per FPCR + "exact = TRUE"
13816 Bool isD
= (size
& 1) == 1;
13817 if (bitQ
== 0 && isD
) return False
; // implied 1d case
13819 IRTemp irrmRM
= mk_get_IR_rounding_mode();
13822 IRTemp irrm
= newTemp(Ity_I32
);
13824 case 1: ch
= 'n'; assign(irrm
, mkU32(Irrm_NEAREST
)); break;
13825 case 2: ch
= 'm'; assign(irrm
, mkU32(Irrm_NegINF
)); break;
13826 case 3: ch
= 'p'; assign(irrm
, mkU32(Irrm_PosINF
)); break;
13827 case 4: ch
= 'z'; assign(irrm
, mkU32(Irrm_ZERO
)); break;
13828 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
13829 case 5: ch
= 'a'; assign(irrm
, mkU32(Irrm_NEAREST
)); break;
13830 // I am unsure about the following, due to the "integral exact"
13831 // description in the manual. What does it mean? (frintx, that is)
13832 case 6: ch
= 'x'; assign(irrm
, mkexpr(irrmRM
)); break;
13833 case 8: ch
= 'i'; assign(irrm
, mkexpr(irrmRM
)); break;
13834 default: vassert(0);
13837 IROp opRND
= isD
? Iop_RoundF64toInt
: Iop_RoundF32toInt
;
13839 for (UInt i
= 0; i
< 2; i
++) {
13840 putQRegLane(dd
, i
, binop(opRND
, mkexpr(irrm
),
13841 getQRegLane(nn
, i
, Ity_F64
)));
13844 UInt n
= bitQ
==1 ? 4 : 2;
13845 for (UInt i
= 0; i
< n
; i
++) {
13846 putQRegLane(dd
, i
, binop(opRND
, mkexpr(irrm
),
13847 getQRegLane(nn
, i
, Ity_F32
)));
13850 putQRegLane(dd
, 1, mkU64(0)); // zero out lanes 2 and 3
13852 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13853 DIP("frint%c %s.%s, %s.%s\n", ch
,
13854 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13858 ix
= 0; /*INVALID*/
13860 case BITS5(1,1,0,1,0): ix
= ((size
& 2) == 2) ? 4 : 1; break;
13861 case BITS5(1,1,0,1,1): ix
= ((size
& 2) == 2) ? 5 : 2; break;
13862 case BITS5(1,1,1,0,0): if ((size
& 2) == 0) ix
= 3; break;
13866 /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13867 /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13868 /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13869 /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13870 /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13871 /* -------- 1,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13872 /* -------- 1,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13873 /* -------- 1,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13874 /* -------- 1,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13875 /* -------- 1,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13876 Bool isD
= (size
& 1) == 1;
13877 if (bitQ
== 0 && isD
) return False
; // implied 1d case
13879 IRRoundingMode irrm
= 8; /*impossible*/
13882 case 1: ch
= 'n'; irrm
= Irrm_NEAREST
; break;
13883 case 2: ch
= 'm'; irrm
= Irrm_NegINF
; break;
13884 case 3: ch
= 'a'; irrm
= Irrm_NEAREST
; break; /* kludge? */
13885 case 4: ch
= 'p'; irrm
= Irrm_PosINF
; break;
13886 case 5: ch
= 'z'; irrm
= Irrm_ZERO
; break;
13887 default: vassert(0);
13889 IROp cvt
= Iop_INVALID
;
13891 cvt
= isD
? Iop_F64toI64U
: Iop_F32toI32U
;
13893 cvt
= isD
? Iop_F64toI64S
: Iop_F32toI32S
;
13896 for (UInt i
= 0; i
< 2; i
++) {
13897 putQRegLane(dd
, i
, binop(cvt
, mkU32(irrm
),
13898 getQRegLane(nn
, i
, Ity_F64
)));
13901 UInt n
= bitQ
==1 ? 4 : 2;
13902 for (UInt i
= 0; i
< n
; i
++) {
13903 putQRegLane(dd
, i
, binop(cvt
, mkU32(irrm
),
13904 getQRegLane(nn
, i
, Ity_F32
)));
13907 putQRegLane(dd
, 1, mkU64(0)); // zero out lanes 2 and 3
13909 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13910 DIP("fcvt%c%c %s.%s, %s.%s\n", ch
, bitU
== 1 ? 'u' : 's',
13911 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13915 if (size
== X10
&& opcode
== BITS5(1,1,1,0,0)) {
13916 /* -------- 0,10,11100: URECPE 4s_4s, 2s_2s -------- */
13917 /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
13918 Bool isREC
= bitU
== 0;
13919 IROp op
= isREC
? Iop_RecipEst32Ux4
: Iop_RSqrtEst32Ux4
;
13920 IRTemp res
= newTempV128();
13921 assign(res
, unop(op
, getQReg128(nn
)));
13922 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
13923 const HChar
* nm
= isREC
? "urecpe" : "ursqrte";
13924 const HChar
* arr
= nameArr_Q_SZ(bitQ
, size
);
13925 DIP("%s %s.%s, %s.%s\n", nm
,
13926 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13930 if (size
<= X01
&& opcode
== BITS5(1,1,1,0,1)) {
13931 /* -------- 0,0x,11101: SCVTF -------- */
13932 /* -------- 1,0x,11101: UCVTF -------- */
13933 /* 31 28 22 21 15 9 4
13934 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn
13935 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn
13937 case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
13939 Bool isQ
= bitQ
== 1;
13940 Bool isU
= bitU
== 1;
13941 Bool isF64
= (size
& 1) == 1;
13942 if (isQ
|| !isF64
) {
13943 IRType tyF
= Ity_INVALID
, tyI
= Ity_INVALID
;
13945 Bool zeroHI
= False
;
13946 const HChar
* arrSpec
= NULL
;
13947 Bool ok
= getLaneInfo_Q_SZ(&tyI
, &tyF
, &nLanes
, &zeroHI
, &arrSpec
,
13949 IROp iop
= isU
? (isF64
? Iop_I64UtoF64
: Iop_I32UtoF32
)
13950 : (isF64
? Iop_I64StoF64
: Iop_I32StoF32
);
13951 IRTemp rm
= mk_get_IR_rounding_mode();
13953 vassert(ok
); /* the 'if' above should ensure this */
13954 for (i
= 0; i
< nLanes
; i
++) {
13956 binop(iop
, mkexpr(rm
), getQRegLane(nn
, i
, tyI
)));
13959 putQRegLane(dd
, 1, mkU64(0));
13961 DIP("%ccvtf %s.%s, %s.%s\n", isU
? 'u' : 's',
13962 nameQReg128(dd
), arrSpec
, nameQReg128(nn
), arrSpec
);
13965 /* else fall through */
13968 if (size
>= X10
&& opcode
== BITS5(1,1,1,0,1)) {
13969 /* -------- 0,1x,11101: FRECPE 2d_2d, 4s_4s, 2s_2s -------- */
13970 /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
13971 Bool isSQRT
= bitU
== 1;
13972 Bool isD
= (size
& 1) == 1;
13973 IROp op
= isSQRT
? (isD
? Iop_RSqrtEst64Fx2
: Iop_RSqrtEst32Fx4
)
13974 : (isD
? Iop_RecipEst64Fx2
: Iop_RecipEst32Fx4
);
13975 if (bitQ
== 0 && isD
) return False
; // implied 1d case
13976 IRTemp resV
= newTempV128();
13977 assign(resV
, unop(op
, getQReg128(nn
)));
13978 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, resV
));
13979 const HChar
* arr
= bitQ
== 0 ? "2s" : (size
== X11
? "2d" : "4s");
13980 DIP("%s %s.%s, %s.%s\n", isSQRT
? "frsqrte" : "frecpe",
13981 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
13985 if (bitU
== 1 && size
>= X10
&& opcode
== BITS5(1,1,1,1,1)) {
13986 /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
13987 Bool isD
= (size
& 1) == 1;
13988 IROp op
= isD
? Iop_Sqrt64Fx2
: Iop_Sqrt32Fx4
;
13989 if (bitQ
== 0 && isD
) return False
; // implied 1d case
13990 IRTemp resV
= newTempV128();
13991 assign(resV
, binop(op
, mkexpr(mk_get_IR_rounding_mode()),
13993 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, resV
));
13994 const HChar
* arr
= bitQ
== 0 ? "2s" : (size
== X11
? "2d" : "4s");
13995 DIP("%s %s.%s, %s.%s\n", "fsqrt",
13996 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
14006 Bool
dis_AdvSIMD_two_reg_misc_fp16(/*MB_OUT*/DisResult
* dres
, UInt insn
,
14007 const VexArchInfo
* archinfo
)
14009 /* This decode function only handles instructions with half-precision
14010 floating-point (fp16) operands.
14012 if ((archinfo
->hwcaps
& VEX_HWCAPS_ARM64_FP16
) == 0)
14015 /* 31 30 29 28 23 21 16 11 9 4
14016 0 Q U 01110 size 11100 opcode 10 n d
14017 Decode fields: U,size,opcode
14019 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14020 if (INSN(31,31) != 0
14021 || INSN(28,24) != BITS5(0,1,1,1,0)
14022 || INSN(21,17) != BITS5(1,1,1,0,0)
14023 || INSN(11,10) != BITS2(1,0)) {
14026 UInt bitQ
= INSN(30,30);
14027 UInt bitU
= INSN(29,29);
14028 UInt size
= INSN(23,22);
14029 UInt opcode
= INSN(16,12);
14030 UInt nn
= INSN(9,5);
14031 UInt dd
= INSN(4,0);
14034 if (size
== X11
&& opcode
== BITS5(0,1,1,1,1)) {
14035 /* -------- Q,0,11,01111: FABS 4h_4h, 8h_8h -------- */
14036 /* -------- Q,1,11,01111: FNEG 4h_4h, 8h_8h -------- */
14037 Bool isFNEG
= bitU
== 1;
14038 IROp op
= isFNEG
? Iop_Neg16Fx8
: Iop_Abs16Fx8
;
14039 IRTemp res
= newTempV128();
14040 assign(res
, unop(op
, getQReg128(nn
)));
14041 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
14042 const HChar
* arr
= bitQ
== 0 ? "4h" : "8h";
14043 DIP("%s %s.%s, %s.%s\n", isFNEG
? "fneg" : "fabs",
14044 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
14048 if (bitU
== 1 && size
== X11
&& opcode
== BITS5(1,1,1,1,1)) {
14049 /* -------- 1,11,11111: FSQRT 4h_4h, 8h_8h -------- */
14050 IRTemp resV
= newTempV128();
14051 assign(resV
, binop(Iop_Sqrt16Fx8
, mkexpr(mk_get_IR_rounding_mode()),
14053 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, resV
));
14054 const HChar
* arr
= bitQ
== 0 ? "4h" : "8h";
14055 DIP("%s %s.%s, %s.%s\n", "fsqrt",
14056 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
14060 /* Decoding FCM<condtion> based on opcode and bitU. ix used to select
14063 UInt ix
= 0; // Invalid <condition>
14065 case BITS5(0,1,1,0,1): ix
= (bitU
== 1) ? 4 : 1; break; // FCMLE=4,FCMEQ=1
14066 case BITS5(0,1,1,0,0): ix
= (bitU
== 1) ? 5 : 2; break; // FCMGE=5,FCMGT=2
14067 case BITS5(0,1,1,1,0): if (bitU
== 0) ix
= 3; break; // FCMLT=3
14071 /* -------- 0,01101 FCMEQ 4h_4h,8h_8h _#0.0 (ix 1) -------- */
14072 /* -------- 0,01100 FCMGT 4h_4h,8h_8h _#0.0 (ix 2) -------- */
14073 /* -------- 0,01110 FCMLT 4h_4h,8h_8h _#0.0 (ix 3) -------- */
14074 /* -------- 1,01101 FCMLE 4h_4h,8h_8h _#0.0 (ix 4) -------- */
14075 /* -------- 1,01100 FCMGE 4h_4h,8h_8h _#0.0 (ix 5) -------- */
14076 IROp opCmp
= Iop_INVALID
;
14078 const HChar
* nm
= "??";
14080 case 1: nm
= "fcmeq"; opCmp
= Iop_CmpEQ16Fx8
; break;
14081 case 2: nm
= "fcmgt"; opCmp
= Iop_CmpLT16Fx8
; swap
= True
; break;
14082 case 3: nm
= "fcmlt"; opCmp
= Iop_CmpLT16Fx8
; break;
14083 case 4: nm
= "fcmle"; opCmp
= Iop_CmpLE16Fx8
; break;
14084 case 5: nm
= "fcmge"; opCmp
= Iop_CmpLE16Fx8
; swap
= True
; break;
14085 default: vassert(0);
14087 IRExpr
* zero
= mkV128(0x0000);
14088 IRTemp res
= newTempV128();
14089 assign(res
, swap
? binop(opCmp
, zero
, getQReg128(nn
))
14090 : binop(opCmp
, getQReg128(nn
), zero
));
14091 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
14092 const HChar
* arr
= bitQ
== 0 ? "4h" : "8h";
14093 DIP("%s %s.%s, %s.%s, #0.0\n", nm
,
14094 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
);
14103 Bool
dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult
* dres
, UInt insn
)
14105 /* 31 28 23 21 20 19 15 11 9 4
14106 0 Q U 01111 size L M m opcode H 0 n d
14107 Decode fields are: u,size,opcode
14108 M is really part of the mm register number. Individual
14109 cases need to inspect L and H though.
14111 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14112 if (INSN(31,31) != 0
14113 || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) !=0) {
14116 UInt bitQ
= INSN(30,30);
14117 UInt bitU
= INSN(29,29);
14118 UInt size
= INSN(23,22);
14119 UInt bitL
= INSN(21,21);
14120 UInt bitM
= INSN(20,20);
14121 UInt mmLO4
= INSN(19,16);
14122 UInt opcode
= INSN(15,12);
14123 UInt bitH
= INSN(11,11);
14124 UInt nn
= INSN(9,5);
14125 UInt dd
= INSN(4,0);
14127 vassert(bitH
< 2 && bitM
< 2 && bitL
< 2);
14129 if (bitU
== 0 && size
>= X10
14130 && (opcode
== BITS4(0,0,0,1) || opcode
== BITS4(0,1,0,1))) {
14131 /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14132 /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14133 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
14134 Bool isD
= (size
& 1) == 1;
14135 Bool isSUB
= opcode
== BITS4(0,1,0,1);
14137 if (!isD
) index
= (bitH
<< 1) | bitL
;
14138 else if (isD
&& bitL
== 0) index
= bitH
;
14139 else return False
; // sz:L == x11 => unallocated encoding
14140 vassert(index
< (isD
? 2 : 4));
14141 IRType ity
= isD
? Ity_F64
: Ity_F32
;
14142 IRTemp elem
= newTemp(ity
);
14143 UInt mm
= (bitM
<< 4) | mmLO4
;
14144 assign(elem
, getQRegLane(mm
, index
, ity
));
14145 IRTemp dupd
= math_DUP_TO_V128(elem
, ity
);
14146 IROp opADD
= isD
? Iop_Add64Fx2
: Iop_Add32Fx4
;
14147 IROp opSUB
= isD
? Iop_Sub64Fx2
: Iop_Sub32Fx4
;
14148 IROp opMUL
= isD
? Iop_Mul64Fx2
: Iop_Mul32Fx4
;
14149 IRTemp rm
= mk_get_IR_rounding_mode();
14150 IRTemp t1
= newTempV128();
14151 IRTemp t2
= newTempV128();
14152 // FIXME: double rounding; use FMA primops instead
14153 assign(t1
, triop(opMUL
, mkexpr(rm
), getQReg128(nn
), mkexpr(dupd
)));
14154 assign(t2
, triop(isSUB
? opSUB
: opADD
,
14155 mkexpr(rm
), getQReg128(dd
), mkexpr(t1
)));
14156 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, t2
));
14157 const HChar
* arr
= bitQ
== 0 ? "2s" : (isD
? "2d" : "4s");
14158 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB
? "fmls" : "fmla",
14159 nameQReg128(dd
), arr
, nameQReg128(nn
), arr
, nameQReg128(mm
),
14160 isD
? 'd' : 's', index
);
14164 if (size
>= X10
&& opcode
== BITS4(1,0,0,1)) {
14165 /* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14166 /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14167 if (bitQ
== 0 && size
== X11
) return False
; // implied 1d case
14168 Bool isD
= (size
& 1) == 1;
14169 Bool isMULX
= bitU
== 1;
14171 if (!isD
) index
= (bitH
<< 1) | bitL
;
14172 else if (isD
&& bitL
== 0) index
= bitH
;
14173 else return False
; // sz:L == x11 => unallocated encoding
14174 vassert(index
< (isD
? 2 : 4));
14175 IRType ity
= isD
? Ity_F64
: Ity_F32
;
14176 IRTemp elem
= newTemp(ity
);
14177 UInt mm
= (bitM
<< 4) | mmLO4
;
14178 assign(elem
, getQRegLane(mm
, index
, ity
));
14179 IRTemp dupd
= math_DUP_TO_V128(elem
, ity
);
14180 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
14181 IRTemp res
= newTempV128();
14182 assign(res
, triop(isD
? Iop_Mul64Fx2
: Iop_Mul32Fx4
,
14183 mkexpr(mk_get_IR_rounding_mode()),
14184 getQReg128(nn
), mkexpr(dupd
)));
14185 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
14186 const HChar
* arr
= bitQ
== 0 ? "2s" : (isD
? "2d" : "4s");
14187 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
14188 isMULX
? "fmulx" : "fmul", nameQReg128(dd
), arr
,
14189 nameQReg128(nn
), arr
, nameQReg128(mm
), isD
? 'd' : 's', index
);
   if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
       || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
      /* -------- 1,xx,0000 MLA s/h variants only -------- */
      /* -------- 1,xx,0100 MLS s/h variants only -------- */
      /* -------- 0,xx,1000 MUL s/h variants only -------- */
      Bool isMLA = opcode == BITS4(0,0,0,0);
      Bool isMLS = opcode == BITS4(0,1,0,0);
      UInt mm    = 32; // invalid
      UInt ix    = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IROp   opMUL = mkVecMUL(size);
      IROp   opADD = mkVecADD(size);
      IROp   opSUB = mkVecSUB(size);
      HChar  ch    = size == X01 ? 'h' : 's';
      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      IRTemp vecD  = newTempV128();
      IRTemp vecN  = newTempV128();
      IRTemp res   = newTempV128();
      assign(vecD, getQReg128(dd));
      assign(vecN, getQReg128(nn));
      IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
      if (isMLA || isMLS) {
         assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
      } else {
         assign(res, prod);
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
                                                : (isMLS ? "mls" : "mul"),
          nameQReg128(dd), arr,
          nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
      return True;
   }
   if (opcode == BITS4(1,0,1,0)
       || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
      /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
      /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
      /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
      /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,0): ks = 0; break;
         case BITS4(0,0,1,0): ks = 1; break;
         case BITS4(0,1,1,0): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      Bool isU = bitU == 1;
      Bool is2 = bitQ == 1;
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN = newTempV128();
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      IRTemp vecD = newTempV128();
      assign(vecN, getQReg128(nn));
      assign(vecD, getQReg128(dd));
      IRTemp res = IRTemp_INVALID;
      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      HChar ch               = size == X01 ? 'h' : 's';
      DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
          isU ? 'u' : 's', nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
      return True;
   }
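   /* Illustrative note, not from the original decode logic: the "mas"[ks]
      argument above is just a compact way of passing the character 'm'
      (plain widening multiply, ks == 0), 'a' (accumulate, ks == 1) or 's'
      (subtract, ks == 2), which math_MULL_ACC appears to use to select the
      accumulation behaviour; for ks == 0 no accumulator register is needed,
      hence IRTemp_INVALID is passed in that case. */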
   if (bitU == 0
       && (opcode == BITS4(1,0,1,1)
           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,1): ks = 0; break;
         case BITS4(0,0,1,1): ks = 1; break;
         case BITS4(0,1,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      Bool is2 = bitQ == 1;
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_2(&vecN, &vecD);
      assign(vecN, getQReg128(nn));
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       is2, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifference(sat1q, sat1n);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifference(sat2q, sat2n);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      HChar ch               = size == X01 ? 'h' : 's';
      DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
          nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
      return True;
   }
   if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
      /* -------- 0,xx,1100 SQDMULH  s and h variants only -------- */
      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isR = opcode == BITS4(1,1,0,1);
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      vN = newTempV128();
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      HChar ch         = size == X01 ? 'h' : 's';
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
      return True;
   }
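   /* Illustrative note, not from the original decode logic:
      updateQCFLAGwithDifferenceZHI sets the sticky QC flag whenever the
      saturated (sat1q) and unsaturated (sat1n) results differ.  Passing
      Iop_ZeroHI64ofV128 when Q == 0 restricts that comparison to the low
      64 bits, so lanes the 64-bit form of the instruction does not write
      cannot spuriously set QC. */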
   if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
      /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vD, &vN);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));

      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      Bool isAdd = opcode == BITS4(1,1,0,1);
      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));

      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isAdd ? "sqrdmlah" : "sqrdmlsh";
      HChar ch         = size == X01 ? 'h' : 's';
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23   21    16     11 9 4
      0100 1110 size 10100 opcode 10 n d
      Decode fields are: size,opcode
      Size is always 00 in ARMv8, it appears.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (size == BITS2(0,0)
       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
      /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
      /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
      Bool   isD  = opcode == BITS5(0,0,1,0,1);
      IRTemp op1  = newTemp(Ity_V128);
      IRTemp op2  = newTemp(Ity_V128);
      IRTemp xord = newTemp(Ity_V128);
      IRTemp res  = newTemp(Ity_V128);
      void*        helper = isD ? &arm64g_dirtyhelper_AESD
                                : &arm64g_dirtyhelper_AESE;
      const HChar* hname  = isD ? "arm64g_dirtyhelper_AESD"
                                : "arm64g_dirtyhelper_AESE";
      assign(op1, getQReg128(dd));
      assign(op2, getQReg128(nn));
      assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
                              mkIRExprVec_3(
                                 IRExpr_VECRET(),
                                 unop(Iop_V128HIto64, mkexpr(xord)),
                                 unop(Iop_V128to64, mkexpr(xord)) ) );
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
          nameQReg128(dd), nameQReg128(nn));
      return True;
   }

   if (size == BITS2(0,0)
       && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
      /* -------- 00,00110: AESMC  Vd.16b, Vn.16b -------- */
      /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
      Bool   isI = opcode == BITS5(0,0,1,1,1);
      IRTemp src = newTemp(Ity_V128);
      IRTemp res = newTemp(Ity_V128);
      void*        helper = isI ? &arm64g_dirtyhelper_AESIMC
                                : &arm64g_dirtyhelper_AESMC;
      const HChar* hname  = isI ? "arm64g_dirtyhelper_AESIMC"
                                : "arm64g_dirtyhelper_AESMC";
      assign(src, getQReg128(nn));
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
                              mkIRExprVec_3(
                                 IRExpr_VECRET(),
                                 unop(Iop_V128HIto64, mkexpr(src)),
                                 unop(Iop_V128to64, mkexpr(src)) ) );
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
          nameQReg128(dd), nameQReg128(nn));
      return True;
   }

   return False;
#  undef INSN
}
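/* Illustrative note, not from the original: for AESE/AESD the architectural
   operation is AddRoundKey (an XOR of the two registers) followed by
   SubBytes/ShiftRows (or their inverses).  The XOR is done inline in IR
   above, and only the combined xord value is handed to the dirty helper,
   which performs the byte substitution and row shifting. */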
static
Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28   23 21 20 15 14  11 9 4
      0101 1110 sz 0  m  0  opc 00 n d
      Decode fields are: sz,opc
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
       || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt sz  = INSN(23,22);
   UInt mm  = INSN(20,16);
   UInt opc = INSN(14,12);
   UInt nn  = INSN(9,5);
   UInt dd  = INSN(4,0);
   if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
      /* -------- 00,000 SHA1C     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,001 SHA1P     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,010 SHA1M     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,011 SHA1SU0   Vd.4S, Vn.4S, Vm.4S -------- */
      /* -------- 00,100 SHA256H   Qd,    Qn,    Vm.4S -------- */
      /* -------- 00,101 SHA256H2  Qd,    Qn,    Vm.4S -------- */
      /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
      const HChar* inames[7]
         = { "sha1c", "sha1p", "sha1m", "sha1su0",
             "sha256h", "sha256h2", "sha256su1" };
      void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
         = { &arm64g_dirtyhelper_SHA1C,    &arm64g_dirtyhelper_SHA1P,
             &arm64g_dirtyhelper_SHA1M,    &arm64g_dirtyhelper_SHA1SU0,
             &arm64g_dirtyhelper_SHA256H,  &arm64g_dirtyhelper_SHA256H2,
             &arm64g_dirtyhelper_SHA256SU1 };
      const HChar* hnames[7]
         = { "arm64g_dirtyhelper_SHA1C",    "arm64g_dirtyhelper_SHA1P",
             "arm64g_dirtyhelper_SHA1M",    "arm64g_dirtyhelper_SHA1SU0",
             "arm64g_dirtyhelper_SHA256H",  "arm64g_dirtyhelper_SHA256H2",
             "arm64g_dirtyhelper_SHA256SU1" };
      IRTemp vD      = newTemp(Ity_V128);
      IRTemp vN      = newTemp(Ity_V128);
      IRTemp vM      = newTemp(Ity_V128);
      IRTemp vDhi    = newTemp(Ity_I64);
      IRTemp vDlo    = newTemp(Ity_I64);
      IRTemp vNhiPre = newTemp(Ity_I64);
      IRTemp vNloPre = newTemp(Ity_I64);
      IRTemp vNhi    = newTemp(Ity_I64);
      IRTemp vNlo    = newTemp(Ity_I64);
      IRTemp vMhi    = newTemp(Ity_I64);
      IRTemp vMlo    = newTemp(Ity_I64);
      assign(vD,      getQReg128(dd));
      assign(vN,      getQReg128(nn));
      assign(vM,      getQReg128(mm));
      assign(vDhi,    unop(Iop_V128HIto64, mkexpr(vD)));
      assign(vDlo,    unop(Iop_V128to64,   mkexpr(vD)));
      assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
      assign(vNloPre, unop(Iop_V128to64,   mkexpr(vN)));
      assign(vMhi,    unop(Iop_V128HIto64, mkexpr(vM)));
      assign(vMlo,    unop(Iop_V128to64,   mkexpr(vM)));
      /* Mask off any bits of the N register operand that aren't actually
         needed, so that Memcheck doesn't complain unnecessarily. */
      switch (opc) {
         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
            assign(vNhi, mkU64(0));
            assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
            break;
         case BITS3(0,1,1): case BITS3(1,0,0):
         case BITS3(1,0,1): case BITS3(1,1,0):
            assign(vNhi, mkexpr(vNhiPre));
            assign(vNlo, mkexpr(vNloPre));
            break;
         default:
            vassert(0);
      }
      IRTemp res = newTemp(Ity_V128);
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
                              mkIRExprVec_7(
                                 IRExpr_VECRET(),
                                 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
                                 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      switch (opc) {
         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
            DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         case BITS3(0,1,1): case BITS3(1,1,0):
            DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         case BITS3(1,0,0): case BITS3(1,0,1):
            DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         default:
            vassert(0);
      }
      return True;
   }

   return False;
#  undef INSN
}
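/* Illustrative note, not from the original: the masking above follows the
   operand shapes listed in the comments.  SHA1C/SHA1P/SHA1M take their
   second operand as a 32-bit Sn value, so only the low 32 bits of the N
   register are meaningful and the rest are zeroed to keep Memcheck quiet;
   the remaining four instructions consume the full Vn.4S value. */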
static
Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28   23 21    16  11 9 4
      0101 1110 sz 10100 opc 10 n d
      Decode fields are: sz,opc
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt sz  = INSN(23,22);
   UInt opc = INSN(16,12);
   UInt nn  = INSN(9,5);
   UInt dd  = INSN(4,0);
   if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
      /* -------- 00,00000 SHA1H     Sd,    Sn    -------- */
      /* -------- 00,00001 SHA1SU1   Vd.4S, Vn.4S -------- */
      /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
      const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
      IRTemp vD   = newTemp(Ity_V128);
      IRTemp vN   = newTemp(Ity_V128);
      IRTemp vDhi = newTemp(Ity_I64);
      IRTemp vDlo = newTemp(Ity_I64);
      IRTemp vNhi = newTemp(Ity_I64);
      IRTemp vNlo = newTemp(Ity_I64);
      assign(vD,   getQReg128(dd));
      assign(vN,   getQReg128(nn));
      assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
      assign(vDlo, unop(Iop_V128to64,   mkexpr(vD)));
      assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
      assign(vNlo, unop(Iop_V128to64,   mkexpr(vN)));
      /* Mask off any bits of the N register operand that aren't actually
         needed, so that Memcheck doesn't complain unnecessarily.  Also
         construct the calls, given that the helper functions don't take
         the same number of arguments. */
      IRDirty* di  = NULL;
      IRTemp   res = newTemp(Ity_V128);
      switch (opc) {
         case BITS5(0,0,0,0,0): {
            IRExpr* vNloMasked = unop(Iop_32Uto64,
                                      unop(Iop_64to32, mkexpr(vNlo)));
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA1H",
                                    &arm64g_dirtyhelper_SHA1H,
                                    mkIRExprVec_3(
                                       IRExpr_VECRET(),
                                       mkU64(0), vNloMasked) );
            break;
         }
         case BITS5(0,0,0,0,1):
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA1SU1",
                                    &arm64g_dirtyhelper_SHA1SU1,
                                    mkIRExprVec_5(
                                       IRExpr_VECRET(),
                                       mkexpr(vDhi), mkexpr(vDlo),
                                       mkexpr(vNhi), mkexpr(vNlo)) );
            break;
         case BITS5(0,0,0,1,0):
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA256SU0",
                                    &arm64g_dirtyhelper_SHA256SU0,
                                    mkIRExprVec_5(
                                       IRExpr_VECRET(),
                                       mkexpr(vDhi), mkexpr(vDlo),
                                       mkexpr(vNhi), mkexpr(vNlo)) );
            break;
         default:
            vassert(0);
      }
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      switch (opc) {
         case BITS5(0,0,0,0,0):
            DIP("%s s%u, s%u\n", inames[opc], dd, nn);
            break;
         case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
            DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
            break;
         default:
            vassert(0);
      }
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15 13   9 4
      000 11110 ty 1  m  op 1000 n opcode2
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields are: ty,op,opcode2
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
      return False;
   }
   UInt ty      = INSN(23,22);
   UInt mm      = INSN(20,16);
   UInt op      = INSN(15,14);
   UInt nn      = INSN(9,5);
   UInt opcode2 = INSN(4,0);

   if (ty <= X01 && op == X00
       && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
      /* -------- 0x,00,00000 FCMP  d_d,  s_s  -------- */
      /* -------- 0x,00,01000 FCMP  d_#0, s_#0 -------- */
      /* -------- 0x,00,10000 FCMPE d_d,  s_s  -------- */
      /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
      /* 31        23   20    15      9 4
         000 11110 01 1 m     00 1000 n 10 000  FCMPE Dn, Dm
         000 11110 01 1 00000 00 1000 n 11 000  FCMPE Dn, #0.0
         000 11110 01 1 m     00 1000 n 00 000  FCMP  Dn, Dm
         000 11110 01 1 00000 00 1000 n 01 000  FCMP  Dn, #0.0

         000 11110 00 1 m     00 1000 n 10 000  FCMPE Sn, Sm
         000 11110 00 1 00000 00 1000 n 11 000  FCMPE Sn, #0.0
         000 11110 00 1 m     00 1000 n 00 000  FCMP  Sn, Sm
         000 11110 00 1 00000 00 1000 n 01 000  FCMP  Sn, #0.0

         FCMPE generates Invalid Operation exn if either arg is any kind
         of NaN.  FCMP generates Invalid Operation exn if either arg is a
         signalling NaN.  We ignore this detail here and produce the same
         IR for both.
      */
      Bool   isD     = (ty & 1) == 1;
      Bool   isCMPE  = (opcode2 & 16) == 16;
      Bool   cmpZero = (opcode2 & 8) == 8;
      IRType ity     = isD ? Ity_F64 : Ity_F32;
      Bool   valid   = True;
      if (cmpZero && mm != 0) valid = False;
      if (valid) {
         IRTemp argL  = newTemp(ity);
         IRTemp argR  = newTemp(ity);
         IRTemp irRes = newTemp(Ity_I32);
         assign(argL, getQRegLO(nn, ity));
         assign(argR,
                cmpZero
                   ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
                   : getQRegLO(mm, ity));
         assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
                             mkexpr(argL), mkexpr(argR)));
         IRTemp nzcv      = mk_convert_IRCmpF64Result_to_NZCV(irRes);
         IRTemp nzcv_28x0 = newTemp(Ity_I64);
         assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
         setFlags_COPY(nzcv_28x0);
         DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
             cmpZero ? "#0.0" : nameQRegLO(mm, ity));
         return True;
      }
   }

   return False;
#  undef INSN
}
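/* Illustrative note, not from the original: the NZCV value built above
   lands in bits 31:28 of the flags word, hence the shift by 28.  For
   example, an FCMP of two equal, ordered values yields N=0 Z=1 C=1 V=0,
   i.e. 0x6 << 28 = 0x60000000, while an unordered compare (either operand
   a NaN) yields N=0 Z=0 C=1 V=1 = 0x30000000. */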
static
Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn,
                                         const VexArchInfo* archinfo,
                                         Bool sigill_diag)
{
   /* 31  28    23 21 20 15   11 9 4  3
      000 11110 ty 1  m  cond 01 n op nzcv
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields are: ty,op
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt mm   = INSN(20,16);
   UInt cond = INSN(15,12);
   UInt nn   = INSN(9,5);
   UInt op   = INSN(4,4);
   UInt nzcv = INSN(3,0);
   vassert(ty < 4 && op <= 1);

   /* -------- 00,0 FCCMP  s_s -------- */
   /* -------- 00,1 FCCMPE s_s -------- */
   /* -------- 01,0 FCCMP  d_d -------- */
   /* -------- 01,1 FCCMPE d_d -------- */
   /* -------- 11,0 FCCMP  h_h -------- */
   /* -------- 11,1 FCCMPE h_h -------- */

   /* FCCMPE generates Invalid Operation exn if either arg is any kind
      of NaN.  FCCMP generates Invalid Operation exn if either arg is a
      signalling NaN.  We ignore this detail here and produce the same
      IR for both. */
   Bool   isCMPE = op == 1;
   IRType ity    = Ity_INVALID;
   IROp   irop   = Iop_INVALID;
   if (ty == 0) {
      ity  = Ity_F32;
      irop = Iop_CmpF32;
   }
   else if (ty == 1) {
      ity  = Ity_F64;
      irop = Iop_CmpF64;
   }
   else if (ty == 3) {
      if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
         return False;
      ity  = Ity_F16;
      irop = Iop_CmpF16;
   }
   else {
      /* ty = 2 is an illegal encoding */
      if (sigill_diag)
         vex_printf("ARM64 front end: dis_AdvSIMD_fp_conditional_compare\n");
      return False;
   }
   IRTemp argL  = newTemp(ity);
   IRTemp argR  = newTemp(ity);
   IRTemp irRes = newTemp(Ity_I32);
   assign(argL,  getQRegLO(nn, ity));
   assign(argR,  getQRegLO(mm, ity));
   assign(irRes, binop(irop, mkexpr(argL), mkexpr(argR)));
   IRTemp condT = newTemp(Ity_I1);
   assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
   IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);

   IRTemp nzcvT_28x0 = newTemp(Ity_I64);
   assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));

   IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);

   IRTemp nzcv_28x0 = newTemp(Ity_I64);
   assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
                                mkexpr(nzcvT_28x0), nzcvF_28x0));
   setFlags_COPY(nzcv_28x0);
   DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
       nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
   return True;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23 21 20 15   11 9 5
      000 11110 ty 1  m  cond 11 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
       || INSN(11,10) != BITS2(1,1)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt mm   = INSN(20,16);
   UInt cond = INSN(15,12);
   UInt nn   = INSN(9,5);
   UInt dd   = INSN(4,0);
   if (ty <= X01) {
      /* -------- 00: FCSEL s_s -------- */
      /* -------- 01: FCSEL d_d -------- */
      IRType ity  = ty == X01 ? Ity_F64 : Ity_F32;
      IRTemp srcT = newTemp(ity);
      IRTemp srcF = newTemp(ity);
      IRTemp res  = newTemp(ity);
      assign(srcT, getQRegLO(nn, ity));
      assign(srcF, getQRegLO(mm, ity));
      assign(res, IRExpr_ITE(
                     unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
                     mkexpr(srcT), mkexpr(srcF)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fcsel %s, %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
          nameCC(cond));
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20     14    9 4
      000 11110 ty 1  opcode 10000 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
      return False;
   }
   UInt ty     = INSN(23,22);
   UInt opcode = INSN(20,15);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode <= BITS6(0,0,0,0,1,1)) {
      /* -------- 0x,000000: FMOV  d_d, s_s -------- */
      /* -------- 0x,000001: FABS  d_d, s_s, h_h --- */
      /* -------- 0x,000010: FNEG  d_d, s_s, h_h --- */
      /* -------- 0x,000011: FSQRT d_d, s_s, h_h --- */
      IRType ity = Ity_INVALID;
      if      (ty == X01) ity = Ity_F64;
      else if (ty == X00) ity = Ity_F32;
      else if (ty == X11) ity = Ity_F16;
      else return False;
      IRTemp src = newTemp(ity);
      IRTemp res = newTemp(ity);
      const HChar* nm = "??";
      assign(src, getQRegLO(nn, ity));
      switch (opcode) {
         case BITS6(0,0,0,0,0,0):
            nm = "fmov"; assign(res, mkexpr(src)); break;
         case BITS6(0,0,0,0,0,1):
            nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
         case BITS6(0,0,0,0,1,0):
            nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
         case BITS6(0,0,0,0,1,1):
            nm = "fsqrt";
            assign(res, binop(mkSQRTF(ity),
                              mkexpr(mk_get_IR_rounding_mode()),
                              mkexpr(src))); break;
         default:
            vassert(0);
      }
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
      return True;
   }

   if (   (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
                         || opcode == BITS6(0,0,0,1,0,1)))
       || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
                         || opcode == BITS6(0,0,0,1,0,1)))
       || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
                         || opcode == BITS6(0,0,0,1,0,0)))) {
      /* -------- 11,000100: FCVT s_h -------- */
      /* -------- 11,000101: FCVT d_h -------- */
      /* -------- 00,000111: FCVT h_s -------- */
      /* -------- 00,000101: FCVT d_s -------- */
      /* -------- 01,000111: FCVT h_d -------- */
      /* -------- 01,000100: FCVT s_d -------- */
      /* 31        23 21    16 14    9 4
         000 11110 11 10001 00 10000 n d   FCVT Sd, Hn
         --------- 11 ----- 01 ---------   FCVT Dd, Hn
         --------- 00 ----- 11 ---------   FCVT Hd, Sn
         --------- 00 ----- 01 ---------   FCVT Dd, Sn
         --------- 01 ----- 11 ---------   FCVT Hd, Dn
         --------- 01 ----- 00 ---------   FCVT Sd, Dn
         Rounding, when dst is smaller than src, is per the FPCR.
      */
      UInt b2322 = ty;
      UInt b1615 = opcode & BITS2(1,1);
      switch ((b2322 << 2) | b1615) {
         case BITS4(0,0,0,1):   // S -> D
         case BITS4(1,1,0,1): { // H -> D
            Bool   srcIsH = b2322 == BITS2(1,1);
            IRType srcTy  = srcIsH ? Ity_F16 : Ity_F32;
            IRTemp res    = newTemp(Ity_F64);
            assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
                             getQRegLO(nn, srcTy)));
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
            return True;
         }
         case BITS4(0,1,0,0):   // D -> S
         case BITS4(0,1,1,1): { // D -> H
            Bool   dstIsH = b1615 == BITS2(1,1);
            IRType dstTy  = dstIsH ? Ity_F16 : Ity_F32;
            IRTemp res    = newTemp(dstTy);
            assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
                              mkexpr(mk_get_IR_rounding_mode()),
                              getQRegLO(nn, Ity_F64)));
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
            return True;
         }
         case BITS4(0,0,1,1):   // S -> H
         case BITS4(1,1,0,0): { // H -> S
            Bool   toH   = b1615 == BITS2(1,1);
            IRType srcTy = toH ? Ity_F32 : Ity_F16;
            IRType dstTy = toH ? Ity_F16 : Ity_F32;
            IRTemp res   = newTemp(dstTy);
            if (toH) {
               assign(res, binop(Iop_F32toF16,
                                 mkexpr(mk_get_IR_rounding_mode()),
                                 getQRegLO(nn, srcTy)));
            } else {
               assign(res, unop(Iop_F16toF32,
                                getQRegLO(nn, srcTy)));
            }
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
            return True;
         }
         default:
            break;
      }
      /* else unhandled */
      return False;
   }

   if (ty <= X01
       && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
       && opcode != BITS6(0,0,1,1,0,1)) {
      /* -------- 0x,001000 FRINTN d_d, s_s -------- */
      /* -------- 0x,001001 FRINTP d_d, s_s -------- */
      /* -------- 0x,001010 FRINTM d_d, s_s -------- */
      /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
      /* -------- 0x,001100 FRINTA d_d, s_s -------- */
      /* -------- 0x,001110 FRINTX d_d, s_s -------- */
      /* -------- 0x,001111 FRINTI d_d, s_s -------- */
      /* 31        23 21   17  14    9 4
         000 11110 0x 1001 111 10000 n d  FRINTI Fd, Fm (round per FPCR)
         x==0 => S-registers, x==1 => D-registers
         rm (17:15) encodings:
            111 per FPCR (FRINTI)
            001 +inf     (FRINTP)
            010 -inf     (FRINTM)
            011 zero     (FRINTZ)
            000 tieeven  (FRINTN) -- !! FIXME KLUDGED !!
            100 tieaway  (FRINTA) -- !! FIXME KLUDGED !!
            110 per FPCR + "exact = TRUE" (FRINTX)
            101 unallocated
      */
      Bool    isD   = (ty & 1) == 1;
      UInt    rm    = opcode & BITS6(0,0,0,1,1,1);
      IRType  ity   = isD ? Ity_F64 : Ity_F32;
      IRExpr* irrmE = NULL;
      HChar   ch    = '?';
      switch (rm) {
         case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
         case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
         case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
         // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
         case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
         // I am unsure about the following, due to the "integral exact"
         // description in the manual.  What does it mean? (frintx, that is)
         case BITS3(1,1,0):
            ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
         case BITS3(1,1,1):
            ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
         // The following is a kludge.  There's no Irrm_ value to represent
         // this ("to nearest, with ties to even")
         case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
         default: break;
      }
      if (irrmE) {
         IRTemp src = newTemp(ity);
         IRTemp dst = newTemp(ity);
         assign(src, getQRegLO(nn, ity));
         assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           irrmE, mkexpr(src)));
         putQReg128(dd, mkV128(0x0000));
         putQRegLO(dd, mkexpr(dst));
         DIP("frint%c %s, %s\n",
             ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
         return True;
      }
      return False;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn,
                                        const VexArchInfo* archinfo)
{
   /* 31  28    23 21 20 15     11 9 4
      000 11110 ty 1  m  opcode 10 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty, opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt ty     = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
      /* ------- 0x,0000: FMUL d_d, s_s ------- */
      /* ------- 0x,0001: FDIV d_d, s_s ------- */
      /* ------- 0x,0010: FADD d_d, s_s ------- */
      /* ------- 0x,0011: FSUB d_d, s_s ------- */
      /* ------- 0x,0100: FMAX d_d, s_s ------- */
      /* ------- 0x,0101: FMIN d_d, s_s ------- */
      /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
      /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
      IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
      IROp   iop = Iop_INVALID;
      const HChar* nm = "???";
      switch (opcode) {
         case BITS4(0,0,0,0): nm = "fmul";   iop = mkMULF(ity); break;
         case BITS4(0,0,0,1): nm = "fdiv";   iop = mkDIVF(ity); break;
         case BITS4(0,0,1,0): nm = "fadd";   iop = mkADDF(ity); break;
         case BITS4(0,0,1,1): nm = "fsub";   iop = mkSUBF(ity); break;
         case BITS4(0,1,0,0): nm = "fmax";   iop = mkVecMAXF(ty+2); break;
         case BITS4(0,1,0,1): nm = "fmin";   iop = mkVecMINF(ty+2); break;
         case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
         case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
         default: vassert(0);
      }
      if (opcode <= BITS4(0,0,1,1)) {
         // This is really not good code.  TODO: avoid width-changing
         IRTemp res = newTemp(ity);
         assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
                                getQRegLO(nn, ity), getQRegLO(mm, ity)));
         putQReg128(dd, mkV128(0));
         putQRegLO(dd, mkexpr(res));
      } else {
         putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
                             binop(iop, getQReg128(nn), getQReg128(mm))));
      }
      DIP("%s %s, %s, %s\n",
          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
      /* ------- 0x,1000: FNMUL d_d, s_s ------- */
      IRType ity  = ty == X00 ? Ity_F32 : Ity_F64;
      IROp   iop  = mkMULF(ity);
      IROp   iopn = mkNEGF(ity);
      const HChar* nm = "fnmul";
      IRExpr* resE = unop(iopn,
                          triop(iop, mkexpr(mk_get_IR_rounding_mode()),
                                getQRegLO(nn, ity), getQRegLO(mm, ity)));
      IRTemp  res  = newTemp(ity);
      assign(res, resE);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, mkexpr(res));
      DIP("%s %s, %s, %s\n",
          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (ty == X11 && opcode <= BITS4(0,0,1,0)) {
      /* ------- 11,0010: FADD h_h ------- */
      if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
         return False;
      IRTemp res = newTemp(Ity_F16);
      assign(res, triop(mkADDF(Ity_F16), mkexpr(mk_get_IR_rounding_mode()),
                        getQRegLO(nn, Ity_F16), getQRegLO(mm, Ity_F16)));
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, mkexpr(res));
      DIP("fadd %s, %s, %s\n",
          nameQRegLO(dd, Ity_F16), nameQRegLO(nn, Ity_F16),
          nameQRegLO(mm, Ity_F16));
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15 14 9 4
      000 11111 ty o1 m  o0 a  n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty,o1,o0
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
      return False;
   }
   UInt ty    = INSN(23,22);
   UInt bitO1 = INSN(21,21);
   UInt mm    = INSN(20,16);
   UInt bitO0 = INSN(15,15);
   UInt aa    = INSN(14,10);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (ty <= X01) {
      /* -------- 0x,0,0 FMADD  d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,0,1 FMSUB  d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
      /* -------------------- F{N}M{ADD,SUB} -------------------- */
      /* 31          22   20 15 14 9 4   ix
         000 11111 0 sz 0 m  0  a  n d   0   FMADD  Fd,Fn,Fm,Fa
         000 11111 0 sz 0 m  1  a  n d   1   FMSUB  Fd,Fn,Fm,Fa
         000 11111 0 sz 1 m  0  a  n d   2   FNMADD Fd,Fn,Fm,Fa
         000 11111 0 sz 1 m  1  a  n d   3   FNMSUB Fd,Fn,Fm,Fa
         where Fx=Dx when sz=1, Fx=Sx when sz=0

                  -----SPEC------    ----IMPL----
         fmadd       a +    n * m    fmadd (a, n, m)
         fmsub       a + (-n) * m    fmsub (a, n, m)
         fnmadd   (-a) + (-n) * m    fmadd (-a, -n, m)
         fnmsub   (-a) +    n * m    fmadd (-a, n, m)

         Note Iop_MAdd/SubF32/64 take arguments in the order: rm, N, M, A
      */
      Bool    isD     = (ty & 1) == 1;
      UInt    ix      = (bitO1 << 1) | bitO0;
      IRType  ity     = isD ? Ity_F64 : Ity_F32;
      IROp    opFMADD = mkFMADDF(ity);
      IROp    opFMSUB = mkFMSUBF(ity);
      IROp    opNEG   = mkNEGF(ity);
      IRTemp  res     = newTemp(ity);
      IRExpr* eA      = getQRegLO(aa, ity);
      IRExpr* eN      = getQRegLO(nn, ity);
      IRExpr* eM      = getQRegLO(mm, ity);
      IRExpr* rm      = mkexpr(mk_get_IR_rounding_mode());
      switch (ix) {
         case 0: /* FMADD */
            assign(res, qop(opFMADD, rm, eN, eM, eA));
            break;
         case 1: /* FMSUB */
            assign(res, qop(opFMSUB, rm, eN, eM, eA));
            break;
         case 2: /* FNMADD */
            assign(res, qop(opFMADD, rm, unop(opNEG, eN), eM,
                            unop(opNEG, eA)));
            break;
         case 3: /* FNMSUB */
            assign(res, qop(opFMADD, rm, eN, eM, unop(opNEG, eA)));
            break;
         default:
            vassert(0);
      }
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
      DIP("%s %s, %s, %s, %s\n",
          names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
          nameQRegLO(mm, ity), nameQRegLO(aa, ity));
      return True;
   }

   return False;
#  undef INSN
}
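/* Illustrative note, not from the original: a quick sanity check of the
   SPEC/IMPL table above, with a = 1, n = 2, m = 3:
      fnmadd: (-1) + (-2)*3 = -7, and fmadd(-a,-n,m) = (-1) + (-2)*3 = -7
      fnmsub: (-1) +   2 *3 =  5, and fmadd(-a, n,m) = (-1) +   2 *3 =  5
   so negating N and/or A, as done in cases 2 and 3, reproduces the
   architecturally specified results. */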
static
Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20   12  9    4
      000 11110 ty 1  imm8 100 imm5 d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty,imm5
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt imm8 = INSN(20,13);
   UInt imm5 = INSN(9,5);
   UInt dd   = INSN(4,0);

   /* ------- 00,00000: FMOV s_imm ------- */
   /* ------- 01,00000: FMOV d_imm ------- */
   if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
      Bool  isD = (ty & 1) == 1;
      ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
      if (!isD) {
         vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
      }
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
      DIP("fmov %s, #0x%llx\n",
          nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20    18     15    9 4
      sf  0  0 11110 type  0 rmode opcode scale n d
      The first 3 bits are really "sf 0 S", but S is always zero.
      Decode fields: sf,type,rmode,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(30,29) != BITS2(0,0)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 0) {
      return False;
   }
   UInt bitSF = INSN(31,31);
   UInt ty    = INSN(23,22); // type
   UInt rm    = INSN(20,19); // rmode
   UInt op    = INSN(18,16); // opcode
   UInt sc    = INSN(15,10); // scale
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (ty <= X01 && rm == X11
       && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
      /* -------- (ix) sf ty rm opc -------- */
      /* -------- 0    0  00 11 000: FCVTZS w_s_#fbits -------- */
      /* -------- 1    0  01 11 000: FCVTZS w_d_#fbits -------- */
      /* -------- 2    1  00 11 000: FCVTZS x_s_#fbits -------- */
      /* -------- 3    1  01 11 000: FCVTZS x_d_#fbits -------- */

      /* -------- 4    0  00 11 001: FCVTZU w_s_#fbits -------- */
      /* -------- 5    0  01 11 001: FCVTZU w_d_#fbits -------- */
      /* -------- 6    1  00 11 001: FCVTZU x_s_#fbits -------- */
      /* -------- 7    1  01 11 001: FCVTZU x_d_#fbits -------- */
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);

      Int fbits = 64 - sc;
      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));

      Double  scale  = two_to_the_plus(fbits);
      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
                             : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;

      const IROp ops[8]
         = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
             Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
      IRTemp irrm = newTemp(Ity_I32);
      assign(irrm, mkU32(Irrm_ZERO));

      IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
      IRExpr* res = binop(ops[ix], mkexpr(irrm),
                          triop(opMUL, mkexpr(irrm), src, scaleE));
      putIRegOrZR(isI64, dd, res);

      DIP("fcvtz%c %s, %s, #%d\n",
          isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
          nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
      return True;
   }

   /* ------ sf,ty,rm,opc ------ */
   /* ------ x,0x,00,010  SCVTF s/d, w/x, #fbits  ------ */
   /* ------ x,0x,00,011  UCVTF s/d, w/x, #fbits  ------ */
   /* (ix) sf  S 28    ty   rm  opc   15    9 4
      0    0 0 0 11110 00 0 00  010   scale n d  SCVTF Sd, Wn, #fbits
      1    0 0 0 11110 01 0 00  010   scale n d  SCVTF Dd, Wn, #fbits
      2    1 0 0 11110 00 0 00  010   scale n d  SCVTF Sd, Xn, #fbits
      3    1 0 0 11110 01 0 00  010   scale n d  SCVTF Dd, Xn, #fbits

      4    0 0 0 11110 00 0 00  011   scale n d  UCVTF Sd, Wn, #fbits
      5    0 0 0 11110 01 0 00  011   scale n d  UCVTF Dd, Wn, #fbits
      6    1 0 0 11110 00 0 00  011   scale n d  UCVTF Sd, Xn, #fbits
      7    1 0 0 11110 01 0 00  011   scale n d  UCVTF Dd, Xn, #fbits

      These are signed/unsigned conversion from integer registers to
      FP registers, all 4 32/64-bit combinations, rounded per FPCR,
      scaled per |scale|.
   */
   if (ty <= X01 && rm == X00
       && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
       && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);

      Int fbits = 64 - sc;
      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));

      Double  scale  = two_to_the_minus(fbits);
      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
                             : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;

      const IROp ops[8]
         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
      IRExpr* src = getIRegOrZR(isI64, nn);
      IRExpr* res = (isF64 && !isI64)
                       ? unop(ops[ix], src)
                       : binop(ops[ix],
                               mkexpr(mk_get_IR_rounding_mode()), src);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));

      DIP("%ccvtf %s, %s, #%d\n",
          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
          nameIRegOrZR(isI64, nn), fbits);
      return True;
   }

   return False;
#  undef INSN
}
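/* Illustrative note, not from the original: the #fbits scaling works by
   pre- or post-multiplying with a power of two.  For instance
   FCVTZS Wd, Sn, #8 applied to Sn = 1.5 computes 1.5 * 2^8 = 384.0 and
   then truncates towards zero, giving Wd = 384; SCVTF Sd, Wn, #8 applied
   to Wn = 384 converts to 384.0 and multiplies by 2^-8, giving back 1.5. */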
static
Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20    18     15     9 4
      sf  0  0 11110 type  1 rmode opcode 000000 n d
      The first 3 bits are really "sf 0 S", but S is always zero.
      Decode fields: sf,type,rmode,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(30,29) != BITS2(0,0)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
      return False;
   }
   UInt bitSF = INSN(31,31);
   UInt ty    = INSN(23,22); // type
   UInt rm    = INSN(20,19); // rmode
   UInt op    = INSN(18,16); // opcode
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
   /*    30       23   20 18  15     9 4
      sf 00 11110 0x 1 00 000 000000 n d  FCVTNS Rd, Fn (round to
      sf 00 11110 0x 1 00 001 000000 n d  FCVTNU Rd, Fn  nearest)
      ---------------- 01 --------------  FCVTP-------- (round to +inf)
      ---------------- 10 --------------  FCVTM-------- (round to -inf)
      ---------------- 11 --------------  FCVTZ-------- (round to zero)
      ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
      ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)

      Rd is Xd when sf==1, Wd when sf==0
      Fn is Dn when x==1, Sn when x==0
      20:19 carry the rounding mode, using the same encoding as FPCR
   */
   if (ty <= X01
       && (    ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
            || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
          )
      ) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      /* Decide on the IR rounding mode to use. */
      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
         switch (rm) {
            case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
            case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
            case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
            case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
            default: vassert(0);
         }
      } else {
         vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
         switch (rm) {
            case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
            default: vassert(0);
         }
      }
      vassert(irrm != 8);
      /* Decide on the conversion primop, based on the source size,
         dest size and signedness (8 possibilities). */
      UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
      const IROp iops[8]
         = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
             Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
      IROp iop = iops[ix];
      // A bit of ATCery: bounce all cases we haven't seen an example of.
      if (/* F32toI32S */
             (iop == Iop_F32toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */

          || (iop == Iop_F32toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */

          || (iop == Iop_F32toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */

          || (iop == Iop_F32toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */

          || (iop == Iop_F64toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */

          || (iop == Iop_F64toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */

          || (iop == Iop_F64toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */

          || (iop == Iop_F64toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
         ) {
         /* validated */
      } else {
         return False;
      }
      IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
      IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
      IRTemp src   = newTemp(srcTy);
      IRTemp dst   = newTemp(dstTy);
      assign(src, getQRegLO(nn, srcTy));
      assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
      putIRegOrZR(isI64, dd, mkexpr(dst));
      DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
          nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
      return True;
   }

   /* -------------- {S,U}CVTF (scalar, integer) -------------- */
   /* (ix) sf  S 28    ty   rm op  15     9 4
      0    0 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Wn
      1    0 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Wn
      2    1 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Xn
      3    1 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Xn

      4    0 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Wn
      5    0 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Wn
      6    1 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Xn
      7    1 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Xn

      These are signed/unsigned conversion from integer registers to
      FP registers, all 4 32/64-bit combinations, rounded per FPCR.
   */
   if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
      const IROp ops[8]
         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
      IRExpr* src = getIRegOrZR(isI64, nn);
      IRExpr* res = (isF64 && !isI64)
                       ? unop(ops[ix], src)
                       : binop(ops[ix],
                               mkexpr(mk_get_IR_rounding_mode()), src);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, res);
      DIP("%ccvtf %s, %s\n",
          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
          nameIRegOrZR(isI64, nn));
      return True;
   }

   /* -------- FMOV (general) -------- */
   /* case sf  S       ty   rm op  15     9 4
       (1) 0 0 0 11110 00 1 00 111 000000 n d  FMOV Sd,      Wn
       (2) 1 0 0 11110 01 1 00 111 000000 n d  FMOV Dd,      Xn
       (3) 1 0 0 11110 10 1 01 111 000000 n d  FMOV Vd.D[1], Xn

       (4) 0 0 0 11110 00 1 00 110 000000 n d  FMOV Wd, Sn
       (5) 1 0 0 11110 01 1 00 110 000000 n d  FMOV Xd, Dn
       (6) 1 0 0 11110 10 1 01 110 000000 n d  FMOV Xd, Vn.D[1]
   */
   {
      UInt ix = 0; // case
      if (bitSF == 0) {
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 1;
         else
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 4;
      } else {
         vassert(bitSF == 1);
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 2;
         else
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 5;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
            ix = 3;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
            ix = 6;
      }
      if (ix > 0) {
         switch (ix) {
            case 1:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg32orZR(nn));
               DIP("fmov s%u, w%u\n", dd, nn);
               break;
            case 2:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg64orZR(nn));
               DIP("fmov d%u, x%u\n", dd, nn);
               break;
            case 3:
               putQRegHI64(dd, getIReg64orZR(nn));
               DIP("fmov v%u.d[1], x%u\n", dd, nn);
               break;
            case 4:
               putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
               DIP("fmov w%u, s%u\n", dd, nn);
               break;
            case 5:
               putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
               DIP("fmov x%u, d%u\n", dd, nn);
               break;
            case 6:
               putIReg64orZR(dd, getQRegHI64(nn));
               DIP("fmov x%u, v%u.d[1]\n", dd, nn);
               break;
            default:
               vassert(0);
         }
         return True;
      }
      /* undecodable; fall through */
   }

   return False;
#  undef INSN
}
static
Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
                           const VexArchInfo* archinfo, Bool sigill_diag)
{
   Bool ok;
   ok = dis_AdvSIMD_EXT(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_TBL_TBX(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_across_lanes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_modified_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_pairwise(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same_extra(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_two_reg_misc_fp16(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same_extra(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same_fp16(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_two_reg_misc_fp16(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_aes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_compare(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_compare(dres, insn, archinfo, sigill_diag);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   return False;
}
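/* Illustrative note, not from the original: each dis_AdvSIMD_* helper above
   begins by checking the fixed bits of its instruction class and returns
   False without emitting any IR when they do not match.  The expectation is
   therefore that at most one helper accepts a given insn, so the order of
   the calls is a matter of decode speed rather than correctness. */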
15774 /*------------------------------------------------------------*/
15775 /*--- Disassemble a single ARM64 instruction ---*/
15776 /*------------------------------------------------------------*/
15778 /* Disassemble a single ARM64 instruction into IR. The instruction
15779 has is located at |guest_instr| and has guest IP of
15780 |guest_PC_curr_instr|, which will have been set before the call
15781 here. Returns True iff the instruction was decoded, in which case
15782 *dres will be set accordingly, or False, in which case *dres should
15783 be ignored by the caller. */
15786 Bool
disInstr_ARM64_WRK (
15787 /*MB_OUT*/DisResult
* dres
,
15788 const UChar
* guest_instr
,
15789 const VexArchInfo
* archinfo
,
15790 const VexAbiInfo
* abiinfo
,
15794 // A macro to fish bits out of 'insn'.
15795 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15797 //ZZ DisResult dres;
15799 //ZZ //Bool allow_VFP = False;
15800 //ZZ //UInt hwcaps = archinfo->hwcaps;
15801 //ZZ IRTemp condT; /* :: Ity_I32 */
15803 //ZZ HChar dis_buf[128]; // big enough to hold LDMIA etc text
15805 //ZZ /* What insn variants are we supporting today? */
15806 //ZZ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
15809 /* Set result defaults. */
15810 dres
->whatNext
= Dis_Continue
;
15812 dres
->jk_StopHere
= Ijk_INVALID
;
15813 dres
->hint
= Dis_HintNone
;
15815 /* At least this is simple on ARM64: insns are all 4 bytes long, and
15816 4-aligned. So just fish the whole thing out of memory right now
15818 UInt insn
= getUIntLittleEndianly( guest_instr
);
15820 if (0) vex_printf("insn: 0x%x\n", insn
);
15822 DIP("\t(arm64) 0x%llx: ", (ULong
)guest_PC_curr_instr
);
15824 vassert(0 == (guest_PC_curr_instr
& 3ULL));
15826 /* ----------------------------------------------------------- */
15828 /* Spot "Special" instructions (see comment at top of file). */
15830 const UChar
* code
= guest_instr
;
15831 /* Spot the 16-byte preamble:
15832 93CC0D8C ror x12, x12, #3
15833 93CC358C ror x12, x12, #13
15834 93CCCD8C ror x12, x12, #51
15835 93CCF58C ror x12, x12, #61
15837 UInt word1
= 0x93CC0D8C;
15838 UInt word2
= 0x93CC358C;
15839 UInt word3
= 0x93CCCD8C;
15840 UInt word4
= 0x93CCF58C;
15841 if (getUIntLittleEndianly(code
+ 0) == word1
&&
15842 getUIntLittleEndianly(code
+ 4) == word2
&&
15843 getUIntLittleEndianly(code
+ 8) == word3
&&
15844 getUIntLittleEndianly(code
+12) == word4
) {
15845 /* Got a "Special" instruction preamble. Which one is it? */
15846 if (getUIntLittleEndianly(code
+16) == 0xAA0A014A
15847 /* orr x10,x10,x10 */) {
15848 /* X3 = client_request ( X4 ) */
15849 DIP("x3 = client_request ( x4 )\n");
15850 putPC(mkU64( guest_PC_curr_instr
+ 20 ));
15851 dres
->jk_StopHere
= Ijk_ClientReq
;
15852 dres
->whatNext
= Dis_StopHere
;
15856 if (getUIntLittleEndianly(code
+16) == 0xAA0B016B
15857 /* orr x11,x11,x11 */) {
15858 /* X3 = guest_NRADDR */
15859 DIP("x3 = guest_NRADDR\n");
15861 putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR
, Ity_I64
));
15865 if (getUIntLittleEndianly(code
+16) == 0xAA0C018C
15866 /* orr x12,x12,x12 */) {
15867 /* branch-and-link-to-noredir X8 */
15868 DIP("branch-and-link-to-noredir x8\n");
15869 putIReg64orZR(30, mkU64(guest_PC_curr_instr
+ 20));
15870 putPC(getIReg64orZR(8));
15871 dres
->jk_StopHere
= Ijk_NoRedir
;
15872 dres
->whatNext
= Dis_StopHere
;
15876 if (getUIntLittleEndianly(code
+16) == 0xAA090129
15877 /* orr x9,x9,x9 */) {
15879 DIP("IR injection\n");
15880 vex_inject_ir(irsb
, Iend_LE
);
15881 // Invalidate the current insn. The reason is that the IRop we're
15882 // injecting here can change. In which case the translation has to
15883 // be redone. For ease of handling, we simply invalidate all the
15885 stmt(IRStmt_Put(OFFB_CMSTART
, mkU64(guest_PC_curr_instr
)));
15886 stmt(IRStmt_Put(OFFB_CMLEN
, mkU64(20)));
15887 putPC(mkU64( guest_PC_curr_instr
+ 20 ));
15888 dres
->whatNext
= Dis_StopHere
;
15889 dres
->jk_StopHere
= Ijk_InvalICache
;
15892 /* We don't know what it is. */
15898 /* ----------------------------------------------------------- */
15900 /* Main ARM64 instruction decoder starts here. */
15904 /* insn[28:25] determines the top-level grouping, so let's start
15907 For all of these dis_ARM64_ functions, we pass *dres with the
15908 normal default results "insn OK, 4 bytes long, keep decoding" so
15909 they don't need to change it. However, decodes of control-flow
15910 insns may cause *dres to change.
   switch (INSN(28,25)) {
      case BITS4(1,0,0,0): case BITS4(1,0,0,1):
         // Data processing - immediate
         ok = dis_ARM64_data_processing_immediate(dres, insn, sigill_diag);
         break;
      case BITS4(1,0,1,0): case BITS4(1,0,1,1):
         // Branch, exception generation and system instructions
         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,0): case BITS4(0,1,1,0):
      case BITS4(1,1,0,0): case BITS4(1,1,1,0):
         // Loads and stores
         ok = dis_ARM64_load_store(dres, insn, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,1): case BITS4(1,1,0,1):
         // Data processing - register
         ok = dis_ARM64_data_processing_register(dres, insn, sigill_diag);
         break;
      case BITS4(0,1,1,1): case BITS4(1,1,1,1):
         // Data processing - SIMD and floating point
         ok = dis_ARM64_simd_and_fp(dres, insn, archinfo, sigill_diag);
         break;
      case BITS4(0,0,0,0): case BITS4(0,0,0,1):
      case BITS4(0,0,1,0): case BITS4(0,0,1,1):
         // Unallocated encoding group; leave |ok| as False
         break;
      default:
         vassert(0); /* Can't happen */
   }
   /* If the next-level down decoders failed, make sure |dres| didn't
      get changed. */
   if (!ok) {
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
   }

   return ok;
}
/*------------------------------------------------------------*/
/*--- Top-level fn                                         ---*/
/*------------------------------------------------------------*/

/* Disassemble a single instruction into IR.  The instruction
   is located in host memory at &guest_code[delta]. */

DisResult disInstr_ARM64 ( IRSB*        irsb_IN,
                           const UChar* guest_code_IN,
                           Long         delta_IN,
                           Addr         guest_IP,
                           VexArch      guest_arch,
                           const VexArchInfo* archinfo,
                           const VexAbiInfo*  abiinfo,
                           VexEndness   host_endness_IN,
                           Bool         sigill_diag_IN )
{
   DisResult dres;
   vex_bzero(&dres, sizeof(dres));

   /* Set globals (see top of this file) */
   vassert(guest_arch == VexArchARM64);

   irsb                = irsb_IN;
   host_endness        = host_endness_IN;
   guest_PC_curr_instr = (Addr64)guest_IP;
   /* Sanity checks */
   /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
   vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
   vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
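   /* Because both fields are unsigned, x - 2 wraps to a large value when
      x < 2, so each single comparison enforces 2 <= x <= 17, i.e. cache
      line sizes from 4 bytes up to 128KB. */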
   /* Try to decode */
   Bool ok = disInstr_ARM64_WRK( &dres,
                                 &guest_code_IN[delta_IN],
                                 archinfo, abiinfo, sigill_diag_IN );
   if (ok) {
      /* All decode successes end up here. */
      vassert(dres.len == 4 || dres.len == 20);
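      /* len is 4 for a normal insn, or 20 for one of the "Special"
         preamble sequences handled in disInstr_ARM64_WRK. */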
      switch (dres.whatNext) {
         case Dis_Continue:
            putPC( mkU64(dres.len + guest_PC_curr_instr) );
            break;
         case Dis_StopHere:
            break;
         default:
            vassert(0);
      }
   }
   else {
      /* All decode failures end up here. */
      if (sigill_diag_IN) {
         Int   i, j;
         HChar buf[64];
         UInt  insn = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
         vex_bzero(buf, sizeof(buf));
         for (i = j = 0; i < 32; i++) {
            if (i > 0) {
               if ((i & 7) == 0) buf[j++] = ' ';
               else if ((i & 3) == 0) buf[j++] = '\'';
            }
            buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
         }
         vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
         vex_printf("disInstr(arm64): %s\n", buf);
      }

      /* Tell the dispatcher that this insn cannot be decoded, and so
         has not been executed, and (is currently) the next to be
         executed.  PC should be up-to-date since it is made so at the
         start of each insn, but nevertheless be paranoid and update
         it again right now. */
      putPC( mkU64(guest_PC_curr_instr) );
      dres.len         = 0;
      dres.whatNext    = Dis_StopHere;
      dres.jk_StopHere = Ijk_NoDecode;
   }

   return dres;
}
/*--------------------------------------------------------------------*/
/*--- end                                       guest_arm64_toIR.c ---*/
/*--------------------------------------------------------------------*/