2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_emnote.h"
36 #include "libvex_guest_amd64.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
40 #include "main_util.h"
41 #include "main_globals.h"
42 #include "guest_generic_bb_to_IR.h"
43 #include "guest_amd64_defs.h"
44 #include "guest_generic_x87.h"
47 /* This file contains helper functions for amd64 guest code.
48 Calls to these functions are generated by the back end.
49 These calls are of course in the host machine code and
50 this file will be compiled to host machine code, so that
51 all makes sense.
53 Only change the signatures of these helper functions very
54 carefully. If you change the signature here, you'll have to change
55 the parameters passed to it in the IR calls constructed by
56 guest-amd64/toIR.c.
58 The convention used is that all functions called from generated
59 code are named amd64g_<something>, and any function whose name lacks
60 that prefix is not called from generated code. Note that some
61 LibVEX_* functions can however be called by VEX's client, but that
62 is not the same as calling them from VEX-generated code.
66 /* Set to 1 to get detailed profiling info about use of the flag
67 machinery. */
68 #define PROFILE_RFLAGS 0
71 /*---------------------------------------------------------------*/
72 /*--- %rflags run-time helpers. ---*/
73 /*---------------------------------------------------------------*/
75 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
76 after imulq/mulq. */
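/* Both routines below split each operand into 32-bit halves and use the
   schoolbook identity
      (2^32*u1 + u0) * (2^32*v1 + v0)
         = 2^64*(u1*v1) + 2^32*(u1*v0 + u0*v1) + (u0*v0)
   to obtain the high 64 bits of the 128-bit product; the low 64 bits are
   simply the truncated 64-bit product, computed directly at the end. */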
78 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
80 const Long halfMask = 0xFFFFFFFFLL;
81 ULong u0, v0, w0;
82 Long u1, v1, w1, w2, t;
83 u0 = u & halfMask;
84 u1 = u >> 32;
85 v0 = v & halfMask;
86 v1 = v >> 32;
87 w0 = u0 * v0;
88 t = u1 * v0 + (w0 >> 32);
89 w1 = t & halfMask;
90 w2 = t >> 32;
91 w1 = u0 * v1 + w1;
92 *rHi = u1 * v1 + w2 + (w1 >> 32);
93 *rLo = (Long)((ULong)u * (ULong)v);
96 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
98 const ULong halfMask = 0xFFFFFFFFULL;
99 ULong u0, v0, w0;
100 ULong u1, v1, w1,w2,t;
101 u0 = u & halfMask;
102 u1 = u >> 32;
103 v0 = v & halfMask;
104 v1 = v >> 32;
105 w0 = u0 * v0;
106 t = u1 * v0 + (w0 >> 32);
107 w1 = t & halfMask;
108 w2 = t >> 32;
109 w1 = u0 * v1 + w1;
110 *rHi = u1 * v1 + w2 + (w1 >> 32);
111 *rLo = u * v;
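/* Lookup table for the parity flag: parity_table[b] is AMD64G_CC_MASK_P
   when the byte b contains an even number of 1 bits, and 0 otherwise,
   matching the hardware definition of PF (which considers only the low
   8 bits of a result). */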
115 static const UChar parity_table[256] = {
116 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
117 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
118 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
119 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
120 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
123 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
124 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
125 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
126 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
127 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
128 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
131 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
132 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
135 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
136 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
139 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
140 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
141 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
142 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
143 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
144 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
147 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
150 /* generalised left-shifter */
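/* A negative shift amount means a right shift by -n.  The flag macros
   below rely on this to move a result's top bit into the SF (bit 7) or
   OF (bit 11) position with a single call, whatever DATA_BITS is. */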
151 static inline Long lshift ( Long x, Int n )
153 if (n >= 0)
154 return (ULong)x << n;
155 else
156 return x >> (-n);
159 /* identity on ULong */
160 static inline ULong idULong ( ULong x )
162 return x;
166 #define PREAMBLE(__data_bits) \
167 /* const */ ULong DATA_MASK \
168 = __data_bits==8 \
169 ? 0xFFULL \
170 : (__data_bits==16 \
171 ? 0xFFFFULL \
172 : (__data_bits==32 \
173 ? 0xFFFFFFFFULL \
174 : 0xFFFFFFFFFFFFFFFFULL)); \
175 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
176 /* const */ ULong CC_DEP1 = cc_dep1_formal; \
177 /* const */ ULong CC_DEP2 = cc_dep2_formal; \
178 /* const */ ULong CC_NDEP = cc_ndep_formal; \
179 /* Four bogus assignments, which hopefully gcc can */ \
180 /* optimise away, and which stop it complaining about */ \
181 /* unused variables. */ \
182 SIGN_MASK = SIGN_MASK; \
183 DATA_MASK = DATA_MASK; \
184 CC_DEP2 = CC_DEP2; \
185 CC_NDEP = CC_NDEP;
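/* The thunk convention: CC_OP says which instruction class set the flags;
   CC_DEP1/CC_DEP2 hold its operands or result, and CC_NDEP carries the
   previous flags where they are still needed (ADC, SBB, INC, DEC, the
   rotates, ADCX/ADOX).  The ACTIONS_* macros below assemble an rflags
   value using the architectural bit positions -- CF=bit 0, PF=bit 2,
   AF=bit 4, ZF=bit 6, SF=bit 7, OF=bit 11 -- hence the "<< 6", "& 0x80",
   "& 0x10" and "<< 11" idioms. */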
188 /*-------------------------------------------------------------*/
190 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
192 PREAMBLE(DATA_BITS); \
193 { ULong cf, pf, af, zf, sf, of; \
194 ULong argL, argR, res; \
195 argL = CC_DEP1; \
196 argR = CC_DEP2; \
197 res = argL + argR; \
198 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
199 pf = parity_table[(UChar)res]; \
200 af = (res ^ argL ^ argR) & 0x10; \
201 zf = ((DATA_UTYPE)res == 0) << 6; \
202 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
203 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
204 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
205 return cf | pf | af | zf | sf | of; \
209 /*-------------------------------------------------------------*/
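/* A note on the OF computation in ACTIONS_ADD above and ACTIONS_SUB below:
   for res = argL + argR, signed overflow occurs exactly when both operands
   have the same sign and the result's sign differs, i.e. when bit
   (DATA_BITS-1) of (argL ^ argR ^ -1) & (argL ^ res) is set; for
   res = argL - argR the corresponding term is (argL ^ argR) & (argL ^ res).
   The lshift(..., 12 - DATA_BITS) then moves that bit to bit 11, the OF
   position. */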
211 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
213 PREAMBLE(DATA_BITS); \
214 { ULong cf, pf, af, zf, sf, of; \
215 ULong argL, argR, res; \
216 argL = CC_DEP1; \
217 argR = CC_DEP2; \
218 res = argL - argR; \
219 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
220 pf = parity_table[(UChar)res]; \
221 af = (res ^ argL ^ argR) & 0x10; \
222 zf = ((DATA_UTYPE)res == 0) << 6; \
223 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
224 of = lshift((argL ^ argR) & (argL ^ res), \
225 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
226 return cf | pf | af | zf | sf | of; \
230 /*-------------------------------------------------------------*/
232 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
234 PREAMBLE(DATA_BITS); \
235 { ULong cf, pf, af, zf, sf, of; \
236 ULong argL, argR, oldC, res; \
237 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
238 argL = CC_DEP1; \
239 argR = CC_DEP2 ^ oldC; \
240 res = (argL + argR) + oldC; \
241 if (oldC) \
242 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
243 else \
244 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
245 pf = parity_table[(UChar)res]; \
246 af = (res ^ argL ^ argR) & 0x10; \
247 zf = ((DATA_UTYPE)res == 0) << 6; \
248 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
249 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
250 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
251 return cf | pf | af | zf | sf | of; \
255 /*-------------------------------------------------------------*/
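/* For ADC (above) and SBB (below), the thunk stores argR already xor'd
   with the old carry bit, so the first step is to xor it back out.  The
   carry/borrow test depends on the carry-in: with oldC == 1 the boundary
   case (equality at the given width) can only arise via a wrap-around and
   must count as a carry, hence "<=" rather than "<". */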
257 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
259 PREAMBLE(DATA_BITS); \
260 { ULong cf, pf, af, zf, sf, of; \
261 ULong argL, argR, oldC, res; \
262 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
263 argL = CC_DEP1; \
264 argR = CC_DEP2 ^ oldC; \
265 res = (argL - argR) - oldC; \
266 if (oldC) \
267 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
268 else \
269 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
270 pf = parity_table[(UChar)res]; \
271 af = (res ^ argL ^ argR) & 0x10; \
272 zf = ((DATA_UTYPE)res == 0) << 6; \
273 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
274 of = lshift((argL ^ argR) & (argL ^ res), \
275 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
276 return cf | pf | af | zf | sf | of; \
280 /*-------------------------------------------------------------*/
282 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
284 PREAMBLE(DATA_BITS); \
285 { ULong cf, pf, af, zf, sf, of; \
286 cf = 0; \
287 pf = parity_table[(UChar)CC_DEP1]; \
288 af = 0; \
289 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
290 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
291 of = 0; \
292 return cf | pf | af | zf | sf | of; \
296 /*-------------------------------------------------------------*/
298 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
300 PREAMBLE(DATA_BITS); \
301 { ULong cf, pf, af, zf, sf, of; \
302 ULong argL, argR, res; \
303 res = CC_DEP1; \
304 argL = res - 1; \
305 argR = 1; \
306 cf = CC_NDEP & AMD64G_CC_MASK_C; \
307 pf = parity_table[(UChar)res]; \
308 af = (res ^ argL ^ argR) & 0x10; \
309 zf = ((DATA_UTYPE)res == 0) << 6; \
310 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
311 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
312 return cf | pf | af | zf | sf | of; \
316 /*-------------------------------------------------------------*/
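/* INC (above) and DEC (below) do not modify CF, so the current CF is
   recovered from CC_NDEP (the flags before the operation).  OF is set
   exactly when the result lands on the signed boundary: 0x80...0 for INC
   (the operand was the most positive value), 0x7F...F for DEC (the
   operand was the most negative value). */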
318 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
320 PREAMBLE(DATA_BITS); \
321 { ULong cf, pf, af, zf, sf, of; \
322 ULong argL, argR, res; \
323 res = CC_DEP1; \
324 argL = res + 1; \
325 argR = 1; \
326 cf = CC_NDEP & AMD64G_CC_MASK_C; \
327 pf = parity_table[(UChar)res]; \
328 af = (res ^ argL ^ argR) & 0x10; \
329 zf = ((DATA_UTYPE)res == 0) << 6; \
330 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
331 of = ((res & DATA_MASK) \
332 == ((ULong)SIGN_MASK - 1)) << 11; \
333 return cf | pf | af | zf | sf | of; \
337 /*-------------------------------------------------------------*/
339 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
341 PREAMBLE(DATA_BITS); \
342 { ULong cf, pf, af, zf, sf, of; \
343 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
344 pf = parity_table[(UChar)CC_DEP1]; \
345 af = 0; /* undefined */ \
346 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
347 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
348 /* of is defined if shift count == 1 */ \
349 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
350 & AMD64G_CC_MASK_O; \
351 return cf | pf | af | zf | sf | of; \
355 /*-------------------------------------------------------------*/
357 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
359 PREAMBLE(DATA_BITS); \
360 { ULong cf, pf, af, zf, sf, of; \
361 cf = CC_DEP2 & 1; \
362 pf = parity_table[(UChar)CC_DEP1]; \
363 af = 0; /* undefined */ \
364 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
365 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
366 /* of is defined if shift count == 1 */ \
367 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
368 & AMD64G_CC_MASK_O; \
369 return cf | pf | af | zf | sf | of; \
373 /*-------------------------------------------------------------*/
375 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
376 /* DEP1 = result, NDEP = old flags */
377 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
379 PREAMBLE(DATA_BITS); \
380 { ULong fl \
381 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
382 | (AMD64G_CC_MASK_C & CC_DEP1) \
383 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
384 11-(DATA_BITS-1)) \
385 ^ lshift(CC_DEP1, 11))); \
386 return fl; \
390 /*-------------------------------------------------------------*/
392 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
393 /* DEP1 = result, NDEP = old flags */
394 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
396 PREAMBLE(DATA_BITS); \
397 { ULong fl \
398 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
399 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
400 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
401 11-(DATA_BITS-1)) \
402 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
403 return fl; \
407 /*-------------------------------------------------------------*/
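/* The two rotate cases above only recompute C and O; S, Z, A and P are
   left exactly as they were, which is why the old flags in CC_NDEP are
   masked and merged back in rather than being rebuilt. */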
409 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
410 DATA_U2TYPE, NARROWto2U) \
412 PREAMBLE(DATA_BITS); \
413 { ULong cf, pf, af, zf, sf, of; \
414 DATA_UTYPE hi; \
415 DATA_UTYPE lo \
416 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
417 * ((DATA_UTYPE)CC_DEP2) ); \
418 DATA_U2TYPE rr \
419 = NARROWto2U( \
420 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
421 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
422 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
423 cf = (hi != 0); \
424 pf = parity_table[(UChar)lo]; \
425 af = 0; /* undefined */ \
426 zf = (lo == 0) << 6; \
427 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
428 of = cf << 11; \
429 return cf | pf | af | zf | sf | of; \
433 /*-------------------------------------------------------------*/
435 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
436 DATA_S2TYPE, NARROWto2S) \
438 PREAMBLE(DATA_BITS); \
439 { ULong cf, pf, af, zf, sf, of; \
440 DATA_STYPE hi; \
441 DATA_STYPE lo \
442 = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1) \
443 * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) ); \
444 DATA_S2TYPE rr \
445 = NARROWto2S( \
446 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
447 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
448 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
449 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
450 pf = parity_table[(UChar)lo]; \
451 af = 0; /* undefined */ \
452 zf = (lo == 0) << 6; \
453 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
454 of = cf << 11; \
455 return cf | pf | af | zf | sf | of; \
459 /*-------------------------------------------------------------*/
461 #define ACTIONS_UMULQ \
463 PREAMBLE(64); \
464 { ULong cf, pf, af, zf, sf, of; \
465 ULong lo, hi; \
466 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
467 cf = (hi != 0); \
468 pf = parity_table[(UChar)lo]; \
469 af = 0; /* undefined */ \
470 zf = (lo == 0) << 6; \
471 sf = lshift(lo, 8 - 64) & 0x80; \
472 of = cf << 11; \
473 return cf | pf | af | zf | sf | of; \
477 /*-------------------------------------------------------------*/
479 #define ACTIONS_SMULQ \
481 PREAMBLE(64); \
482 { ULong cf, pf, af, zf, sf, of; \
483 Long lo, hi; \
484 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
485 cf = (hi != (lo >>/*s*/ (64-1))); \
486 pf = parity_table[(UChar)lo]; \
487 af = 0; /* undefined */ \
488 zf = (lo == 0) << 6; \
489 sf = lshift(lo, 8 - 64) & 0x80; \
490 of = cf << 11; \
491 return cf | pf | af | zf | sf | of; \
495 /*-------------------------------------------------------------*/
497 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
499 PREAMBLE(DATA_BITS); \
500 { ULong cf, pf, af, zf, sf, of; \
501 cf = 0; \
502 pf = 0; \
503 af = 0; \
504 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
505 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
506 of = 0; \
507 return cf | pf | af | zf | sf | of; \
511 /*-------------------------------------------------------------*/
513 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
515 PREAMBLE(DATA_BITS); \
516 { ULong cf, pf, af, zf, sf, of; \
517 cf = ((DATA_UTYPE)CC_DEP2 != 0); \
518 pf = 0; \
519 af = 0; \
520 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
521 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
522 of = 0; \
523 return cf | pf | af | zf | sf | of; \
527 /*-------------------------------------------------------------*/
529 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
531 PREAMBLE(DATA_BITS); \
532 { Long cf, pf, af, zf, sf, of; \
533 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
534 pf = 0; \
535 af = 0; \
536 zf = 0; \
537 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
538 of = 0; \
539 return cf | pf | af | zf | sf | of; \
543 /*-------------------------------------------------------------*/
545 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
547 PREAMBLE(DATA_BITS); \
548 { ULong cf, pf, af, zf, sf, of; \
549 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
550 pf = 0; \
551 af = 0; \
552 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
553 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
554 of = 0; \
555 return cf | pf | af | zf | sf | of; \
559 /*-------------------------------------------------------------*/
561 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME) \
563 PREAMBLE(DATA_BITS); \
564 { ULong ocf; /* o or c */ \
565 ULong argL, argR, oldOC, res; \
566 oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1; \
567 argL = CC_DEP1; \
568 argR = CC_DEP2 ^ oldOC; \
569 res = (argL + argR) + oldOC; \
570 if (oldOC) \
571 ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
572 else \
573 ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
574 return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME) \
575 | (ocf << AMD64G_CC_SHIFT_##FLAGNAME); \
579 /*-------------------------------------------------------------*/
582 #if PROFILE_RFLAGS
584 static Bool initted = False;
586 /* C flag, fast route */
587 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
588 /* C flag, slow route */
589 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
590 /* table for calculate_cond */
591 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
592 /* total entry counts for calc_all, calc_c, calc_cond. */
593 static UInt n_calc_all = 0;
594 static UInt n_calc_c = 0;
595 static UInt n_calc_cond = 0;
597 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
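/* i.e. dump the counters roughly every 2^22 (~4.2 million) calls. */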
600 static void showCounts ( void )
602 Int op, co;
603 HChar ch;
604 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
605 n_calc_all, n_calc_cond, n_calc_c);
607 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
608 " S NS P NP L NL LE NLE\n");
609 vex_printf(" -----------------------------------------------------"
610 "----------------------------------------\n");
611 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
613 ch = ' ';
614 if (op > 0 && (op-1) % 4 == 0)
615 ch = 'B';
616 if (op > 0 && (op-1) % 4 == 1)
617 ch = 'W';
618 if (op > 0 && (op-1) % 4 == 2)
619 ch = 'L';
620 if (op > 0 && (op-1) % 4 == 3)
621 ch = 'Q';
623 vex_printf("%2d%c: ", op, ch);
624 vex_printf("%6u ", tabc_slow[op]);
625 vex_printf("%6u ", tabc_fast[op]);
626 for (co = 0; co < 16; co++) {
627 Int n = tab_cond[op][co];
628 if (n >= 1000) {
629 vex_printf(" %3dK", n / 1000);
630 } else
631 if (n >= 0) {
632 vex_printf(" %3d ", n );
633 } else {
634 vex_printf(" ");
637 vex_printf("\n");
639 vex_printf("\n");
642 static void initCounts ( void )
644 Int op, co;
645 initted = True;
646 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
647 tabc_fast[op] = tabc_slow[op] = 0;
648 for (co = 0; co < 16; co++)
649 tab_cond[op][co] = 0;
653 #endif /* PROFILE_RFLAGS */
656 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
657 /* Calculate all the 6 flags from the supplied thunk parameters.
658 Worker function, not directly called from generated code. */
659 static
660 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
661 ULong cc_dep1_formal,
662 ULong cc_dep2_formal,
663 ULong cc_ndep_formal )
665 switch (cc_op) {
666 case AMD64G_CC_OP_COPY:
667 return cc_dep1_formal
668 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
669 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
671 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
672 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
673 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
674 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
676 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
677 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
678 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
679 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
681 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
682 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
683 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
684 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
686 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
687 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
688 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
689 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
691 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
692 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
693 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
694 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
696 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
697 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
698 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
699 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
701 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
702 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
703 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
704 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
706 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
707 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
708 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
709 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
711 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
712 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
713 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
714 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
716 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
717 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
718 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
719 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
721 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
722 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
723 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
724 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
726 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
727 UShort, toUShort );
728 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
729 UInt, toUInt );
730 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
731 ULong, idULong );
733 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
735 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
736 Short, toUShort );
737 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
738 Int, toUInt );
739 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
740 Long, idULong );
742 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
744 case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt );
745 case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );
747 case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt );
748 case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );
750 case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt );
751 case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );
753 case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt );
754 case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );
756 case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt, C );
757 case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
759 case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt, O );
760 case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
762 default:
763 /* shouldn't really make these calls from generated code */
764 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
765 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
766 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
767 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
771 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
772 /* Calculate all the 6 flags from the supplied thunk parameters. */
773 ULong amd64g_calculate_rflags_all ( ULong cc_op,
774 ULong cc_dep1,
775 ULong cc_dep2,
776 ULong cc_ndep )
778 # if PROFILE_RFLAGS
779 if (!initted) initCounts();
780 n_calc_all++;
781 if (SHOW_COUNTS_NOW) showCounts();
782 # endif
783 return
784 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
788 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
789 /* Calculate just the carry flag from the supplied thunk parameters. */
790 ULong amd64g_calculate_rflags_c ( ULong cc_op,
791 ULong cc_dep1,
792 ULong cc_dep2,
793 ULong cc_ndep )
795 # if PROFILE_RFLAGS
796 if (!initted) initCounts();
797 n_calc_c++;
798 tabc_fast[cc_op]++;
799 if (SHOW_COUNTS_NOW) showCounts();
800 # endif
802 /* Fast-case some common ones. */
803 switch (cc_op) {
804 case AMD64G_CC_OP_COPY:
805 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
806 case AMD64G_CC_OP_LOGICQ:
807 case AMD64G_CC_OP_LOGICL:
808 case AMD64G_CC_OP_LOGICW:
809 case AMD64G_CC_OP_LOGICB:
810 return 0;
811 // case AMD64G_CC_OP_SUBL:
812 // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
813 // ? AMD64G_CC_MASK_C : 0;
814 // case AMD64G_CC_OP_SUBW:
815 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
816 // ? AMD64G_CC_MASK_C : 0;
817 // case AMD64G_CC_OP_SUBB:
818 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
819 // ? AMD64G_CC_MASK_C : 0;
820 // case AMD64G_CC_OP_INCL:
821 // case AMD64G_CC_OP_DECL:
822 // return cc_ndep & AMD64G_CC_MASK_C;
823 default:
824 break;
827 # if PROFILE_RFLAGS
828 tabc_fast[cc_op]--;
829 tabc_slow[cc_op]++;
830 # endif
832 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
833 & AMD64G_CC_MASK_C;
837 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
838 /* returns 1 or 0 */
839 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
840 ULong cc_op,
841 ULong cc_dep1,
842 ULong cc_dep2,
843 ULong cc_ndep )
845 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
846 cc_dep2, cc_ndep);
847 ULong of,sf,zf,cf,pf;
848 ULong inv = cond & 1;
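   /* The condition codes come in complementary pairs (O/NO, B/NB, Z/NZ,
      ...), with the low bit selecting the negated sense; each case below
      computes the positive condition and xors in `inv` to obtain the
      other member of the pair for free. */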
850 # if PROFILE_RFLAGS
851 if (!initted) initCounts();
852 tab_cond[cc_op][cond]++;
853 n_calc_cond++;
854 if (SHOW_COUNTS_NOW) showCounts();
855 # endif
857 switch (cond) {
858 case AMD64CondNO:
859 case AMD64CondO: /* OF == 1 */
860 of = rflags >> AMD64G_CC_SHIFT_O;
861 return 1 & (inv ^ of);
863 case AMD64CondNZ:
864 case AMD64CondZ: /* ZF == 1 */
865 zf = rflags >> AMD64G_CC_SHIFT_Z;
866 return 1 & (inv ^ zf);
868 case AMD64CondNB:
869 case AMD64CondB: /* CF == 1 */
870 cf = rflags >> AMD64G_CC_SHIFT_C;
871 return 1 & (inv ^ cf);
872 break;
874 case AMD64CondNBE:
875 case AMD64CondBE: /* (CF or ZF) == 1 */
876 cf = rflags >> AMD64G_CC_SHIFT_C;
877 zf = rflags >> AMD64G_CC_SHIFT_Z;
878 return 1 & (inv ^ (cf | zf));
879 break;
881 case AMD64CondNS:
882 case AMD64CondS: /* SF == 1 */
883 sf = rflags >> AMD64G_CC_SHIFT_S;
884 return 1 & (inv ^ sf);
886 case AMD64CondNP:
887 case AMD64CondP: /* PF == 1 */
888 pf = rflags >> AMD64G_CC_SHIFT_P;
889 return 1 & (inv ^ pf);
891 case AMD64CondNL:
892 case AMD64CondL: /* (SF xor OF) == 1 */
893 sf = rflags >> AMD64G_CC_SHIFT_S;
894 of = rflags >> AMD64G_CC_SHIFT_O;
895 return 1 & (inv ^ (sf ^ of));
896 break;
898 case AMD64CondNLE:
899 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
900 sf = rflags >> AMD64G_CC_SHIFT_S;
901 of = rflags >> AMD64G_CC_SHIFT_O;
902 zf = rflags >> AMD64G_CC_SHIFT_Z;
903 return 1 & (inv ^ ((sf ^ of) | zf));
904 break;
906 default:
907 /* shouldn't really make these calls from generated code */
908 vex_printf("amd64g_calculate_condition"
909 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
910 cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
911 vpanic("amd64g_calculate_condition");
916 /* VISIBLE TO LIBVEX CLIENT */
917 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
919 ULong rflags = amd64g_calculate_rflags_all_WRK(
920 vex_state->guest_CC_OP,
921 vex_state->guest_CC_DEP1,
922 vex_state->guest_CC_DEP2,
923 vex_state->guest_CC_NDEP
925 Long dflag = vex_state->guest_DFLAG;
926 vassert(dflag == 1 || dflag == -1);
927 if (dflag == -1)
928 rflags |= (1<<10);
929 if (vex_state->guest_IDFLAG == 1)
930 rflags |= (1<<21);
931 if (vex_state->guest_ACFLAG == 1)
932 rflags |= (1<<18);
934 return rflags;
937 /* VISIBLE TO LIBVEX CLIENT */
938 void
939 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
940 /*MOD*/VexGuestAMD64State* vex_state )
942 /* D flag */
943 if (rflags & AMD64G_CC_MASK_D) {
944 vex_state->guest_DFLAG = -1;
945 rflags &= ~AMD64G_CC_MASK_D;
947 else
948 vex_state->guest_DFLAG = 1;
950 /* ID flag */
951 if (rflags & AMD64G_CC_MASK_ID) {
952 vex_state->guest_IDFLAG = 1;
953 rflags &= ~AMD64G_CC_MASK_ID;
955 else
956 vex_state->guest_IDFLAG = 0;
958 /* AC flag */
959 if (rflags & AMD64G_CC_MASK_AC) {
960 vex_state->guest_ACFLAG = 1;
961 rflags &= ~AMD64G_CC_MASK_AC;
963 else
964 vex_state->guest_ACFLAG = 0;
966 UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
967 AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
968 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
969 vex_state->guest_CC_DEP1 = rflags & cc_mask;
970 vex_state->guest_CC_DEP2 = 0;
971 vex_state->guest_CC_NDEP = 0;
974 /* VISIBLE TO LIBVEX CLIENT */
975 void
976 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
977 /*MOD*/VexGuestAMD64State* vex_state )
979 ULong oszacp = amd64g_calculate_rflags_all_WRK(
980 vex_state->guest_CC_OP,
981 vex_state->guest_CC_DEP1,
982 vex_state->guest_CC_DEP2,
983 vex_state->guest_CC_NDEP
985 if (new_carry_flag & 1) {
986 oszacp |= AMD64G_CC_MASK_C;
987 } else {
988 oszacp &= ~AMD64G_CC_MASK_C;
990 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
991 vex_state->guest_CC_DEP1 = oszacp;
992 vex_state->guest_CC_DEP2 = 0;
993 vex_state->guest_CC_NDEP = 0;
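/* Purely illustrative sketch of how a LibVEX client might use the two
   calls above, assuming it already holds a populated guest state; the
   function name here is arbitrary and not part of the VEX API. */
#if 0
static void example_flag_roundtrip ( VexGuestAMD64State* st )
{
   ULong rf = LibVEX_GuestAMD64_get_rflags(st);
   vex_printf("guest rflags = 0x%llx\n", rf);
   /* Force CF to 1 without disturbing O/S/Z/A/P. */
   LibVEX_GuestAMD64_put_rflag_c(1, st);
}
#endif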
996 /*---------------------------------------------------------------*/
997 /*--- %rflags translation-time function specialisers. ---*/
998 /*--- These help iropt specialise calls the above run-time ---*/
999 /*--- %rflags functions. ---*/
1000 /*---------------------------------------------------------------*/
1002 /* Used by the optimiser to try specialisations. Returns an
1003 equivalent expression, or NULL if none. */
1005 static inline Bool isU64 ( IRExpr* e, ULong n )
1007 return e->tag == Iex_Const
1008 && e->Iex.Const.con->tag == Ico_U64
1009 && e->Iex.Const.con->Ico.U64 == n;
1012 /* Returns N if W64 is a value of the form 1 << N for N in 1 to 31,
1013 and zero in any other case. */
1014 static Int isU64_1_shl_N_literal ( ULong w64 )
1016 if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1017 return 0;
1018 if ((w64 & (w64 - 1)) != 0)
1019 return 0;
1020 /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1021 and we only need to find out which one it is. */
1022 for (Int n = 1; n <= 31; n++) {
1023 if (w64 == (1ULL << n))
1024 return n;
1026 /* Consequently we should never get here. */
1027 /*UNREACHED*/
1028 vassert(0);
1029 return 0;
1032 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1033 and zero in any other case. */
1034 static Int isU64_1_shl_N ( IRExpr* e )
1036 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1037 return 0;
1038 ULong w64 = e->Iex.Const.con->Ico.U64;
1039 return isU64_1_shl_N_literal(w64);
1042 /* Returns N if E is an immediate of the form (1 << N) - 1 for N in 1 to 31,
1043 and zero in any other case. */
1044 static Int isU64_1_shl_N_minus_1 ( IRExpr* e )
1046 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1047 return 0;
1048 ULong w64 = e->Iex.Const.con->Ico.U64;
1049 // This isn't actually necessary since isU64_1_shl_N_literal will return
1050 // zero given a zero argument, but still ..
1051 if (w64 == 0xFFFFFFFFFFFFFFFFULL)
1052 return 0;
1053 return isU64_1_shl_N_literal(w64 + 1);
1056 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1057 IRExpr** args,
1058 IRStmt** precedingStmts,
1059 Int n_precedingStmts )
1061 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1062 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1063 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1064 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1065 # define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
1066 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
1068 Int i, arity = 0;
1069 for (i = 0; args[i]; i++)
1070 arity++;
1071 # if 0
1072 vex_printf("spec request:\n");
1073 vex_printf(" %s ", function_name);
1074 for (i = 0; i < arity; i++) {
1075 vex_printf(" ");
1076 ppIRExpr(args[i]);
1078 vex_printf("\n");
1079 # endif
1081 /* --------- specialising "amd64g_calculate_condition" --------- */
1083 if (vex_streq(function_name, "amd64g_calculate_condition")) {
1084 /* specialise calls to above "calculate condition" function */
1085 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1086 vassert(arity == 5);
1087 cond = args[0];
1088 cc_op = args[1];
1089 cc_dep1 = args[2];
1090 cc_dep2 = args[3];
1092 /*---------------- ADDQ ----------------*/
1094 /* 4, */
1095 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1096 /* long long add, then Z --> test (dst+src == 0) */
1097 return unop(Iop_1Uto64,
1098 binop(Iop_CmpEQ64,
1099 binop(Iop_Add64, cc_dep1, cc_dep2),
1100 mkU64(0)));
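      /* This is the pattern used throughout this function: match a
         (cc_op, cond) pair of constants via isU64 and return an equivalent
         flag-free expression built from ordinary IR primitives.
         Iop_1Uto64 widens the 1-bit comparison result to the 64-bit value
         the helper call would have produced; where no case matches, the
         function returns NULL and the original helper call is kept. */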
1103 /* 8, */
1104 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondS)) {
1105 /* long long add, then S (negative)
1106 --> (dst+src)[63]
1107 --> ((dst + src) >>u 63) & 1
1109 return binop(Iop_And64,
1110 binop(Iop_Shr64,
1111 binop(Iop_Add64, cc_dep1, cc_dep2),
1112 mkU8(63)),
1113 mkU64(1));
1116 /*---------------- ADDL ----------------*/
1118 /* 0, */
1119 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1120 /* This is very commonly generated by Javascript JITs, for
1121 the idiom "do a 32-bit add and jump to out-of-line code if
1122 an overflow occurs". */
1123 /* long add, then O (overflow)
1124 --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1125 --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1126 --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1128 vassert(isIRAtom(cc_dep1));
1129 vassert(isIRAtom(cc_dep2));
1130 return
1131 binop(Iop_And64,
1132 binop(Iop_Shr64,
1133 binop(Iop_And64,
1134 unop(Iop_Not64,
1135 binop(Iop_Xor64, cc_dep1, cc_dep2)),
1136 binop(Iop_Xor64,
1137 cc_dep1,
1138 binop(Iop_Add64, cc_dep1, cc_dep2))),
1139 mkU8(31)),
1140 mkU64(1));
1144 /* 4, */
1145 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondZ)) {
1146 /* long add, then Z --> test ((int)(dst+src) == 0) */
1147 return unop(Iop_1Uto64,
1148 binop(Iop_CmpEQ32,
1149 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1150 mkU32(0)));
1153 /* 8, 9 */
1154 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondS)) {
1155 /* long add, then S (negative)
1156 --> (dst+src)[31]
1157 --> ((dst +64 src) >>u 31) & 1
1158 Pointless to narrow the args to 32 bit before the add. */
1159 return binop(Iop_And64,
1160 binop(Iop_Shr64,
1161 binop(Iop_Add64, cc_dep1, cc_dep2),
1162 mkU8(31)),
1163 mkU64(1));
1165 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondNS)) {
1166 /* long add, then NS (not negative)
1167 --> (dst+src)[31] ^ 1
1168 --> (((dst +64 src) >>u 31) & 1) ^ 1
1169 Pointless to narrow the args to 32 bit before the add. */
1170 return binop(Iop_Xor64,
1171 binop(Iop_And64,
1172 binop(Iop_Shr64,
1173 binop(Iop_Add64, cc_dep1, cc_dep2),
1174 mkU8(31)),
1175 mkU64(1)),
1176 mkU64(1));
1179 /*---------------- ADDW ----------------*/
1181 /* 4, */
1182 if (isU64(cc_op, AMD64G_CC_OP_ADDW) && isU64(cond, AMD64CondZ)) {
1184 /* word add, then Z --> test ((short)(dst+src) == 0) */
1185 return unop(Iop_1Uto64,
1186 binop(Iop_CmpEQ16,
1187 unop(Iop_64to16, binop(Iop_Add64, cc_dep1, cc_dep2)),
1188 mkU16(0)));
1191 /*---------------- ADDB ----------------*/
1193 /* 4, */
1194 if (isU64(cc_op, AMD64G_CC_OP_ADDB) && isU64(cond, AMD64CondZ)) {
1195 /* byte add, then Z --> test ((char)(dst+src) == 0) */
1196 return unop(Iop_1Uto64,
1197 binop(Iop_CmpEQ8,
1198 unop(Iop_64to8, binop(Iop_Add64, cc_dep1, cc_dep2)),
1199 mkU8(0)));
1202 /*---------------- SUBQ ----------------*/
1204 /* 0, */
1205 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1206 /* long long sub/cmp, then O (overflow)
1207 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1208 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1210 vassert(isIRAtom(cc_dep1));
1211 vassert(isIRAtom(cc_dep2));
1212 return binop(Iop_Shr64,
1213 binop(Iop_And64,
1214 binop(Iop_Xor64, cc_dep1, cc_dep2),
1215 binop(Iop_Xor64,
1216 cc_dep1,
1217 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1218 mkU8(63));
1220 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1221 /* No action. Never yet found a test case. */
1224 /* 2, 3 */
1225 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1226 /* long long sub/cmp, then B (unsigned less than)
1227 --> test dst <u src */
1228 return unop(Iop_1Uto64,
1229 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1231 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1232 /* long long sub/cmp, then NB (unsigned greater than or equal)
1233 --> test src <=u dst */
1234 /* Note, args are opposite way round from the usual */
1235 return unop(Iop_1Uto64,
1236 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1239 /* 4, 5 */
1240 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1241 /* long long sub/cmp, then Z --> test dst==src */
1242 return unop(Iop_1Uto64,
1243 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1245 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1246 /* long long sub/cmp, then NZ --> test dst!=src */
1247 return unop(Iop_1Uto64,
1248 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1251 /* 6, 7 */
1252 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1253 /* long long sub/cmp, then BE (unsigned less than or equal)
1254 --> test dst <=u src */
1255 return unop(Iop_1Uto64,
1256 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1258 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1259 /* long long sub/cmp, then NBE (unsigned greater than)
1260 --> test !(dst <=u src) */
1261 return binop(Iop_Xor64,
1262 unop(Iop_1Uto64,
1263 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1264 mkU64(1));
1267 /* 8, 9 */
1268 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1269 /* long long sub/cmp, then S (negative)
1270 --> (dst-src)[63]
1271 --> (dst-src) >>u 63 */
1272 return binop(Iop_Shr64,
1273 binop(Iop_Sub64, cc_dep1, cc_dep2),
1274 mkU8(63));
1276 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1277 /* long long sub/cmp, then NS (not negative)
1278 --> (dst-src)[63] ^ 1
1279 --> ((dst-src) >>u 63) ^ 1 */
1280 return binop(Iop_Xor64,
1281 binop(Iop_Shr64,
1282 binop(Iop_Sub64, cc_dep1, cc_dep2),
1283 mkU8(63)),
1284 mkU64(1));
1287 /* 12, 13 */
1288 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1289 /* long long sub/cmp, then L (signed less than)
1290 --> test dst <s src */
1291 return unop(Iop_1Uto64,
1292 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1294 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1295 /* long long sub/cmp, then NL (signed greater than or equal)
1296 --> test dst >=s src
1297 --> test src <=s dst */
1298 return unop(Iop_1Uto64,
1299 binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1302 /* 14, 15 */
1303 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1304 /* long long sub/cmp, then LE (signed less than or equal)
1305 --> test dst <=s src */
1306 return unop(Iop_1Uto64,
1307 binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1309 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1310 /* long long sub/cmp, then NLE (signed greater than)
1311 --> test !(dst <=s src)
1312 --> test (dst >s src)
1313 --> test (src <s dst) */
1314 return unop(Iop_1Uto64,
1315 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1319 /*---------------- SUBL ----------------*/
1321 /* 0, */
1322 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1323 /* This is very commonly generated by Javascript JITs, for
1324 the idiom "do a 32-bit subtract and jump to out-of-line
1325 code if an overflow occurs". */
1326 /* long sub/cmp, then O (overflow)
1327 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1328 --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1330 vassert(isIRAtom(cc_dep1));
1331 vassert(isIRAtom(cc_dep2));
1332 return
1333 binop(Iop_And64,
1334 binop(Iop_Shr64,
1335 binop(Iop_And64,
1336 binop(Iop_Xor64, cc_dep1, cc_dep2),
1337 binop(Iop_Xor64,
1338 cc_dep1,
1339 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1340 mkU8(31)),
1341 mkU64(1));
1344 /* 1, */
1345 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1346 /* No action. Never yet found a test case. */
1349 /* 2, 3 */
1351 /* It appears that LLVM 5.0 and later have a new way to find out
1352 whether the top N bits of a word W are all zero, by computing
1354 W <u 0---(N-1)---0 1 0---0 or
1355 W <=u 0---(N-1)---0 0 1---1
1357 In particular, the result will be defined if the top N bits of W
1358 are defined, even if the trailing bits -- those corresponding to
1359 the rightmost 0---0 / 1---1 section -- are undefined. Rather than
1360 make Memcheck more complex, we detect this case where we can and
1361 shift out the irrelevant and potentially undefined bits. */
1362 Int n = 0;
1363 Bool is_NB_or_NBE = False;
1364 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1365 if (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB)) {
1366 /* long sub/cmp, then B (unsigned less than),
1367 where dep2 is a power of 2:
1368 -> CmpLT32U(dep1, 1 << N)
1369 -> CmpEQ32(dep1 >>u N, 0)
1371 long sub/cmp, then NB (unsigned greater than or equal),
1372 where dep2 is a power of 2:
1373 -> CmpGE32U(dep1, 1 << N)
1374 -> CmpNE32(dep1 >>u N, 0)
1375 This avoids CmpLT32U/CmpGE32U being applied to potentially
1376 uninitialised bits in the area being shifted out. */
1377 n = isU64_1_shl_N(cc_dep2);
1378 is_NB_or_NBE = isU64(cond, AMD64CondNB);
1379 } else if (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE)) {
1380 /* long sub/cmp, then BE (unsigned less than or equal),
1381 where dep2 is a power of 2 minus 1:
1382 -> CmpLE32U(dep1, (1 << N) - 1)
1383 -> CmpEQ32(dep1 >>u N, 0)
1385 long sub/cmp, then NBE (unsigned greater than),
1386 where dep2 is a power of 2 minus 1:
1387 -> CmpGT32U(dep1, (1 << N) - 1)
1388 -> CmpNE32(dep1 >>u N, 0)
1389 This avoids CmpLE32U/CmpGT32U being applied to potentially
1390 uninitialised bits in the area being shifted out. */
1391 n = isU64_1_shl_N_minus_1(cc_dep2);
1392 is_NB_or_NBE = isU64(cond, AMD64CondNBE);
1395 if (n > 0) {
1396 vassert(n >= 1 && n <= 31);
1397 return unop(Iop_1Uto64,
1398 binop(is_NB_or_NBE ? Iop_CmpNE32 : Iop_CmpEQ32,
1399 binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1400 mkU8(n)),
1401 mkU32(0)));
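      /* Worked example: "cmpl $0x100, %reg" followed by JB reaches this
         point with cc_dep2 == 0x100 == 1 << 8, so n == 8 and the condition
         becomes CmpEQ32(reg >>u 8, 0); the BE/NBE variant handles the
         equivalent comparison against 0xFF (== (1 << 8) - 1). */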
1404 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1405 /* long sub/cmp, then B (unsigned less than)
1406 --> test dst <u src */
1407 return unop(Iop_1Uto64,
1408 binop(Iop_CmpLT32U,
1409 unop(Iop_64to32, cc_dep1),
1410 unop(Iop_64to32, cc_dep2)));
1412 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1413 /* long sub/cmp, then NB (unsigned greater than or equal)
1414 --> test src <=u dst */
1415 /* Note, args are opposite way round from the usual */
1416 return unop(Iop_1Uto64,
1417 binop(Iop_CmpLE32U,
1418 unop(Iop_64to32, cc_dep2),
1419 unop(Iop_64to32, cc_dep1)));
1422 /* 4, 5 */
1423 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1424 /* long sub/cmp, then Z --> test dst==src */
1425 return unop(Iop_1Uto64,
1426 binop(Iop_CmpEQ32,
1427 unop(Iop_64to32, cc_dep1),
1428 unop(Iop_64to32, cc_dep2)));
1430 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1431 /* long sub/cmp, then NZ --> test dst!=src */
1432 return unop(Iop_1Uto64,
1433 binop(Iop_CmpNE32,
1434 unop(Iop_64to32, cc_dep1),
1435 unop(Iop_64to32, cc_dep2)));
1438 /* 6, 7 */
1439 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1440 /* long sub/cmp, then BE (unsigned less than or equal)
1441 --> test dst <=u src */
1442 return unop(Iop_1Uto64,
1443 binop(Iop_CmpLE32U,
1444 unop(Iop_64to32, cc_dep1),
1445 unop(Iop_64to32, cc_dep2)));
1447 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1448 /* long sub/cmp, then NBE (unsigned greater than)
1449 --> test src <u dst */
1450 /* Note, args are opposite way round from the usual */
1451 return unop(Iop_1Uto64,
1452 binop(Iop_CmpLT32U,
1453 unop(Iop_64to32, cc_dep2),
1454 unop(Iop_64to32, cc_dep1)));
1457 /* 8, 9 */
1458 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1459 /* long sub/cmp, then S (negative)
1460 --> (dst-src)[31]
1461 --> ((dst -64 src) >>u 31) & 1
1462 Pointless to narrow the args to 32 bit before the subtract. */
1463 return binop(Iop_And64,
1464 binop(Iop_Shr64,
1465 binop(Iop_Sub64, cc_dep1, cc_dep2),
1466 mkU8(31)),
1467 mkU64(1));
1469 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1470 /* long sub/cmp, then NS (not negative)
1471 --> (dst-src)[31] ^ 1
1472 --> (((dst -64 src) >>u 31) & 1) ^ 1
1473 Pointless to narrow the args to 32 bit before the subtract. */
1474 return binop(Iop_Xor64,
1475 binop(Iop_And64,
1476 binop(Iop_Shr64,
1477 binop(Iop_Sub64, cc_dep1, cc_dep2),
1478 mkU8(31)),
1479 mkU64(1)),
1480 mkU64(1));
1483 /* 12, 13 */
1484 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1485 /* long sub/cmp, then L (signed less than)
1486 --> test dst <s src */
1487 return unop(Iop_1Uto64,
1488 binop(Iop_CmpLT32S,
1489 unop(Iop_64to32, cc_dep1),
1490 unop(Iop_64to32, cc_dep2)));
1492 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1493 /* long sub/cmp, then NL (signed greater than or equal)
1494 --> test dst >=s src
1495 --> test src <=s dst */
1496 return unop(Iop_1Uto64,
1497 binop(Iop_CmpLE32S,
1498 unop(Iop_64to32, cc_dep2),
1499 unop(Iop_64to32, cc_dep1)));
1502 /* 14, 15 */
1503 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1504 /* long sub/cmp, then LE (signed less than or equal)
1505 --> test dst <=s src */
1506 return unop(Iop_1Uto64,
1507 binop(Iop_CmpLE32S,
1508 unop(Iop_64to32, cc_dep1),
1509 unop(Iop_64to32, cc_dep2)));
1512 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1513 /* long sub/cmp, then NLE (signed greater than)
1514 --> test !(dst <=s src)
1515 --> test (dst >s src)
1516 --> test (src <s dst) */
1517 return unop(Iop_1Uto64,
1518 binop(Iop_CmpLT32S,
1519 unop(Iop_64to32, cc_dep2),
1520 unop(Iop_64to32, cc_dep1)));
1524 /*---------------- SUBW ----------------*/
1526 /* 4, 5 */
1527 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1528 /* word sub/cmp, then Z --> test dst==src */
1529 return unop(Iop_1Uto64,
1530 binop(Iop_CmpEQ16,
1531 unop(Iop_64to16,cc_dep1),
1532 unop(Iop_64to16,cc_dep2)));
1534 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1535 /* word sub/cmp, then NZ --> test dst!=src */
1536 return unop(Iop_1Uto64,
1537 binop(Iop_CmpNE16,
1538 unop(Iop_64to16,cc_dep1),
1539 unop(Iop_64to16,cc_dep2)));
1542 /* 6, */
1543 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1544 /* word sub/cmp, then BE (unsigned less than or equal)
1545 --> test dst <=u src */
1546 return unop(Iop_1Uto64,
1547 binop(Iop_CmpLE64U,
1548 binop(Iop_Shl64, cc_dep1, mkU8(48)),
1549 binop(Iop_Shl64, cc_dep2, mkU8(48))));
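      /* Shifting both 64-bit args left by 48 places the 16-bit operands in
         the top bits and discards everything else, so a plain 64-bit
         comparison (unsigned here, signed in the LE case further below)
         decides the 16-bit comparison correctly. */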
1552 /* 8, 9 */
1553 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1554 && isU64(cc_dep2, 0)) {
1555 /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1556 --> test dst <s 0
1557 --> (ULong)dst[15]
1558 This is yet another scheme by which clang figures out if the
1559 top bit of a word is 1 or 0. See also LOGICB/CondS below. */
1560 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1561 for a 16-bit comparison, since the args to the helper
1562 function are always U64s. */
1563 return binop(Iop_And64,
1564 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1565 mkU64(1));
1567 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1568 && isU64(cc_dep2, 0)) {
1569 /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1570 --> test !(dst <s 0)
1571 --> (ULong) !dst[15]
1573 return binop(Iop_Xor64,
1574 binop(Iop_And64,
1575 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1576 mkU64(1)),
1577 mkU64(1));
1580 /* 14, */
1581 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1582 /* word sub/cmp, then LE (signed less than or equal)
1583 --> test dst <=s src */
1584 return unop(Iop_1Uto64,
1585 binop(Iop_CmpLE64S,
1586 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1587 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1591 /*---------------- SUBB ----------------*/
1593 /* 2, 3 */
1594 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1595 /* byte sub/cmp, then B (unsigned less than)
1596 --> test dst <u src */
1597 return unop(Iop_1Uto64,
1598 binop(Iop_CmpLT64U,
1599 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1600 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1602 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1603 /* byte sub/cmp, then NB (unsigned greater than or equal)
1604 --> test src <=u dst */
1605 /* Note, args are opposite way round from the usual */
1606 return unop(Iop_1Uto64,
1607 binop(Iop_CmpLE64U,
1608 binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1609 binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1612 /* 4, 5 */
1613 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1614 /* byte sub/cmp, then Z --> test dst==src */
1615 return unop(Iop_1Uto64,
1616 binop(Iop_CmpEQ8,
1617 unop(Iop_64to8,cc_dep1),
1618 unop(Iop_64to8,cc_dep2)));
1620 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1621 /* byte sub/cmp, then NZ --> test dst!=src */
1622 return unop(Iop_1Uto64,
1623 binop(Iop_CmpNE8,
1624 unop(Iop_64to8,cc_dep1),
1625 unop(Iop_64to8,cc_dep2)));
1628 /* 6, */
1629 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1630 /* byte sub/cmp, then BE (unsigned less than or equal)
1631 --> test dst <=u src */
1632 return unop(Iop_1Uto64,
1633 binop(Iop_CmpLE64U,
1634 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1635 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1638 /* 8, 9 */
1639 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1640 && isU64(cc_dep2, 0)) {
1641 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1642 --> test dst <s 0
1643 --> (ULong)dst[7]
1644 This is yet another scheme by which gcc figures out if the
1645 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1646 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1647 for an 8-bit comparison, since the args to the helper
1648 function are always U64s. */
1649 return binop(Iop_And64,
1650 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1651 mkU64(1));
1653 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1654 && isU64(cc_dep2, 0)) {
1655 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1656 --> test !(dst <s 0)
1657 --> (ULong) !dst[7]
1659 return binop(Iop_Xor64,
1660 binop(Iop_And64,
1661 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1662 mkU64(1)),
1663 mkU64(1));
1666 /*---------------- LOGICQ ----------------*/
1668 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1669 /* long long and/or/xor, then Z --> test dst==0 */
1670 return unop(Iop_1Uto64,
1671 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1673 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1674 /* long long and/or/xor, then NZ --> test dst!=0 */
1675 return unop(Iop_1Uto64,
1676 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1679 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1680 /* long long and/or/xor, then L
1681 LOGIC sets SF and ZF according to the
1682 result and makes OF be zero. L computes SF ^ OF, but
1683 OF is zero, so this reduces to SF -- which will be 1 iff
1684 the result is < signed 0. Hence ...
1686 return unop(Iop_1Uto64,
1687 binop(Iop_CmpLT64S,
1688 cc_dep1,
1689 mkU64(0)));
1692 // Verified
1693 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondS)) {
1694 /* long long and/or/xor, then S --> (ULong)result[63] */
1695 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1697 // Verified
1698 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNS)) {
1699 /* long long and/or/xor, then NS --> (ULong) ~ result[63] */
1700 return binop(Iop_Xor64,
1701 binop(Iop_Shr64, cc_dep1, mkU8(63)),
1702 mkU64(1));
1705 /*---------------- LOGICL ----------------*/
1707 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1708 /* long and/or/xor, then Z --> test dst==0 */
1709 return unop(Iop_1Uto64,
1710 binop(Iop_CmpEQ32,
1711 unop(Iop_64to32, cc_dep1),
1712 mkU32(0)));
1714 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1715 /* long and/or/xor, then NZ --> test dst!=0 */
1716 return unop(Iop_1Uto64,
1717 binop(Iop_CmpNE32,
1718 unop(Iop_64to32, cc_dep1),
1719 mkU32(0)));
1722 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1723 /* long and/or/xor, then LE
1724 This is pretty subtle. LOGIC sets SF and ZF according to the
1725 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1726 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1727 the result is <=signed 0. Hence ...
1729 return unop(Iop_1Uto64,
1730 binop(Iop_CmpLE32S,
1731 unop(Iop_64to32, cc_dep1),
1732 mkU32(0)));
1735 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1736 /* long and/or/xor, then S --> (ULong)result[31] */
1737 return binop(Iop_And64,
1738 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1739 mkU64(1));
1741 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1742 /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1743 return binop(Iop_Xor64,
1744 binop(Iop_And64,
1745 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1746 mkU64(1)),
1747 mkU64(1));
1750 /*---------------- LOGICW ----------------*/
1752 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1753 /* word and/or/xor, then Z --> test dst==0 */
1754 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1755 // it exactly at EdcAUTO.
1756 return unop(Iop_1Uto64,
1757 binop(Iop_CmpEQ32,
1758 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1759 mkU32(0)));
1761 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1762 /* word and/or/xor, then NZ --> test dst!=0 */
1763 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1764 // it exactly at EdcAUTO.
1765 return unop(Iop_1Uto64,
1766 binop(Iop_CmpNE32,
1767 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1768 mkU32(0)));
1771 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondS)) {
1772 /* word and/or/xor, then S --> (ULong)result[15] */
1773 return binop(Iop_And64,
1774 binop(Iop_Shr64, cc_dep1, mkU8(15)),
1775 mkU64(1));
1777 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNS)) {
1778 /* word and/or/xor, then NS --> (ULong) ~ result[15] */
1779 return binop(Iop_Xor64,
1780 binop(Iop_And64,
1781 binop(Iop_Shr64, cc_dep1, mkU8(15)),
1782 mkU64(1)),
1783 mkU64(1));
1786 /*---------------- LOGICB ----------------*/
1788 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1789 /* byte and/or/xor, then Z --> test dst==0 */
1790 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1791 // it exactly at EdcAUTO.
1792 return unop(Iop_1Uto64,
1793 binop(Iop_CmpEQ32,
1794 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1795 mkU32(0)));
1797 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1798 /* byte and/or/xor, then NZ --> test dst!=0 */
1799 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1800 // it exactly at EdcAUTO.
1801 return unop(Iop_1Uto64,
1802 binop(Iop_CmpNE32,
1803 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1804 mkU32(0)));
1807 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1808 /* this is an idiom gcc sometimes uses to find out if the top
1809 bit of a byte register is set: eg testb %al,%al; js ..
1810 Since it just depends on the top bit of the byte, extract
1811 that bit and explicitly get rid of all the rest. This
1812 helps memcheck avoid false positives in the case where any
1813 of the other bits in the byte are undefined. */
1814 /* byte and/or/xor, then S --> (UInt)result[7] */
1815 return binop(Iop_And64,
1816 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1817 mkU64(1));
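      /* For example, "testb %al,%al ; js <target>" produces the thunk
         (LOGICB, result-of-AND, 0), and the S condition then folds to
         (cc_dep1 >> 7) & 1 -- a single-bit extract which ignores bits
         8..63 of the thunk value entirely. */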
1819 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1820 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1821 return binop(Iop_Xor64,
1822 binop(Iop_And64,
1823 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1824 mkU64(1)),
1825 mkU64(1));
1828 /*---------------- INCB ----------------*/
1830 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1831 /* 8-bit inc, then LE --> sign bit of the arg */
1832 return binop(Iop_And64,
1833 binop(Iop_Shr64,
1834 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1835 mkU8(7)),
1836 mkU64(1));
1839 /*---------------- INCW ----------------*/
1841 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1842 /* 16-bit inc, then Z --> test dst == 0 */
1843 return unop(Iop_1Uto64,
1844 binop(Iop_CmpEQ64,
1845 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1846 mkU64(0)));
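      /* The Shl64 by 48 discards bits 16..63 of the thunk value, so the
         CmpEQ64 against zero tests only the low 16 bits of the result,
         which is exactly the 16-bit Z condition.  The DECW case below
         uses the same trick. */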
1849 /*---------------- DECL ----------------*/
1851 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1852 /* dec L, then Z --> test dst == 0 */
1853 return unop(Iop_1Uto64,
1854 binop(Iop_CmpEQ32,
1855 unop(Iop_64to32, cc_dep1),
1856 mkU32(0)));
1859 /*---------------- DECW ----------------*/
1861 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1862 /* 16-bit dec, then NZ --> test dst != 0 */
1863 return unop(Iop_1Uto64,
1864 binop(Iop_CmpNE64,
1865 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1866 mkU64(0)));
1869 /*---------------- SHRQ ----------------*/
1871 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1872 /* SHRQ, then Z --> test result[63:0] == 0 */
1873 return unop(Iop_1Uto64,
1874 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1876 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1877 /* SHRQ, then NZ --> test result[63:0] != 0 */
1878 return unop(Iop_1Uto64,
1879 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1882 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondS)) {
1883 /* SHRQ, then S --> (ULong)result[63] (result is in dep1) */
1884 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1886 // No known test case for this, hence disabled:
1887 //if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNS)) {
1888 // /* SHRQ, then NS --> (ULong) ~ result[63] */
1889 // vassert(0);
1892 /*---------------- SHRL ----------------*/
1894 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1895 /* SHRL, then Z --> test dep1 == 0 */
1896 return unop(Iop_1Uto64,
1897 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1898 mkU32(0)));
1900 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1901 /* SHRL, then NZ --> test dep1 != 0 */
1902 return unop(Iop_1Uto64,
1903 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1904 mkU32(0)));
1907 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1908 /* SHRL/SARL, then S --> (ULong)result[31] */
1909 return binop(Iop_And64,
1910 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1911 mkU64(1));
1913 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1914 /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1915 return binop(Iop_Xor64,
1916 binop(Iop_And64,
1917 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1918 mkU64(1)),
1919 mkU64(1));
1922 /*---------------- SHRW ----------------*/
1924 if (isU64(cc_op, AMD64G_CC_OP_SHRW) && isU64(cond, AMD64CondZ)) {
1925 /* SHRW, then Z --> test dep1 == 0 */
1926 return unop(Iop_1Uto64,
1927 binop(Iop_CmpEQ32,
1928 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1929 mkU32(0)));
1931 // No known test case for this, hence disabled:
1932 //if (isU64(cc_op, AMD64G_CC_OP_SHRW) && isU64(cond, AMD64CondNZ)) {
   //   /* SHRW, then NZ --> test dep1 != 0 */
1934 // return unop(Iop_1Uto64,
1935 // binop(Iop_CmpNE32,
1936 // unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1937 // mkU32(0)));
1940 /*---------------- SHLQ ----------------*/
1942 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondZ)) {
1943 /* SHLQ, then Z --> test dep1 == 0 */
1944 return unop(Iop_1Uto64,
1945 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1947 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNZ)) {
1948 /* SHLQ, then NZ --> test dep1 != 0 */
1949 return unop(Iop_1Uto64,
1950 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1953 // Verified
1954 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondS)) {
1955 /* SHLQ, then S --> (ULong)result[63] */
1956 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1958 // No known test case
1959 //if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNS)) {
1960 // /* SHLQ, then NS --> (ULong) ~ result[63] */
1961 // vassert(0);
1964 /*---------------- SHLL ----------------*/
1966 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondZ)) {
1967 /* SHLL, then Z --> test result[31:0] == 0 */
1968 return unop(Iop_1Uto64,
1969 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1970 mkU32(0)));
1972 // Verified
1973 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNZ)) {
1974 /* SHLL, then NZ --> test dep1 != 0 */
1975 return unop(Iop_1Uto64,
1976 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1977 mkU32(0)));
1980 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondS)) {
1981 /* SHLL, then S --> (ULong)result[31] */
1982 return binop(Iop_And64,
1983 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1984 mkU64(1));
1986 // No known test case
1987 //if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNS)) {
1988 // /* SHLL, then NS --> (ULong) ~ result[31] */
1989 // vassert(0);
1992 /*---------------- COPY ----------------*/
1993 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1994 jbe" for example. */
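   /* For the COPY thunk, cc_dep1 holds the flag bits at their
      AMD64G_CC_SHIFT_* positions, so each of the conditions handled
      below reduces to a shift by the relevant shift constant followed
      by a mask with 1. */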
1996 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1997 && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1998 /* COPY, then BE --> extract C and Z from dep1, and test (C
1999 or Z == 1). */
2000 /* COPY, then NBE --> extract C and Z from dep1, and test (C
2001 or Z == 0). */
2002 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
2003 return
2004 unop(
2005 Iop_1Uto64,
2006 binop(
2007 Iop_CmpEQ64,
2008 binop(
2009 Iop_And64,
2010 binop(
2011 Iop_Or64,
2012 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
2013 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
2015 mkU64(1)
2017 mkU64(nnn)
2022 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2023 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
2024 /* COPY, then B --> extract C from dep1, and test (C == 1). */
2025 /* COPY, then NB --> extract C from dep1, and test (C == 0). */
2026 ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
2027 return
2028 unop(
2029 Iop_1Uto64,
2030 binop(
2031 Iop_CmpEQ64,
2032 binop(
2033 Iop_And64,
2034 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
2035 mkU64(1)
2037 mkU64(nnn)
2042 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2043 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
2044 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
2045 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
2046 ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
2047 return
2048 unop(
2049 Iop_1Uto64,
2050 binop(
2051 Iop_CmpEQ64,
2052 binop(
2053 Iop_And64,
2054 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
2055 mkU64(1)
2057 mkU64(nnn)
2062 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2063 && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
2064 /* COPY, then P --> extract P from dep1, and test (P == 1). */
2065 /* COPY, then NP --> extract P from dep1, and test (P == 0). */
2066 ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
2067 return
2068 unop(
2069 Iop_1Uto64,
2070 binop(
2071 Iop_CmpEQ64,
2072 binop(
2073 Iop_And64,
2074 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
2075 mkU64(1)
2077 mkU64(nnn)
2082 # if 0
2083 if (cond->tag == Iex_Const && cc_op->tag == Iex_Const) {
2084 vex_printf("spec request failed: ");
2085 vex_printf(" %s ", function_name);
2086 for (i = 0; i < 2/*arity*/; i++) {
2087 vex_printf(" ");
2088 ppIRExpr(args[i]);
2090 vex_printf("\n");
2092 # endif
2094 return NULL;
2097 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
2099 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
2100 /* specialise calls to above "calculate_rflags_c" function */
2101 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
2102 vassert(arity == 4);
2103 cc_op = args[0];
2104 cc_dep1 = args[1];
2105 cc_dep2 = args[2];
2106 cc_ndep = args[3];
2108 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
2109 /* C after sub denotes unsigned less than */
2110 return unop(Iop_1Uto64,
2111 binop(Iop_CmpLT64U,
2112 cc_dep1,
2113 cc_dep2));
2115 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
2116 /* C after sub denotes unsigned less than */
2117 return unop(Iop_1Uto64,
2118 binop(Iop_CmpLT32U,
2119 unop(Iop_64to32, cc_dep1),
2120 unop(Iop_64to32, cc_dep2)));
2122 if (isU64(cc_op, AMD64G_CC_OP_SUBW)) {
2123 /* C after sub denotes unsigned less than */
2124 return unop(Iop_1Uto64,
2125 binop(Iop_CmpLT64U,
2126 binop(Iop_And64,cc_dep1,mkU64(0xFFFF)),
2127 binop(Iop_And64,cc_dep2,mkU64(0xFFFF))));
2129 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
2130 /* C after sub denotes unsigned less than */
2131 return unop(Iop_1Uto64,
2132 binop(Iop_CmpLT64U,
2133 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
2134 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
2136 if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
2137 /* C after add denotes sum <u either arg */
2138 return unop(Iop_1Uto64,
2139 binop(Iop_CmpLT64U,
2140 binop(Iop_Add64, cc_dep1, cc_dep2),
2141 cc_dep1));
2143 if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
2144 /* C after add denotes sum <u either arg */
2145 return unop(Iop_1Uto64,
2146 binop(Iop_CmpLT32U,
2147 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
2148 unop(Iop_64to32, cc_dep1)));
2150 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
2151 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
2152 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
2153 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
2154 /* cflag after logic is zero */
2155 return mkU64(0);
2157 if (isU64(cc_op, AMD64G_CC_OP_DECL)
2158 || isU64(cc_op, AMD64G_CC_OP_INCL)
2159 || isU64(cc_op, AMD64G_CC_OP_DECQ)
2160 || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
2161 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
2162 return cc_ndep;
2165 # if 0
2166 if (cc_op->tag == Iex_Const) {
2167 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
2169 # endif
2171 # if 0
2172 if (cc_op->tag == Iex_Const) {
2173 vex_printf("spec request failed: ");
2174 vex_printf(" %s ", function_name);
2175 for (i = 0; i < 2/*arity*/; i++) {
2176 vex_printf(" ");
2177 ppIRExpr(args[i]);
2179 vex_printf("\n");
2181 # endif
2183 return NULL;
2186 # undef unop
2187 # undef binop
2188 # undef mkU64
2189 # undef mkU32
2190 # undef mkU8
2192 return NULL;
2196 /*---------------------------------------------------------------*/
2197 /*--- Supporting functions for x87 FPU activities. ---*/
2198 /*---------------------------------------------------------------*/
2200 static inline Bool host_is_little_endian ( void )
2202 UInt x = 0x76543210;
2203 UChar* p = (UChar*)(&x);
2204 return toBool(*p == 0x10);
2207 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
2208 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
2209 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
2211 Bool mantissaIsZero;
2212 Int bexp;
2213 UChar sign;
2214 UChar* f64;
2216 vassert(host_is_little_endian());
2218 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
2220 f64 = (UChar*)(&dbl);
2221 sign = toUChar( (f64[7] >> 7) & 1 );
2223 /* First off, if the tag indicates the register was empty,
2224 return 1,0,sign,1 */
2225 if (tag == 0) {
2226 /* vex_printf("Empty\n"); */
2227 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
2228 | AMD64G_FC_MASK_C0;
2231 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
2232 bexp &= 0x7FF;
2234 mantissaIsZero
2235 = toBool(
2236 (f64[6] & 0x0F) == 0
2237 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
2240 /* If both exponent and mantissa are zero, the value is zero.
2241 Return 1,0,sign,0. */
2242 if (bexp == 0 && mantissaIsZero) {
2243 /* vex_printf("Zero\n"); */
2244 return AMD64G_FC_MASK_C3 | 0
2245 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2248 /* If exponent is zero but mantissa isn't, it's a denormal.
2249 Return 1,1,sign,0. */
2250 if (bexp == 0 && !mantissaIsZero) {
2251 /* vex_printf("Denormal\n"); */
2252 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
2253 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2256 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
2257 Return 0,1,sign,1. */
2258 if (bexp == 0x7FF && mantissaIsZero) {
2259 /* vex_printf("Inf\n"); */
2260 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2261 | AMD64G_FC_MASK_C0;
2264 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2265 Return 0,0,sign,1. */
2266 if (bexp == 0x7FF && !mantissaIsZero) {
2267 /* vex_printf("NaN\n"); */
2268 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2271 /* Uh, ok, we give up. It must be a normal finite number.
2272 Return 0,1,sign,0.
2274 /* vex_printf("normal\n"); */
2275 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
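/* Summary of the (C3,C2,C1,C0) encodings produced above:
     Empty     1,0,sign,1
     Zero      1,0,sign,0
     Denormal  1,1,sign,0
     Infinity  0,1,sign,1
     NaN       0,0,sign,1
     Normal    0,1,sign,0
*/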
2279 /* This is used to implement both 'frstor' and 'fldenv'. The latter
2280 appears to differ from the former only in that the 8 FP registers
2281 themselves are not transferred into the guest state. */
2282 static
2283 VexEmNote do_put_x87 ( Bool moveRegs,
2284 /*IN*/Fpu_State* x87_state,
2285 /*OUT*/VexGuestAMD64State* vex_state )
2287 Int stno, preg;
2288 UInt tag;
2289 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2290 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2291 UInt ftop = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2292 UInt tagw = x87_state->env[FP_ENV_TAG];
2293 UInt fpucw = x87_state->env[FP_ENV_CTRL];
2294 UInt c3210 = x87_state->env[FP_ENV_STAT] & 0x4700;
2295 VexEmNote ew;
2296 UInt fpround;
2297 ULong pair;
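   /* The x87 tag word holds two bits per physical register; the only
      encoding that matters here is 3 (empty).  Everything else
      (valid, zero or special) is treated as "in use": it maps to the
      one-byte VEX tag value 1, while empty maps to 0. */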
2299 /* Copy registers and tags */
2300 for (stno = 0; stno < 8; stno++) {
2301 preg = (stno + ftop) & 7;
2302 tag = (tagw >> (2*preg)) & 3;
2303 if (tag == 3) {
2304 /* register is empty */
2305 /* hmm, if it's empty, does it still get written? Probably
2306 safer to say it does. If we don't, memcheck could get out
2307 of sync, in that it thinks all FP registers are defined by
2308 this helper, but in reality some have not been updated. */
2309 if (moveRegs)
2310 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2311 vexTags[preg] = 0;
2312 } else {
2313 /* register is non-empty */
2314 if (moveRegs)
2315 convert_f80le_to_f64le( &x87_state->reg[10*stno],
2316 (UChar*)&vexRegs[preg] );
2317 vexTags[preg] = 1;
2321 /* stack pointer */
2322 vex_state->guest_FTOP = ftop;
2324 /* status word */
2325 vex_state->guest_FC3210 = c3210;
2327 /* handle the control word, setting FPROUND and detecting any
2328 emulation warnings. */
2329 pair = amd64g_check_fldcw ( (ULong)fpucw );
2330 fpround = (UInt)pair & 0xFFFFFFFFULL;
2331 ew = (VexEmNote)(pair >> 32);
2333 vex_state->guest_FPROUND = fpround & 3;
2335 /* emulation warnings --> caller */
2336 return ew;
2340 /* Create an x87 FPU state from the guest state, as close as
2341 we can approximate it. */
2342 static
2343 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2344 /*OUT*/Fpu_State* x87_state )
2346 Int i, stno, preg;
2347 UInt tagw;
2348 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2349 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2350 UInt ftop = vex_state->guest_FTOP;
2351 UInt c3210 = vex_state->guest_FC3210;
2353 for (i = 0; i < 14; i++)
2354 x87_state->env[i] = 0;
2356 x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2357 = x87_state->env[13] = 0xFFFF;
2358 x87_state->env[FP_ENV_STAT]
2359 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2360 x87_state->env[FP_ENV_CTRL]
2361 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2363 /* Dump the register stack in ST order. */
2364 tagw = 0;
2365 for (stno = 0; stno < 8; stno++) {
2366 preg = (stno + ftop) & 7;
2367 if (vexTags[preg] == 0) {
2368 /* register is empty */
2369 tagw |= (3 << (2*preg));
2370 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2371 &x87_state->reg[10*stno] );
2372 } else {
2373 /* register is full. */
2374 tagw |= (0 << (2*preg));
2375 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2376 &x87_state->reg[10*stno] );
2379 x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2383 /*---------------------------------------------------------------*/
2384 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2385 /*---------------------------------------------------------------*/
2387 /* CALLED FROM GENERATED CODE */
2388 /* DIRTY HELPER (reads guest state, writes guest mem) */
2389 /* XSAVE component 0 is the x87 FPU state. */
2390 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2391 ( VexGuestAMD64State* gst, HWord addr )
2393 /* Derived from values obtained from
2394 vendor_id : AuthenticAMD
2395 cpu family : 15
2396 model : 12
2397 model name : AMD Athlon(tm) 64 Processor 3200+
2398 stepping : 0
2399 cpu MHz : 2200.000
2400 cache size : 512 KB
2402 /* Somewhat roundabout, but at least it's simple. */
2403 Fpu_State tmp;
2404 UShort* addrS = (UShort*)addr;
2405 UChar* addrC = (UChar*)addr;
2406 UShort fp_tags;
2407 UInt summary_tags;
2408 Int r, stno;
2409 UShort *srcS, *dstS;
2411 do_get_x87( gst, &tmp );
2413 /* Now build the proper fxsave x87 image from the fsave x87 image
2414 we just made. */
2416 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2419 /* set addrS[2] in an endian-independent way */
2420 summary_tags = 0;
2421 fp_tags = tmp.env[FP_ENV_TAG];
2422 for (r = 0; r < 8; r++) {
2423 if ( ((fp_tags >> (2*r)) & 3) != 3 )
2424 summary_tags |= (1 << r);
2426 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
2427 addrC[5] = 0; /* pad */
2429 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2430 does not write this field. (?!) */
2431 addrS[3] = 0; /* BOGUS */
2433 /* RIP (Last x87 instruction pointer). From experimentation, the
2434 real CPU does not write this field. (?!) */
2435 addrS[4] = 0; /* BOGUS */
2436 addrS[5] = 0; /* BOGUS */
2437 addrS[6] = 0; /* BOGUS */
2438 addrS[7] = 0; /* BOGUS */
2440 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2441 does not write this field. (?!) */
2442 addrS[8] = 0; /* BOGUS */
2443 addrS[9] = 0; /* BOGUS */
2444 addrS[10] = 0; /* BOGUS */
2445 addrS[11] = 0; /* BOGUS */
2447 /* addrS[13,12] are MXCSR -- not written */
2448 /* addrS[15,14] are MXCSR_MASK -- not written */
2450 /* Copy in the FP registers, in ST order. */
2451 for (stno = 0; stno < 8; stno++) {
2452 srcS = (UShort*)(&tmp.reg[10*stno]);
2453 dstS = (UShort*)(&addrS[16 + 8*stno]);
2454 dstS[0] = srcS[0];
2455 dstS[1] = srcS[1];
2456 dstS[2] = srcS[2];
2457 dstS[3] = srcS[3];
2458 dstS[4] = srcS[4];
2459 dstS[5] = 0;
2460 dstS[6] = 0;
2461 dstS[7] = 0;
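   /* Each ST slot in the fxsave image is 16 bytes: the 10-byte 80-bit
      value in the low-order bytes, padded out with 6 bytes of zeroes. */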
2466 /* CALLED FROM GENERATED CODE */
2467 /* DIRTY HELPER (reads guest state, writes guest mem) */
2468 /* XSAVE component 1 is the SSE state. */
2469 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2470 ( VexGuestAMD64State* gst, HWord addr )
2472 UShort* addrS = (UShort*)addr;
2473 UInt mxcsr;
2475 /* The only non-register parts of the SSE state are MXCSR and
2476 MXCSR_MASK. */
2477 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2479 addrS[12] = toUShort(mxcsr); /* MXCSR */
2480 addrS[13] = toUShort(mxcsr >> 16);
2482 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2483 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2487 /* VISIBLE TO LIBVEX CLIENT */
2488 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2489 the result at the given address which represents a buffer of at
2490 least 416 bytes.
2492 This function is not called from generated code. FXSAVE is dealt
2493 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2494 functions above plus some in-line IR. This function is merely a
2495 convenience function for VEX's users.
2497 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2498 /*OUT*/HWord fp_state )
2500 /* Do the x87 part */
2501 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2503 /* And now the SSE part, except for the registers themselves. */
2504 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2506 /* That's the first 160 bytes of the image done. */
2507 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2508 big-endian, these need to be byte-swapped. */
2509 U128 *xmm = (U128 *)(fp_state + 160);
2510 vassert(host_is_little_endian());
2512 # define COPY_U128(_dst,_src) \
2513 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2514 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2515 while (0)
2517 COPY_U128( xmm[0], gst->guest_YMM0 );
2518 COPY_U128( xmm[1], gst->guest_YMM1 );
2519 COPY_U128( xmm[2], gst->guest_YMM2 );
2520 COPY_U128( xmm[3], gst->guest_YMM3 );
2521 COPY_U128( xmm[4], gst->guest_YMM4 );
2522 COPY_U128( xmm[5], gst->guest_YMM5 );
2523 COPY_U128( xmm[6], gst->guest_YMM6 );
2524 COPY_U128( xmm[7], gst->guest_YMM7 );
2525 COPY_U128( xmm[8], gst->guest_YMM8 );
2526 COPY_U128( xmm[9], gst->guest_YMM9 );
2527 COPY_U128( xmm[10], gst->guest_YMM10 );
2528 COPY_U128( xmm[11], gst->guest_YMM11 );
2529 COPY_U128( xmm[12], gst->guest_YMM12 );
2530 COPY_U128( xmm[13], gst->guest_YMM13 );
2531 COPY_U128( xmm[14], gst->guest_YMM14 );
2532 COPY_U128( xmm[15], gst->guest_YMM15 );
2533 # undef COPY_U128
2537 /*---------------------------------------------------------------*/
2538 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2539 /*---------------------------------------------------------------*/
2541 /* CALLED FROM GENERATED CODE */
2542 /* DIRTY HELPER (writes guest state, reads guest mem) */
2543 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2544 ( VexGuestAMD64State* gst, HWord addr )
2546 Fpu_State tmp;
2547 UShort* addrS = (UShort*)addr;
2548 UChar* addrC = (UChar*)addr;
2549 UShort fp_tags;
2550 Int r, stno, i;
2552 /* Copy the x87 registers out of the image, into a temporary
2553 Fpu_State struct. */
2554 for (i = 0; i < 14; i++) tmp.env[i] = 0;
2555 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2556 /* fill in tmp.reg[0..7] */
2557 for (stno = 0; stno < 8; stno++) {
2558 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2559 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2560 dstS[0] = srcS[0];
2561 dstS[1] = srcS[1];
2562 dstS[2] = srcS[2];
2563 dstS[3] = srcS[3];
2564 dstS[4] = srcS[4];
2566 /* fill in tmp.env[0..13] */
2567 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2570 fp_tags = 0;
2571 for (r = 0; r < 8; r++) {
      if (addrC[4] & (1<<r))
         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
      else
         fp_tags |= (3 << (2*r)); /* EMPTY */
2577 tmp.env[FP_ENV_TAG] = fp_tags;
2579 /* Now write 'tmp' into the guest state. */
2580 VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2582 return warnX87;
2586 /* CALLED FROM GENERATED CODE */
2587 /* DIRTY HELPER (writes guest state, reads guest mem) */
2588 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2589 ( VexGuestAMD64State* gst, HWord addr )
2591 UShort* addrS = (UShort*)addr;
2592 UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
2593 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2594 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
2596 VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2598 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2599 return warnXMM;
2603 /* VISIBLE TO LIBVEX CLIENT */
/* Do FXRSTOR from the supplied address, loading the values read into
   the given VexGuestAMD64State structure.
2607 This function is not called from generated code. FXRSTOR is dealt
2608 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2609 functions above plus some in-line IR. This function is merely a
2610 convenience function for VEX's users.
2612 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2613 /*MOD*/VexGuestAMD64State* gst )
2615 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2616 to be byte-swapped. */
2617 U128 *xmm = (U128 *)(fp_state + 160);
2619 vassert(host_is_little_endian());
2621 # define COPY_U128(_dst,_src) \
2622 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2623 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2624 while (0)
2626 COPY_U128( gst->guest_YMM0, xmm[0] );
2627 COPY_U128( gst->guest_YMM1, xmm[1] );
2628 COPY_U128( gst->guest_YMM2, xmm[2] );
2629 COPY_U128( gst->guest_YMM3, xmm[3] );
2630 COPY_U128( gst->guest_YMM4, xmm[4] );
2631 COPY_U128( gst->guest_YMM5, xmm[5] );
2632 COPY_U128( gst->guest_YMM6, xmm[6] );
2633 COPY_U128( gst->guest_YMM7, xmm[7] );
2634 COPY_U128( gst->guest_YMM8, xmm[8] );
2635 COPY_U128( gst->guest_YMM9, xmm[9] );
2636 COPY_U128( gst->guest_YMM10, xmm[10] );
2637 COPY_U128( gst->guest_YMM11, xmm[11] );
2638 COPY_U128( gst->guest_YMM12, xmm[12] );
2639 COPY_U128( gst->guest_YMM13, xmm[13] );
2640 COPY_U128( gst->guest_YMM14, xmm[14] );
2641 COPY_U128( gst->guest_YMM15, xmm[15] );
2643 # undef COPY_U128
2645 VexEmNote warnXMM
2646 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2647 VexEmNote warnX87
2648 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2650 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2651 if (warnX87 != EmNote_NONE)
2652 return warnX87;
2653 else
2654 return warnXMM;
2658 /*---------------------------------------------------------------*/
2659 /*--- Supporting functions for FSAVE/FRSTOR ---*/
2660 /*---------------------------------------------------------------*/
2662 /* DIRTY HELPER (writes guest state) */
2663 /* Initialise the x87 FPU state as per 'finit'. */
2664 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2666 Int i;
2667 gst->guest_FTOP = 0;
2668 for (i = 0; i < 8; i++) {
2669 gst->guest_FPTAG[i] = 0; /* empty */
2670 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2672 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2673 gst->guest_FC3210 = 0;
2677 /* CALLED FROM GENERATED CODE */
2678 /* DIRTY HELPER (reads guest memory) */
2679 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2681 ULong f64;
2682 convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2683 return f64;
2686 /* CALLED FROM GENERATED CODE */
2687 /* DIRTY HELPER (writes guest memory) */
2688 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2690 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2694 /* CALLED FROM GENERATED CODE */
2695 /* CLEAN HELPER */
2696 /* mxcsr[15:0] contains a SSE native format MXCSR value.
2697 Extract from it the required SSEROUND value and any resulting
2698 emulation warning, and return (warn << 32) | sseround value.
2700 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2702 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
2703 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2704 ULong rmode = (mxcsr >> 13) & 3;
2706 /* Detect any required emulation warnings. */
2707 VexEmNote ew = EmNote_NONE;
2709 if ((mxcsr & 0x1F80) != 0x1F80) {
2710 /* unmasked exceptions! */
2711 ew = EmWarn_X86_sseExns;
2713 else
2714 if (mxcsr & (1<<15)) {
2715 /* FZ is set */
2716 ew = EmWarn_X86_fz;
2718 else
2719 if (mxcsr & (1<<6)) {
2720 /* DAZ is set */
2721 ew = EmWarn_X86_daz;
2724 return (((ULong)ew) << 32) | ((ULong)rmode);
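/* For example, the default MXCSR value 0x1F80 (all exceptions masked,
   round-to-nearest, FZ=0, DAZ=0) yields rmode 0 (Irrm_NEAREST) and
   EmNote_NONE, hence a return value of zero. */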
2728 /* CALLED FROM GENERATED CODE */
2729 /* CLEAN HELPER */
2730 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2731 native format MXCSR value. */
2732 ULong amd64g_create_mxcsr ( ULong sseround )
2734 sseround &= 3;
2735 return 0x1F80 | (sseround << 13);
2739 /* CLEAN HELPER */
2740 /* fpucw[15:0] contains a x87 native format FPU control word.
2741 Extract from it the required FPROUND value and any resulting
2742 emulation warning, and return (warn << 32) | fpround value.
2744 ULong amd64g_check_fldcw ( ULong fpucw )
2746 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2747 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2748 ULong rmode = (fpucw >> 10) & 3;
2750 /* Detect any required emulation warnings. */
2751 VexEmNote ew = EmNote_NONE;
2753 if ((fpucw & 0x3F) != 0x3F) {
2754 /* unmasked exceptions! */
2755 ew = EmWarn_X86_x87exns;
2757 else
2758 if (((fpucw >> 8) & 3) != 3) {
2759 /* unsupported precision */
2760 ew = EmWarn_X86_x87precision;
2763 return (((ULong)ew) << 32) | ((ULong)rmode);
2767 /* CLEAN HELPER */
2768 /* Given fpround as an IRRoundingMode value, create a suitable x87
2769 native format FPU control word. */
2770 ULong amd64g_create_fpucw ( ULong fpround )
2772 fpround &= 3;
2773 return 0x037F | (fpround << 10);
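/* 0x037F is the control word established by FINIT: all six exception
   bits are masked (low bits 0x3F) and precision control is set to
   extended (bits 9:8 == 3), so feeding the result back through
   amd64g_check_fldcw recovers the same rounding mode with no
   emulation warning. */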
2777 /* This is used to implement 'fldenv'.
2778 Reads 28 bytes at x87_state[0 .. 27]. */
2779 /* CALLED FROM GENERATED CODE */
2780 /* DIRTY HELPER */
2781 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2782 /*IN*/HWord x87_state)
2784 return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2788 /* CALLED FROM GENERATED CODE */
2789 /* DIRTY HELPER */
2790 /* Create an x87 FPU env from the guest state, as close as we can
2791 approximate it. Writes 28 bytes at x87_state[0..27]. */
2792 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2793 /*OUT*/HWord x87_state )
2795 Int i, stno, preg;
2796 UInt tagw;
2797 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2798 Fpu_State* x87 = (Fpu_State*)x87_state;
2799 UInt ftop = vex_state->guest_FTOP;
2800 ULong c3210 = vex_state->guest_FC3210;
2802 for (i = 0; i < 14; i++)
2803 x87->env[i] = 0;
2805 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2806 x87->env[FP_ENV_STAT]
2807 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2808 x87->env[FP_ENV_CTRL]
2809 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2811 /* Compute the x87 tag word. */
2812 tagw = 0;
2813 for (stno = 0; stno < 8; stno++) {
2814 preg = (stno + ftop) & 7;
2815 if (vexTags[preg] == 0) {
2816 /* register is empty */
2817 tagw |= (3 << (2*preg));
2818 } else {
2819 /* register is full. */
2820 tagw |= (0 << (2*preg));
2823 x87->env[FP_ENV_TAG] = toUShort(tagw);
2825 /* We don't dump the x87 registers, tho. */
2829 /* This is used to implement 'fnsave'.
2830 Writes 108 bytes at x87_state[0 .. 107]. */
2831 /* CALLED FROM GENERATED CODE */
2832 /* DIRTY HELPER */
2833 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2834 /*OUT*/HWord x87_state)
2836 do_get_x87( vex_state, (Fpu_State*)x87_state );
2840 /* This is used to implement 'fnsaves'.
2841 Writes 94 bytes at x87_state[0 .. 93]. */
2842 /* CALLED FROM GENERATED CODE */
2843 /* DIRTY HELPER */
2844 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2845 /*OUT*/HWord x87_state)
2847 Int i, stno, preg;
2848 UInt tagw;
2849 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2850 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2851 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2852 UInt ftop = vex_state->guest_FTOP;
2853 UInt c3210 = vex_state->guest_FC3210;
2855 for (i = 0; i < 7; i++)
2856 x87->env[i] = 0;
2858 x87->env[FPS_ENV_STAT]
2859 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2860 x87->env[FPS_ENV_CTRL]
2861 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2863 /* Dump the register stack in ST order. */
2864 tagw = 0;
2865 for (stno = 0; stno < 8; stno++) {
2866 preg = (stno + ftop) & 7;
2867 if (vexTags[preg] == 0) {
2868 /* register is empty */
2869 tagw |= (3 << (2*preg));
2870 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2871 &x87->reg[10*stno] );
2872 } else {
2873 /* register is full. */
2874 tagw |= (0 << (2*preg));
2875 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2876 &x87->reg[10*stno] );
2879 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2883 /* This is used to implement 'frstor'.
2884 Reads 108 bytes at x87_state[0 .. 107]. */
2885 /* CALLED FROM GENERATED CODE */
2886 /* DIRTY HELPER */
2887 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2888 /*IN*/HWord x87_state)
2890 return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2894 /* This is used to implement 'frstors'.
2895 Reads 94 bytes at x87_state[0 .. 93]. */
2896 /* CALLED FROM GENERATED CODE */
2897 /* DIRTY HELPER */
2898 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2899 /*IN*/HWord x87_state)
2901 Int stno, preg;
2902 UInt tag;
2903 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2904 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2905 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2906 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2907 UInt tagw = x87->env[FPS_ENV_TAG];
2908 UInt fpucw = x87->env[FPS_ENV_CTRL];
2909 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2910 VexEmNote ew;
2911 UInt fpround;
2912 ULong pair;
2914 /* Copy registers and tags */
2915 for (stno = 0; stno < 8; stno++) {
2916 preg = (stno + ftop) & 7;
2917 tag = (tagw >> (2*preg)) & 3;
2918 if (tag == 3) {
2919 /* register is empty */
2920 /* hmm, if it's empty, does it still get written? Probably
2921 safer to say it does. If we don't, memcheck could get out
2922 of sync, in that it thinks all FP registers are defined by
2923 this helper, but in reality some have not been updated. */
2924 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2925 vexTags[preg] = 0;
2926 } else {
2927 /* register is non-empty */
2928 convert_f80le_to_f64le( &x87->reg[10*stno],
2929 (UChar*)&vexRegs[preg] );
2930 vexTags[preg] = 1;
2934 /* stack pointer */
2935 vex_state->guest_FTOP = ftop;
2937 /* status word */
2938 vex_state->guest_FC3210 = c3210;
2940 /* handle the control word, setting FPROUND and detecting any
2941 emulation warnings. */
2942 pair = amd64g_check_fldcw ( (ULong)fpucw );
2943 fpround = (UInt)pair & 0xFFFFFFFFULL;
2944 ew = (VexEmNote)(pair >> 32);
2946 vex_state->guest_FPROUND = fpround & 3;
2948 /* emulation warnings --> caller */
2949 return ew;
2953 /*---------------------------------------------------------------*/
2954 /*--- CPUID helpers. ---*/
2955 /*---------------------------------------------------------------*/
2957 /* Claim to be the following CPU, which is probably representative of
2958 the lowliest (earliest) amd64 offerings. It can do neither sse3
2959 nor cx16.
2961 vendor_id : AuthenticAMD
2962 cpu family : 15
2963 model : 5
2964 model name : AMD Opteron (tm) Processor 848
2965 stepping : 10
2966 cpu MHz : 1797.682
2967 cache size : 1024 KB
2968 fpu : yes
2969 fpu_exception : yes
2970 cpuid level : 1
2971 wp : yes
2972 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2973 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2974 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2975 bogomips : 3600.62
2976 TLB size : 1088 4K pages
2977 clflush size : 64
2978 cache_alignment : 64
2979 address sizes : 40 bits physical, 48 bits virtual
2980 power management: ts fid vid ttp
2982 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2983 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2984 and 3dnowext is 80000001.EDX.30.
2986 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2988 # define SET_ABCD(_a,_b,_c,_d) \
2989 do { st->guest_RAX = (ULong)(_a); \
2990 st->guest_RBX = (ULong)(_b); \
2991 st->guest_RCX = (ULong)(_c); \
2992 st->guest_RDX = (ULong)(_d); \
2993 } while (0)
2995 switch (0xFFFFFFFF & st->guest_RAX) {
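      /* Leaf 0 advertises a maximum basic leaf of 1 and returns the
         vendor string "AuthenticAMD" as four little-endian ASCII bytes
         in each of EBX ("Auth"), EDX ("enti") and ECX ("cAMD"). */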
2996 case 0x00000000:
2997 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2998 break;
2999 case 0x00000001:
3000 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
3001 break;
3002 case 0x80000000:
3003 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
3004 break;
3005 case 0x80000001:
3006 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
3007 the original it-is-supported value that the h/w provides.
3008 See #291568. */
3009 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
3010 0x21d3fbff);
3011 break;
3012 case 0x80000002:
3013 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
3014 break;
3015 case 0x80000003:
3016 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
3017 break;
3018 case 0x80000004:
3019 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3020 break;
3021 case 0x80000005:
3022 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
3023 break;
3024 case 0x80000006:
3025 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
3026 break;
3027 case 0x80000007:
3028 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
3029 break;
3030 case 0x80000008:
3031 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
3032 break;
3033 default:
3034 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3035 break;
3037 # undef SET_ABCD
3041 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
3042 capable.
3044 vendor_id : GenuineIntel
3045 cpu family : 6
3046 model : 15
3047 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
3048 stepping : 6
3049 cpu MHz : 2394.000
3050 cache size : 4096 KB
3051 physical id : 0
3052 siblings : 2
3053 core id : 0
3054 cpu cores : 2
3055 fpu : yes
3056 fpu_exception : yes
3057 cpuid level : 10
3058 wp : yes
3059 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3060 mtrr pge mca cmov pat pse36 clflush dts acpi
3061 mmx fxsr sse sse2 ss ht tm syscall nx lm
3062 constant_tsc pni monitor ds_cpl vmx est tm2
3063 cx16 xtpr lahf_lm
3064 bogomips : 4798.78
3065 clflush size : 64
3066 cache_alignment : 64
3067 address sizes : 36 bits physical, 48 bits virtual
3068 power management:
3070 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
3072 # define SET_ABCD(_a,_b,_c,_d) \
3073 do { st->guest_RAX = (ULong)(_a); \
3074 st->guest_RBX = (ULong)(_b); \
3075 st->guest_RCX = (ULong)(_c); \
3076 st->guest_RDX = (ULong)(_d); \
3077 } while (0)
3079 switch (0xFFFFFFFF & st->guest_RAX) {
3080 case 0x00000000:
3081 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
3082 break;
3083 case 0x00000001:
3084 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
3085 break;
3086 case 0x00000002:
3087 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
3088 break;
3089 case 0x00000003:
3090 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3091 break;
3092 case 0x00000004: {
3093 switch (0xFFFFFFFF & st->guest_RCX) {
3094 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
3095 0x0000003f, 0x00000001); break;
3096 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
3097 0x0000003f, 0x00000001); break;
3098 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
3099 0x00000fff, 0x00000001); break;
3100 default: SET_ABCD(0x00000000, 0x00000000,
3101 0x00000000, 0x00000000); break;
3103 break;
3105 case 0x00000005:
3106 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
3107 break;
3108 case 0x00000006:
3109 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
3110 break;
3111 case 0x00000007:
3112 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3113 break;
3114 case 0x00000008:
3115 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
3116 break;
3117 case 0x00000009:
3118 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3119 break;
3120 case 0x0000000a:
3121 unhandled_eax_value:
3122 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
3123 break;
3124 case 0x80000000:
3125 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3126 break;
3127 case 0x80000001:
3128 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
3129 break;
3130 case 0x80000002:
3131 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3132 break;
3133 case 0x80000003:
3134 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
3135 break;
3136 case 0x80000004:
3137 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
3138 break;
3139 case 0x80000005:
3140 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3141 break;
3142 case 0x80000006:
3143 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
3144 break;
3145 case 0x80000007:
3146 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3147 break;
3148 case 0x80000008:
3149 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3150 break;
3151 default:
3152 goto unhandled_eax_value;
3154 # undef SET_ABCD
3158 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
3159 capable.
3161 vendor_id : GenuineIntel
3162 cpu family : 6
3163 model : 37
3164 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
3165 stepping : 2
3166 cpu MHz : 3334.000
3167 cache size : 4096 KB
3168 physical id : 0
3169 siblings : 4
3170 core id : 0
3171 cpu cores : 2
3172 apicid : 0
3173 initial apicid : 0
3174 fpu : yes
3175 fpu_exception : yes
3176 cpuid level : 11
3177 wp : yes
3178 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3179 mtrr pge mca cmov pat pse36 clflush dts acpi
3180 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3181 lm constant_tsc arch_perfmon pebs bts rep_good
3182 xtopology nonstop_tsc aperfmperf pni pclmulqdq
3183 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
3184 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
3185 arat tpr_shadow vnmi flexpriority ept vpid
3186 bogomips : 6957.57
3187 clflush size : 64
3188 cache_alignment : 64
3189 address sizes : 36 bits physical, 48 bits virtual
3190 power management:
3192 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
3194 # define SET_ABCD(_a,_b,_c,_d) \
3195 do { st->guest_RAX = (ULong)(_a); \
3196 st->guest_RBX = (ULong)(_b); \
3197 st->guest_RCX = (ULong)(_c); \
3198 st->guest_RDX = (ULong)(_d); \
3199 } while (0)
3201 UInt old_eax = (UInt)st->guest_RAX;
3202 UInt old_ecx = (UInt)st->guest_RCX;
3204 switch (old_eax) {
3205 case 0x00000000:
3206 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
3207 break;
3208 case 0x00000001:
3209 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
3210 break;
3211 case 0x00000002:
3212 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
3213 break;
3214 case 0x00000003:
3215 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3216 break;
3217 case 0x00000004:
3218 switch (old_ecx) {
3219 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3220 0x0000003f, 0x00000000); break;
3221 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
3222 0x0000007f, 0x00000000); break;
3223 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3224 0x000001ff, 0x00000000); break;
3225 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3226 0x00000fff, 0x00000002); break;
3227 default: SET_ABCD(0x00000000, 0x00000000,
3228 0x00000000, 0x00000000); break;
3230 break;
3231 case 0x00000005:
3232 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3233 break;
3234 case 0x00000006:
3235 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
3236 break;
3237 case 0x00000007:
3238 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3239 break;
3240 case 0x00000008:
3241 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3242 break;
3243 case 0x00000009:
3244 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3245 break;
3246 case 0x0000000a:
3247 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
3248 break;
3249 case 0x0000000b:
3250 switch (old_ecx) {
3251 case 0x00000000:
3252 SET_ABCD(0x00000001, 0x00000002,
3253 0x00000100, 0x00000000); break;
3254 case 0x00000001:
3255 SET_ABCD(0x00000004, 0x00000004,
3256 0x00000201, 0x00000000); break;
3257 default:
3258 SET_ABCD(0x00000000, 0x00000000,
3259 old_ecx, 0x00000000); break;
3261 break;
3262 case 0x0000000c:
3263 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3264 break;
3265 case 0x0000000d:
3266 switch (old_ecx) {
3267 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3268 0x00000100, 0x00000000); break;
3269 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3270 0x00000201, 0x00000000); break;
3271 default: SET_ABCD(0x00000000, 0x00000000,
3272 old_ecx, 0x00000000); break;
3274 break;
3275 case 0x80000000:
3276 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3277 break;
3278 case 0x80000001:
3279 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3280 break;
3281 case 0x80000002:
3282 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3283 break;
3284 case 0x80000003:
3285 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3286 break;
3287 case 0x80000004:
3288 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3289 break;
3290 case 0x80000005:
3291 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3292 break;
3293 case 0x80000006:
3294 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3295 break;
3296 case 0x80000007:
3297 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3298 break;
3299 case 0x80000008:
3300 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3301 break;
3302 default:
3303 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3304 break;
3306 # undef SET_ABCD
3310 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3311 capable. Plus (kludge!) it "supports" HTM.
   Also with the following change: claim that XSaveOpt is not
   available, in that cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared
   to 1 on the real CPU.  Consequently, programs that correctly observe
3316 these CPUID values should only try to use 3 of the 8 XSave-family
3317 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3318 having to implement the compacted or optimised save/restore
3319 variants.
3321 vendor_id : GenuineIntel
3322 cpu family : 6
3323 model : 42
3324 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3325 stepping : 7
3326 cpu MHz : 1600.000
3327 cache size : 6144 KB
3328 physical id : 0
3329 siblings : 4
3330 core id : 3
3331 cpu cores : 4
3332 apicid : 6
3333 initial apicid : 6
3334 fpu : yes
3335 fpu_exception : yes
3336 cpuid level : 13
3337 wp : yes
3338 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3339 mtrr pge mca cmov pat pse36 clflush dts acpi
3340 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3341 lm constant_tsc arch_perfmon pebs bts rep_good
3342 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3343 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3344 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3345 lahf_lm ida arat epb xsaveopt pln pts dts
3346 tpr_shadow vnmi flexpriority ept vpid
3348 bogomips : 5768.94
3349 clflush size : 64
3350 cache_alignment : 64
3351 address sizes : 36 bits physical, 48 bits virtual
3352 power management:
3354 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st,
3355 ULong hasF16C, ULong hasRDRAND,
3356 ULong hasRDSEED )
3358 vassert((hasF16C >> 1) == 0ULL);
3359 vassert((hasRDRAND >> 1) == 0ULL);
3360 # define SET_ABCD(_a,_b,_c,_d) \
3361 do { st->guest_RAX = (ULong)(_a); \
3362 st->guest_RBX = (ULong)(_b); \
3363 st->guest_RCX = (ULong)(_c); \
3364 st->guest_RDX = (ULong)(_d); \
3365 } while (0)
3367 UInt old_eax = (UInt)st->guest_RAX;
3368 UInt old_ecx = (UInt)st->guest_RCX;
3370 switch (old_eax) {
3371 case 0x00000000:
3372 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3373 break;
3374 case 0x00000001: {
3375 // As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30),
3376 // but patch in support for them as directed by the caller.
3377 UInt ecx_extra
3378 = (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0);
3379 SET_ABCD(0x000206a7, 0x00100800, (0x1f9ae3bf | ecx_extra), 0xbfebfbff);
3380 break;
3382 case 0x00000002:
3383 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3384 break;
3385 case 0x00000003:
3386 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3387 break;
3388 case 0x00000004:
3389 switch (old_ecx) {
3390 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3391 0x0000003f, 0x00000000); break;
3392 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3393 0x0000003f, 0x00000000); break;
3394 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3395 0x000001ff, 0x00000000); break;
3396 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3397 0x00001fff, 0x00000006); break;
3398 default: SET_ABCD(0x00000000, 0x00000000,
3399 0x00000000, 0x00000000); break;
3401 break;
3402 case 0x00000005:
3403 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3404 break;
3405 case 0x00000006:
3406 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3407 break;
3408 case 0x00000007: {
3409 UInt ebx_extra = 0;
3410 if (old_ecx == 0)
3411 ebx_extra = hasRDSEED ? (1U << 18) : 0;
3412 SET_ABCD(0x00000000, 0x00000800 | ebx_extra, 0x00000000,
3413 0x00000000);
3414 break;
3416 case 0x00000008:
3417 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3418 break;
3419 case 0x00000009:
3420 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3421 break;
3422 case 0x0000000a:
3423 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3424 break;
3425 case 0x0000000b:
3426 switch (old_ecx) {
3427 case 0x00000000:
3428 SET_ABCD(0x00000001, 0x00000001,
3429 0x00000100, 0x00000000); break;
3430 case 0x00000001:
3431 SET_ABCD(0x00000004, 0x00000004,
3432 0x00000201, 0x00000000); break;
3433 default:
3434 SET_ABCD(0x00000000, 0x00000000,
3435 old_ecx, 0x00000000); break;
3437 break;
3438 case 0x0000000c:
3439 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3440 break;
3441 case 0x0000000d:
3442 switch (old_ecx) {
3443 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3444 0x00000340, 0x00000000); break;
3445 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3446 0x00000000, 0x00000000); break;
3447 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3448 0x00000000, 0x00000000); break;
3449 default: SET_ABCD(0x00000000, 0x00000000,
3450 0x00000000, 0x00000000); break;
3452 break;
3453 case 0x0000000e:
3454 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3455 break;
3456 case 0x0000000f:
3457 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3458 break;
3459 case 0x80000000:
3460 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3461 break;
3462 case 0x80000001:
3463 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3464 break;
3465 case 0x80000002:
3466 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3467 break;
3468 case 0x80000003:
3469 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3470 break;
3471 case 0x80000004:
3472 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3473 break;
3474 case 0x80000005:
3475 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3476 break;
3477 case 0x80000006:
3478 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3479 break;
3480 case 0x80000007:
3481 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3482 break;
3483 case 0x80000008:
3484 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3485 break;
3486 default:
3487 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3488 break;
3490 # undef SET_ABCD
3494 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
   With the following change: claim that XSaveOpt is not available, in
   that cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
   CPU.  Consequently, programs that correctly observe these CPUID
3499 values should only try to use 3 of the 8 XSave-family instructions:
3500 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3501 implement the compacted or optimised save/restore variants.
3503 vendor_id : GenuineIntel
3504 cpu family : 6
3505 model : 60
3506 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3507 stepping : 3
3508 microcode : 0x1c
3509 cpu MHz : 919.957
3510 cache size : 8192 KB
3511 physical id : 0
3512 siblings : 4
3513 core id : 3
3514 cpu cores : 4
3515 apicid : 6
3516 initial apicid : 6
3517 fpu : yes
3518 fpu_exception : yes
3519 cpuid level : 13
3520 wp : yes
3521 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3522 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3523 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3524 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3525 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3526 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3527 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3528 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3529 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3530 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3531 bugs :
3532 bogomips : 5786.68
3533 clflush size : 64
3534 cache_alignment : 64
3535 address sizes : 39 bits physical, 48 bits virtual
3536 power management:
3538 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st,
3539 ULong hasF16C, ULong hasRDRAND,
3540 ULong hasRDSEED )
3542 vassert((hasF16C >> 1) == 0ULL);
3543 vassert((hasRDRAND >> 1) == 0ULL);
3544 # define SET_ABCD(_a,_b,_c,_d) \
3545 do { st->guest_RAX = (ULong)(_a); \
3546 st->guest_RBX = (ULong)(_b); \
3547 st->guest_RCX = (ULong)(_c); \
3548 st->guest_RDX = (ULong)(_d); \
3549 } while (0)
3551 UInt old_eax = (UInt)st->guest_RAX;
3552 UInt old_ecx = (UInt)st->guest_RCX;
3554 switch (old_eax) {
3555 case 0x00000000:
3556 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3557 break;
3558 case 0x00000001: {
3559 // As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30),
3560 // but patch in support for them as directed by the caller.
3561 UInt ecx_extra
3562 = (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0);
3563 SET_ABCD(0x000306c3, 0x02100800, (0x1ffafbff | ecx_extra), 0xbfebfbff);
3564 break;
3566 case 0x00000002:
3567 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3568 break;
3569 case 0x00000003:
3570 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3571 break;
3572 case 0x00000004:
3573 switch (old_ecx) {
3574 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3575 0x0000003f, 0x00000000); break;
3576 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3577 0x0000003f, 0x00000000); break;
3578 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3579 0x000001ff, 0x00000000); break;
3580 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3581 0x00001fff, 0x00000006); break;
3582 default: SET_ABCD(0x00000000, 0x00000000,
3583 0x00000000, 0x00000000); break;
3585 break;
3586 case 0x00000005:
3587 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3588 break;
3589 case 0x00000006:
3590 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3591 break;
3592 case 0x00000007:
3593 switch (old_ecx) {
3594 /* Don't advertise FSGSBASE support, bit 0 in EBX. */
3596 case 0x00000000: {
3597 UInt ebx_extra = hasRDSEED ? (1U << 18) : 0;
3598 SET_ABCD(0x00000000, 0x000027aa | ebx_extra,
3599 0x00000000, 0x00000000); break;
3601 default: SET_ABCD(0x00000000, 0x00000000,
3602 0x00000000, 0x00000000); break;
3604 break;
3605 case 0x00000008:
3606 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3607 break;
3608 case 0x00000009:
3609 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3610 break;
3611 case 0x0000000a:
3612 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3613 break;
3614 case 0x0000000b:
3615 switch (old_ecx) {
3616 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3617 0x00000100, 0x00000002); break;
3618 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3619 0x00000201, 0x00000002); break;
3620 default: SET_ABCD(0x00000000, 0x00000000,
3621 old_ecx, 0x00000002); break;
3623 break;
3624 case 0x0000000c:
3625 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3626 break;
3627 case 0x0000000d:
3628 switch (old_ecx) {
3629 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3630 0x00000340, 0x00000000); break;
3631 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3632 0x00000000, 0x00000000); break;
3633 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3634 0x00000000, 0x00000000); break;
3635 default: SET_ABCD(0x00000000, 0x00000000,
3636 0x00000000, 0x00000000); break;
3638 break;
3639 case 0x80000000:
3640 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3641 break;
3642 case 0x80000001:
3643 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3644 break;
3645 case 0x80000002:
3646 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3647 break;
3648 case 0x80000003:
3649 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3650 break;
3651 case 0x80000004:
3652 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3653 break;
3654 case 0x80000005:
3655 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3656 break;
3657 case 0x80000006:
3658 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3659 break;
3660 case 0x80000007:
3661 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3662 break;
3663 case 0x80000008:
3664 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3665 break;
3666 default:
3667 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3668 break;
3670 # undef SET_ABCD
3674 /*---------------------------------------------------------------*/
3675 /*--- Misc integer helpers, including rotates and crypto. ---*/
3676 /*---------------------------------------------------------------*/
3678 ULong amd64g_calculate_RCR ( ULong arg,
3679 ULong rot_amt,
3680 ULong rflags_in,
3681 Long szIN )
3683 Bool wantRflags = toBool(szIN < 0);
3684 ULong sz = wantRflags ? (-szIN) : szIN;
3685 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3686 ULong cf=0, of=0, tempcf;
3688 switch (sz) {
3689 case 8:
3690 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3691 of = ((arg >> 63) ^ cf) & 1;
3692 while (tempCOUNT > 0) {
3693 tempcf = arg & 1;
3694 arg = (arg >> 1) | (cf << 63);
3695 cf = tempcf;
3696 tempCOUNT--;
3698 break;
3699 case 4:
3700 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3701 of = ((arg >> 31) ^ cf) & 1;
3702 while (tempCOUNT > 0) {
3703 tempcf = arg & 1;
3704 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3705 cf = tempcf;
3706 tempCOUNT--;
3708 break;
3709 case 2:
3710 while (tempCOUNT >= 17) tempCOUNT -= 17;
3711 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3712 of = ((arg >> 15) ^ cf) & 1;
3713 while (tempCOUNT > 0) {
3714 tempcf = arg & 1;
3715 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3716 cf = tempcf;
3717 tempCOUNT--;
3719 break;
3720 case 1:
3721 while (tempCOUNT >= 9) tempCOUNT -= 9;
3722 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3723 of = ((arg >> 7) ^ cf) & 1;
3724 while (tempCOUNT > 0) {
3725 tempcf = arg & 1;
3726 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
3727 cf = tempcf;
3728 tempCOUNT--;
3730 break;
3731 default:
3732 vpanic("calculate_RCR(amd64g): invalid size");
3735 cf &= 1;
3736 of &= 1;
3737 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3738 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3740 /* caller can ask to have back either the resulting flags or
3741 resulting value, but not both */
3742 return wantRflags ? rflags_in : arg;
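/* A minimal sketch of the szIN sign convention described above; the
   example_* wrapper below is purely illustrative and not part of the
   VEX API.  A negative szIN asks for the updated rflags, a positive
   one for the rotated value. */
#if 0
static void example_RCR_szIN_convention ( void )
{
   ULong val_in   = 0x8000000000000001ULL;
   ULong flags_in = AMD64G_CC_MASK_C;             /* CF = 1 on entry */
   /* szIN > 0: get the rotated value back */
   ULong rotated  = amd64g_calculate_RCR(val_in, 1, flags_in, 8);
   /* szIN < 0: get the updated rflags back instead */
   ULong flags    = amd64g_calculate_RCR(val_in, 1, flags_in, -8);
   vassert(rotated == 0xC000000000000000ULL);  /* old CF rotated into bit 63 */
   vassert((flags & AMD64G_CC_MASK_C) != 0);   /* bit 0 of val_in became CF  */
}
#endif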
3745 ULong amd64g_calculate_RCL ( ULong arg,
3746 ULong rot_amt,
3747 ULong rflags_in,
3748 Long szIN )
3750 Bool wantRflags = toBool(szIN < 0);
3751 ULong sz = wantRflags ? (-szIN) : szIN;
3752 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3753 ULong cf=0, of=0, tempcf;
3755 switch (sz) {
3756 case 8:
3757 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3758 while (tempCOUNT > 0) {
3759 tempcf = (arg >> 63) & 1;
3760 arg = (arg << 1) | (cf & 1);
3761 cf = tempcf;
3762 tempCOUNT--;
3764 of = ((arg >> 63) ^ cf) & 1;
3765 break;
3766 case 4:
3767 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3768 while (tempCOUNT > 0) {
3769 tempcf = (arg >> 31) & 1;
3770 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3771 cf = tempcf;
3772 tempCOUNT--;
3774 of = ((arg >> 31) ^ cf) & 1;
3775 break;
3776 case 2:
3777 while (tempCOUNT >= 17) tempCOUNT -= 17;
3778 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3779 while (tempCOUNT > 0) {
3780 tempcf = (arg >> 15) & 1;
3781 arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
3782 cf = tempcf;
3783 tempCOUNT--;
3785 of = ((arg >> 15) ^ cf) & 1;
3786 break;
3787 case 1:
3788 while (tempCOUNT >= 9) tempCOUNT -= 9;
3789 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3790 while (tempCOUNT > 0) {
3791 tempcf = (arg >> 7) & 1;
3792 arg = 0xFFULL & ((arg << 1) | (cf & 1));
3793 cf = tempcf;
3794 tempCOUNT--;
3796 of = ((arg >> 7) ^ cf) & 1;
3797 break;
3798 default:
3799 vpanic("calculate_RCL(amd64g): invalid size");
3802 cf &= 1;
3803 of &= 1;
3804 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3805 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3807 return wantRflags ? rflags_in : arg;
3810 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3811 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3813 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3815 ULong hi, lo, tmp, A[16];
3817 A[0] = 0; A[1] = a;
3818 A[2] = A[1] << 1; A[3] = A[2] ^ a;
3819 A[4] = A[2] << 1; A[5] = A[4] ^ a;
3820 A[6] = A[3] << 1; A[7] = A[6] ^ a;
3821 A[8] = A[4] << 1; A[9] = A[8] ^ a;
3822 A[10] = A[5] << 1; A[11] = A[10] ^ a;
3823 A[12] = A[6] << 1; A[13] = A[12] ^ a;
3824 A[14] = A[7] << 1; A[15] = A[14] ^ a;
3826 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3827 hi = lo >> 56;
3828 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3829 hi = (hi << 8) | (lo >> 56);
3830 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3831 hi = (hi << 8) | (lo >> 56);
3832 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3833 hi = (hi << 8) | (lo >> 56);
3834 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3835 hi = (hi << 8) | (lo >> 56);
3836 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3837 hi = (hi << 8) | (lo >> 56);
3838 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3839 hi = (hi << 8) | (lo >> 56);
3840 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3842 ULong m0 = -1;
3843 m0 /= 255;
3844 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3845 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3846 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3847 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3848 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3849 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3850 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3852 return which ? hi : lo;
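/* A small worked example of the 'which' parameter (the example_* function
   is illustrative only and not part of VEX).  In GF(2)[x],
   (x+1)*(x+1) = x^2+1, i.e. 0b11 carry-less-times 0b11 = 0b101. */
#if 0
static void example_pclmul ( void )
{
   vassert(amd64g_calculate_pclmul(3, 3, /*which=*/0) == 5);  /* low 64 bits  */
   vassert(amd64g_calculate_pclmul(3, 3, /*which=*/1) == 0);  /* high 64 bits */
}
#endif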
3856 /* CALLED FROM GENERATED CODE */
3857 /* DIRTY HELPER (non-referentially-transparent) */
3858 /* Horrible hack. On non-amd64 platforms, return 1. */
3859 ULong amd64g_dirtyhelper_RDTSC ( void )
3861 # if defined(__x86_64__)
3862 UInt eax, edx;
3863 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3864 return (((ULong)edx) << 32) | ((ULong)eax);
3865 # else
3866 return 1ULL;
3867 # endif
3870 /* CALLED FROM GENERATED CODE */
3871 /* DIRTY HELPER (non-referentially-transparent) */
3872 /* Horrible hack. On non-amd64 platforms, do nothing. */

3873 /* This uses a different calling convention from _RDTSC just above
3874 only because of the difficulty of returning 96 bits from a C
3875 function -- RDTSC returns 64 bits and so, on amd64, is simple by
3876 comparison. */
3877 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3879 # if defined(__x86_64__)
3880 UInt eax, ecx, edx;
3881 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3882 st->guest_RAX = (ULong)eax;
3883 st->guest_RCX = (ULong)ecx;
3884 st->guest_RDX = (ULong)edx;
3885 # else
3886 /* Do nothing. */
3887 # endif
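/* Illustrative sketch only (the example_* function is not part of VEX):
   because the helper deposits EDX:EAX (and ECX) straight into the guest
   state instead of returning them, a caller reassembles the 64-bit TSC
   from guest_RDX:guest_RAX afterwards. */
#if 0
static ULong example_read_tsc_via_guest_state ( VexGuestAMD64State* st )
{
   amd64g_dirtyhelper_RDTSCP(st);
   return (st->guest_RDX << 32) | (st->guest_RAX & 0xFFFFFFFFULL);
}
#endif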
3890 /* CALLED FROM GENERATED CODE */
3891 /* DIRTY HELPER (non-referentially-transparent) */
3892 /* Horrible hack. On non-amd64 platforms, return 0. */
3893 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3895 # if defined(__x86_64__)
3896 ULong r = 0;
3897 portno &= 0xFFFF;
3898 switch (sz) {
3899 case 4:
3900 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3901 : "=a" (r) : "Nd" (portno));
3902 break;
3903 case 2:
3904 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3905 : "=a" (r) : "Nd" (portno));
3906 break;
3907 case 1:
3908 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3909 : "=a" (r) : "Nd" (portno));
3910 break;
3911 default:
3912 break; /* note: no 64-bit version of insn exists */
3914 return r;
3915 # else
3916 return 0;
3917 # endif
3921 /* CALLED FROM GENERATED CODE */
3922 /* DIRTY HELPER (non-referentially-transparent) */
3923 /* Horrible hack. On non-amd64 platforms, do nothing. */
3924 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3926 # if defined(__x86_64__)
3927 portno &= 0xFFFF;
3928 switch (sz) {
3929 case 4:
3930 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3931 : : "a" (data), "Nd" (portno));
3932 break;
3933 case 2:
3934 __asm__ __volatile__("outw %w0, %w1"
3935 : : "a" (data), "Nd" (portno));
3936 break;
3937 case 1:
3938 __asm__ __volatile__("outb %b0, %w1"
3939 : : "a" (data), "Nd" (portno));
3940 break;
3941 default:
3942 break; /* note: no 64-bit version of insn exists */
3944 # else
3945 /* do nothing */
3946 # endif
3949 /* CALLED FROM GENERATED CODE */
3950 /* DIRTY HELPER (non-referentially-transparent) */
3951 /* Horrible hack. On non-amd64 platforms, just zero out the result buffer. */
3952 /* op = 0: call the native SGDT instruction.
3953 op = 1: call the native SIDT instruction.
3955 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3956 # if defined(__x86_64__)
3957 switch (op) {
3958 case 0:
3959 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3960 break;
3961 case 1:
3962 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3963 break;
3964 default:
3965 vpanic("amd64g_dirtyhelper_SxDT");
3967 # else
3968 /* nothing sensible to do; hand back an all-zero descriptor image */
3969 UChar* p = (UChar*)address;
3970 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3971 p[6] = p[7] = p[8] = p[9] = 0;
3972 # endif
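/* Usage sketch (illustrative only; example_SxDT is not part of VEX): in
   64-bit mode SGDT/SIDT store a 10-byte image -- a 2-byte limit followed
   by an 8-byte base -- so the caller must supply at least 10 bytes. */
#if 0
static void example_SxDT ( void )
{
   UChar gdtr[10];
   UChar idtr[10];
   amd64g_dirtyhelper_SxDT(gdtr, 0);   /* op 0: SGDT */
   amd64g_dirtyhelper_SxDT(idtr, 1);   /* op 1: SIDT */
}
#endif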
3975 /* CALLED FROM GENERATED CODE */
3976 /* DIRTY HELPER (non-referentially-transparent) */
3977 /* Horrible hack. On non-amd64 platforms, return a dummy "success, all
3978    zeroes" value. On amd64 targets, get a 32 bit random number using RDRAND,
3979    and return it together with the associated rflags.C value. */
3980 ULong amd64g_dirtyhelper_RDRAND ( void ) {
3981 # if defined(__x86_64__)
3982 ULong res = 0;
3983 ULong cflag = 0;
3984 __asm__ __volatile__(
3985 "movq $0, %%r11 ; "
3986 "movq $0, %%r12 ; "
3987 "rdrand %%r11d ; "
3988 "setc %%r12b ; "
3989 "movq %%r11, %0 ; "
3990 "movq %%r12, %1"
3991 : "=r"(res), "=r"(cflag) : : "r11", "r12"
3993 res &= 0xFFFFFFFFULL;
3994 cflag &= 1ULL;
3995 return (cflag << 32) | res;
3996 # else
3997 /* There's nothing we can sensibly do. Return a value denoting
3998 "I succeeded, and the random bits are all zero" :-/ */
3999 return 1ULL << 32;
4000 # endif
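/* Sketch of how the packed result is unpicked (illustrative only; the
   example_* function is not part of VEX): bit 32 carries the rflags.C
   success flag and bits 31:0 carry the random value.
   amd64g_dirtyhelper_RDSEED below packs its result the same way. */
#if 0
static void example_unpack_RDRAND ( void )
{
   ULong packed = amd64g_dirtyhelper_RDRAND();
   ULong cf     = (packed >> 32) & 1;              /* success flag (C) */
   UInt  rnd    = (UInt)(packed & 0xFFFFFFFFULL);  /* the random bits  */
   (void)cf; (void)rnd;
}
#endif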
4003 ULong amd64g_dirtyhelper_RDSEED ( void ) {
4004 # if defined(__x86_64__)
4005 ULong res = 0;
4006 ULong cflag = 0;
4007 __asm__ __volatile__(
4008 "movq $0, %%r11 ; "
4009 "movq $0, %%r12 ; "
4010 "rdseed %%r11d ; "
4011 "setc %%r12b ; "
4012 "movq %%r11, %0 ; "
4013 "movq %%r12, %1"
4014 : "=r"(res), "=r"(cflag) : : "r11", "r12"
4016 res &= 0xFFFFFFFFULL;
4017 cflag &= 1ULL;
4018 return (cflag << 32) | res;
4019 # else
4020 /* There's nothing we can sensibly do. Return a value denoting
4021 "I succeeded, and the random bits are all zero" :-/ */
4022 return 1ULL << 32;
4023 # endif
4026 /*---------------------------------------------------------------*/
4027 /*--- Helpers for MMX/SSE/SSE2. ---*/
4028 /*---------------------------------------------------------------*/
4030 static inline UChar abdU8 ( UChar xx, UChar yy ) {
4031 return toUChar(xx>yy ? xx-yy : yy-xx);
4034 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
4035 return (((ULong)w1) << 32) | ((ULong)w0);
4038 static inline UShort sel16x4_3 ( ULong w64 ) {
4039 UInt hi32 = toUInt(w64 >> 32);
4040 return toUShort(hi32 >> 16);
4042 static inline UShort sel16x4_2 ( ULong w64 ) {
4043 UInt hi32 = toUInt(w64 >> 32);
4044 return toUShort(hi32);
4046 static inline UShort sel16x4_1 ( ULong w64 ) {
4047 UInt lo32 = toUInt(w64);
4048 return toUShort(lo32 >> 16);
4050 static inline UShort sel16x4_0 ( ULong w64 ) {
4051 UInt lo32 = toUInt(w64);
4052 return toUShort(lo32);
4055 static inline UChar sel8x8_7 ( ULong w64 ) {
4056 UInt hi32 = toUInt(w64 >> 32);
4057 return toUChar(hi32 >> 24);
4059 static inline UChar sel8x8_6 ( ULong w64 ) {
4060 UInt hi32 = toUInt(w64 >> 32);
4061 return toUChar(hi32 >> 16);
4063 static inline UChar sel8x8_5 ( ULong w64 ) {
4064 UInt hi32 = toUInt(w64 >> 32);
4065 return toUChar(hi32 >> 8);
4067 static inline UChar sel8x8_4 ( ULong w64 ) {
4068 UInt hi32 = toUInt(w64 >> 32);
4069 return toUChar(hi32 >> 0);
4071 static inline UChar sel8x8_3 ( ULong w64 ) {
4072 UInt lo32 = toUInt(w64);
4073 return toUChar(lo32 >> 24);
4075 static inline UChar sel8x8_2 ( ULong w64 ) {
4076 UInt lo32 = toUInt(w64);
4077 return toUChar(lo32 >> 16);
4079 static inline UChar sel8x8_1 ( ULong w64 ) {
4080 UInt lo32 = toUInt(w64);
4081 return toUChar(lo32 >> 8);
4083 static inline UChar sel8x8_0 ( ULong w64 ) {
4084 UInt lo32 = toUInt(w64);
4085 return toUChar(lo32 >> 0);
4088 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4089 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
4091 return
4092 mk32x2(
4093 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
4094 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
4095 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
4096 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
4100 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4101 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
4103 UInt t = 0;
4104 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
4105 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
4106 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
4107 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
4108 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
4109 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
4110 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
4111 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
4112 t &= 0xFFFF;
4113 return (ULong)t;
4116 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4117 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
4119 UShort t, min;
4120 UInt idx;
4121 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
4122 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
4123 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
4124 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
4125 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
4126 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
4127 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
4128 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
4129 return ((ULong)(idx << 16)) | ((ULong)min);
4132 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4133 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
4135 UInt i;
4136 ULong crc = (b & 0xFFULL) ^ crcIn;
4137 for (i = 0; i < 8; i++)
4138 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4139 return crc;
4142 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4143 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
4145 UInt i;
4146 ULong crc = (w & 0xFFFFULL) ^ crcIn;
4147 for (i = 0; i < 16; i++)
4148 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4149 return crc;
4152 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4153 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
4155 UInt i;
4156 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
4157 for (i = 0; i < 32; i++)
4158 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4159 return crc;
4162 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4163 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
4165 ULong crc = amd64g_calc_crc32l(crcIn, q);
4166 return amd64g_calc_crc32l(crc, q >> 32);
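/* Usage sketch (illustrative only; example_crc32c_of_buffer is not part of
   VEX): like the CRC32 instruction itself, these helpers do no pre- or
   post-inversion, so a caller computing a conventional CRC-32C checksum
   typically starts from all-ones and inverts the final value. */
#if 0
static UInt example_crc32c_of_buffer ( const UChar* buf, ULong len )
{
   ULong crc = 0xFFFFFFFFULL;
   ULong i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return (UInt)(crc ^ 0xFFFFFFFFULL);
}
#endif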
4170 /* .. helper for next fn .. */
4171 static inline ULong sad_8x4 ( ULong xx, ULong yy )
4173 UInt t = 0;
4174 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
4175 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
4176 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
4177 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
4178 return (ULong)t;
4181 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4182 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
4183 ULong dHi, ULong dLo,
4184 ULong imm_and_return_control_bit )
4186 UInt imm8 = imm_and_return_control_bit & 7;
4187 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
4188 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
4189 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
4190 /* For src we only need 32 bits, so get them into the
4191 lower half of a 64 bit word. */
4192 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
4193 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
4194 11 bytes. If calculating the low part of the result, need bytes
4195 dstOffsL * 4 + (0 .. 6); if calculating the high part,
4196 dstOffsL * 4 + (4 .. 10). */
4197 ULong dst;
4198 /* dstOffL = 0, Lo -> 0 .. 6
4199 dstOffL = 1, Lo -> 4 .. 10
4200 dstOffL = 0, Hi -> 4 .. 10
4201 dstOffL = 1, Hi -> 8 .. 14
4203 if (calcHi && dstOffsL) {
4204 /* 8 .. 14 */
4205 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
4207 else if (!calcHi && !dstOffsL) {
4208 /* 0 .. 6 */
4209 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
4211 else {
4212 /* 4 .. 10 */
4213 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
4215 ULong r0 = sad_8x4( dst >> 0, src );
4216 ULong r1 = sad_8x4( dst >> 8, src );
4217 ULong r2 = sad_8x4( dst >> 16, src );
4218 ULong r3 = sad_8x4( dst >> 24, src );
4219 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
4220 return res;
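/* A worked instance of the window/offset selection described inside
   amd64g_calc_mpsadbw above (illustrative only; example_mpsadbw is not part
   of VEX).  With imm8 = 0 the source window is the low 32 bits of sLo and
   the destination window starts at byte 0 of dLo, so a matching first
   window gives r0 = 0 and the sums grow as the window slides. */
#if 0
static void example_mpsadbw ( void )
{
   ULong res = amd64g_calc_mpsadbw(0, 0x04030201ULL, 0, 0x04030201ULL, 0);
   /* r0 = 0, r1 = 7, r2 = 11, r3 = 12, packed as 16-bit lanes */
   vassert(res == 0x000C000B00070000ULL);
}
#endif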
4223 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4224 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
4226 ULong dst = 0;
4227 ULong src_bit;
4228 ULong dst_bit = 1;
4229 for (src_bit = 1; src_bit; src_bit <<= 1) {
4230 if (mask & src_bit) {
4231 if (src_masked & src_bit) dst |= dst_bit;
4232 dst_bit <<= 1;
4235 return dst;
4238 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4239 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
4241 ULong dst = 0;
4242 ULong dst_bit;
4243 ULong src_bit = 1;
4244 for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
4245 if (mask & dst_bit) {
4246 if (src & src_bit) dst |= dst_bit;
4247 src_bit <<= 1;
4250 return dst;
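/* Worked example of the two bit-manipulation helpers (illustrative only;
   example_pdep_pext is not part of VEX).  With mask = 0xCC (bits 2,3,6,7),
   PDEP scatters the low source bits into the mask positions and PEXT
   gathers them back; note amd64g_calculate_pext expects its first argument
   to have been ANDed with the mask already. */
#if 0
static void example_pdep_pext ( void )
{
   vassert(amd64g_calculate_pdep(0xB,  0xCC) == 0x8C);
   vassert(amd64g_calculate_pext(0x8C, 0xCC) == 0xB);
}
#endif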
4253 /*---------------------------------------------------------------*/
4254 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
4255 /*---------------------------------------------------------------*/
4257 static UInt zmask_from_V128 ( V128* arg )
4259 UInt i, res = 0;
4260 for (i = 0; i < 16; i++) {
4261 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
4263 return res;
4266 static UInt zmask_from_V128_wide ( V128* arg )
4268 UInt i, res = 0;
4269 for (i = 0; i < 8; i++) {
4270 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
4272 return res;
4275 /* Helps with PCMP{I,E}STR{I,M}.
4277 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
4278 actually it could be a clean helper, but for the fact that we can't
4279 pass by value 2 x V128 to a clean helper, nor have one returned.)
4280 Reads guest state, writes to guest state for the xSTRM cases, no
4281 accesses of memory, and is otherwise a pure function.
4283 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
4284 the callee knows which I/E and I/M variant it is dealing with and
4285 what the specific operation is. 4th byte of opcode is in the range
4286 0x60 to 0x63:
4287 istri 66 0F 3A 63
4288 istrm 66 0F 3A 62
4289 estri 66 0F 3A 61
4290 estrm 66 0F 3A 60
4292 gstOffL and gstOffR are the guest state offsets for the two XMM
4293 register inputs. We never have to deal with the memory case since
4294 that is handled by pre-loading the relevant value into the fake
4295 XMM16 register.
4297 For ESTRx variants, edxIN and eaxIN hold the values of those two
4298 registers.
4300 In all cases, the bottom 16 bits of the result contain the new
4301 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
4302 result hold the new %ecx value. For xSTRM variants, the helper
4303 writes the result directly to the guest XMM0.
4305 Declarable side effects: in all cases, reads guest state at
4306 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
4307 guest_XMM0.
4309 Is expected to be called with opc_and_imm combinations which have
4310 actually been validated, and will assert if otherwise. The front
4311 end should ensure we're only called with verified values.
4313 ULong amd64g_dirtyhelper_PCMPxSTRx (
4314 VexGuestAMD64State* gst,
4315 HWord opc4_and_imm,
4316 HWord gstOffL, HWord gstOffR,
4317 HWord edxIN, HWord eaxIN
4320 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
4321 HWord imm8 = opc4_and_imm & 0xFF;
4322 HWord isISTRx = opc4 & 2;
4323 HWord isxSTRM = (opc4 & 1) ^ 1;
4324 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
4325 HWord wide = (imm8 & 1);
4327 // where the args are
4328 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4329 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4331 /* Create the arg validity masks, either from the vectors
4332 themselves or from the supplied edx/eax values. */
4333 // FIXME: this is only right for the 8-bit data cases.
4334 // At least that is asserted above.
4335 UInt zmaskL, zmaskR;
4337 // temp spot for the resulting flags and vector.
4338 V128 resV;
4339 UInt resOSZACP;
4341 // for checking whether case was handled
4342 Bool ok = False;
4344 if (wide) {
4345 if (isISTRx) {
4346 zmaskL = zmask_from_V128_wide(argL);
4347 zmaskR = zmask_from_V128_wide(argR);
4348 } else {
4349 Int tmp;
4350 tmp = edxIN & 0xFFFFFFFF;
4351 if (tmp < -8) tmp = -8;
4352 if (tmp > 8) tmp = 8;
4353 if (tmp < 0) tmp = -tmp;
4354 vassert(tmp >= 0 && tmp <= 8);
4355 zmaskL = (1 << tmp) & 0xFF;
4356 tmp = eaxIN & 0xFFFFFFFF;
4357 if (tmp < -8) tmp = -8;
4358 if (tmp > 8) tmp = 8;
4359 if (tmp < 0) tmp = -tmp;
4360 vassert(tmp >= 0 && tmp <= 8);
4361 zmaskR = (1 << tmp) & 0xFF;
4363 // do the math
4364 ok = compute_PCMPxSTRx_wide (
4365 &resV, &resOSZACP, argL, argR,
4366 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4368 } else {
4369 if (isISTRx) {
4370 zmaskL = zmask_from_V128(argL);
4371 zmaskR = zmask_from_V128(argR);
4372 } else {
4373 Int tmp;
4374 tmp = edxIN & 0xFFFFFFFF;
4375 if (tmp < -16) tmp = -16;
4376 if (tmp > 16) tmp = 16;
4377 if (tmp < 0) tmp = -tmp;
4378 vassert(tmp >= 0 && tmp <= 16);
4379 zmaskL = (1 << tmp) & 0xFFFF;
4380 tmp = eaxIN & 0xFFFFFFFF;
4381 if (tmp < -16) tmp = -16;
4382 if (tmp > 16) tmp = 16;
4383 if (tmp < 0) tmp = -tmp;
4384 vassert(tmp >= 0 && tmp <= 16);
4385 zmaskR = (1 << tmp) & 0xFFFF;
4387 // do the math
4388 ok = compute_PCMPxSTRx (
4389 &resV, &resOSZACP, argL, argR,
4390 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4394 // front end shouldn't pass us any imm8 variants we can't
4395 // handle. Hence:
4396 vassert(ok);
4398 // So, finally we need to get the results back to the caller.
4399 // In all cases, the new OSZACP value is the lowest 16 of
4400 // the return value.
4401 if (isxSTRM) {
4402 gst->guest_YMM0[0] = resV.w32[0];
4403 gst->guest_YMM0[1] = resV.w32[1];
4404 gst->guest_YMM0[2] = resV.w32[2];
4405 gst->guest_YMM0[3] = resV.w32[3];
4406 return resOSZACP & 0x8D5;
4407 } else {
4408 UInt newECX = resV.w32[0] & 0xFFFF;
4409 return (newECX << 16) | (resOSZACP & 0x8D5);
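/* Sketch of how a caller unpicks the return value (illustrative only; the
   example_* function is not part of VEX): bits 15:0 hold the new OSZACP
   flags and, for the xSTRI variants, bits 31:16 hold the new ECX; for the
   xSTRM variants the vector result has already been written to guest_YMM0
   by the helper. */
#if 0
static void example_unpack_PCMPxSTRx ( ULong res )
{
   UInt newOSZACP = (UInt)(res & 0x8D5);            /* O,S,Z,A,C,P      */
   UInt newECX    = (UInt)((res >> 16) & 0xFFFF);   /* xSTRI cases only */
   (void)newOSZACP; (void)newECX;
}
#endif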
4413 /*---------------------------------------------------------------*/
4414 /*--- AES primitives and helpers ---*/
4415 /*---------------------------------------------------------------*/
4416 /* a 16 x 16 matrix */
4417 static const UChar sbox[256] = { // row nr
4418 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4419 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4420 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4421 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4422 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4423 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4424 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4425 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4426 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4427 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4428 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4429 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4430 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4431 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4432 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4433 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4434 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4435 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4436 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4437 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4438 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4439 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4440 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4441 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4442 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4443 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4444 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4445 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4446 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4447 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4448 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4449 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4451 static void SubBytes (V128* v)
4453 V128 r;
4454 UInt i;
4455 for (i = 0; i < 16; i++)
4456 r.w8[i] = sbox[v->w8[i]];
4457 *v = r;
4460 /* a 16 x 16 matrix */
4461 static const UChar invsbox[256] = { // row nr
4462 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4463 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4464 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4465 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4466 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4467 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4468 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4469 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4470 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4471 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4472 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4473 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4474 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4475 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4476 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4477 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4478 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4479 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4480 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4481 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4482 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4483 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4484 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4485 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4486 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4487 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4488 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4489 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4490 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4491 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4492 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4493 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4495 static void InvSubBytes (V128* v)
4497 V128 r;
4498 UInt i;
4499 for (i = 0; i < 16; i++)
4500 r.w8[i] = invsbox[v->w8[i]];
4501 *v = r;
4504 static const UChar ShiftRows_op[16] =
4505 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4506 static void ShiftRows (V128* v)
4508 V128 r;
4509 UInt i;
4510 for (i = 0; i < 16; i++)
4511 r.w8[i] = v->w8[ShiftRows_op[15-i]];
4512 *v = r;
4515 static const UChar InvShiftRows_op[16] =
4516 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4517 static void InvShiftRows (V128* v)
4519 V128 r;
4520 UInt i;
4521 for (i = 0; i < 16; i++)
4522 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4523 *v = r;
4526 /* Multiplication of the finite fields elements of AES.
4527 See "A Specification for The AES Algorithm Rijndael
4528 (by Joan Daemen & Vincent Rijmen)"
4529 Dr. Brian Gladman, v3.1, 3rd March 2001. */
4530 /* N values so that (hex) xy = 0x03^N.
4531 0x00 cannot be used. We put 0xff for this value.*/
4532 /* a 16 x 16 matrix */
4533 static const UChar Nxy[256] = { // row nr
4534 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4535 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4536 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4537 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4538 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4539 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4540 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4541 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4542 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4543 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4544 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4545 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4546 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4547 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4548 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4549 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4550 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4551 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4552 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4553 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4554 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4555 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4556 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4557 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4558 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4559 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4560 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4561 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4562 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4563 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4564 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4565 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4568 /* E values so that E = 0x03^xy. */
4569 static const UChar Exy[256] = { // row nr
4570 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4571 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4572 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4573 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4574 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4575 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4576 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4577 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4578 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4579 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4580 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4581 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4582 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4583 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4584 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4585 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4586 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4587 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4588 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4589 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4590 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4591 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4592 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4593 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4594 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4595 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4596 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4597 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4598 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4599 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4600 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4601 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4603 static inline UChar ff_mul(UChar u1, UChar u2)
4605 if ((u1 > 0) && (u2 > 0)) {
4606 UInt ui = Nxy[u1] + Nxy[u2];
4607 if (ui >= 255)
4608 ui = ui - 255;
4609 return Exy[ui];
4610 } else {
4611 return 0;
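/* A quick sanity example (illustrative only; example_ff_mul is not part of
   VEX): multiplying by {01} is the identity, and {57}.{83} = {c1} is the
   worked example given in FIPS-197, section 4.2.  Both follow from the
   log/antilog tables above:
   Exy[Nxy[0x57] + Nxy[0x83]] = Exy[0x62 + 0x50] = Exy[0xb2] = 0xc1. */
#if 0
static void example_ff_mul ( void )
{
   vassert(ff_mul(0x57, 0x01) == 0x57);
   vassert(ff_mul(0x57, 0x83) == 0xc1);
}
#endif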
4615 static void MixColumns (V128* v)
4617 V128 r;
4618 Int j;
4619 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4620 for (j = 0; j < 4; j++) {
4621 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4622 ^ P(v,j,2) ^ P(v,j,3);
4623 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4624 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4625 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4626 ^ ff_mul(0x03, P(v,j,3) );
4627 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4628 ^ ff_mul( 0x02, P(v,j,3) );
4630 *v = r;
4631 #undef P
4634 static void InvMixColumns (V128* v)
4636 V128 r;
4637 Int j;
4638 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4639 for (j = 0; j < 4; j++) {
4640 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4641 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4642 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4643 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4644 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4645 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4646 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4647 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4649 *v = r;
4650 #undef P
4654 /* For description, see definition in guest_amd64_defs.h */
4655 void amd64g_dirtyhelper_AES (
4656 VexGuestAMD64State* gst,
4657 HWord opc4, HWord gstOffD,
4658 HWord gstOffL, HWord gstOffR
4661 // where the args are
4662 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4663 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4664 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4665 V128 r;
4667 switch (opc4) {
4668 case 0xDC: /* AESENC */
4669 case 0xDD: /* AESENCLAST */
4670 r = *argR;
4671 ShiftRows (&r);
4672 SubBytes (&r);
4673 if (opc4 == 0xDC)
4674 MixColumns (&r);
4675 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4676 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4677 break;
4679 case 0xDE: /* AESDEC */
4680 case 0xDF: /* AESDECLAST */
4681 r = *argR;
4682 InvShiftRows (&r);
4683 InvSubBytes (&r);
4684 if (opc4 == 0xDE)
4685 InvMixColumns (&r);
4686 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4687 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4688 break;
4690 case 0xDB: /* AESIMC */
4691 *argD = *argL;
4692 InvMixColumns (argD);
4693 break;
4694 default: vassert(0);
4698 static inline UInt RotWord (UInt w32)
4700 return ((w32 >> 8) | (w32 << 24));
4703 static inline UInt SubWord (UInt w32)
4705 UChar *w8;
4706 UChar *r8;
4707 UInt res;
4708 w8 = (UChar*) &w32;
4709 r8 = (UChar*) &res;
4710 r8[0] = sbox[w8[0]];
4711 r8[1] = sbox[w8[1]];
4712 r8[2] = sbox[w8[2]];
4713 r8[3] = sbox[w8[3]];
4714 return res;
4717 /* For description, see definition in guest_amd64_defs.h */
4718 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4719 VexGuestAMD64State* gst,
4720 HWord imm8,
4721 HWord gstOffL, HWord gstOffR
4724 // where the args are
4725 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4726 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4728 // We have to create the result in a temporary in the
4729 // case where the src and dst regs are the same. See #341698.
4730 V128 tmp;
4732 tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4733 tmp.w32[2] = SubWord (argL->w32[3]);
4734 tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4735 tmp.w32[0] = SubWord (argL->w32[1]);
4737 argR->w32[3] = tmp.w32[3];
4738 argR->w32[2] = tmp.w32[2];
4739 argR->w32[1] = tmp.w32[1];
4740 argR->w32[0] = tmp.w32[0];
4745 /*---------------------------------------------------------------*/
4746 /*--- Helpers for dealing with, and describing, ---*/
4747 /*--- guest state as a whole. ---*/
4748 /*---------------------------------------------------------------*/
4750 /* Initialise the entire amd64 guest state. */
4751 /* VISIBLE TO LIBVEX CLIENT */
4752 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4754 vex_state->host_EvC_FAILADDR = 0;
4755 vex_state->host_EvC_COUNTER = 0;
4756 vex_state->pad0 = 0;
4758 vex_state->guest_RAX = 0;
4759 vex_state->guest_RCX = 0;
4760 vex_state->guest_RDX = 0;
4761 vex_state->guest_RBX = 0;
4762 vex_state->guest_RSP = 0;
4763 vex_state->guest_RBP = 0;
4764 vex_state->guest_RSI = 0;
4765 vex_state->guest_RDI = 0;
4766 vex_state->guest_R8 = 0;
4767 vex_state->guest_R9 = 0;
4768 vex_state->guest_R10 = 0;
4769 vex_state->guest_R11 = 0;
4770 vex_state->guest_R12 = 0;
4771 vex_state->guest_R13 = 0;
4772 vex_state->guest_R14 = 0;
4773 vex_state->guest_R15 = 0;
4775 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
4776 vex_state->guest_CC_DEP1 = 0;
4777 vex_state->guest_CC_DEP2 = 0;
4778 vex_state->guest_CC_NDEP = 0;
4780 vex_state->guest_DFLAG = 1; /* forwards */
4781 vex_state->guest_IDFLAG = 0;
4782 vex_state->guest_ACFLAG = 0;
4784 /* HACK: represent the offset associated with a constant %fs.
4785 Typically, on linux, this assumes that %fs is only ever zero (main
4786 thread) or 0x63. */
4787 vex_state->guest_FS_CONST = 0;
4789 vex_state->guest_RIP = 0;
4791 /* Initialise the simulated FPU */
4792 amd64g_dirtyhelper_FINIT( vex_state );
4794 /* Initialise the AVX state. */
4795 # define AVXZERO(_ymm) \
4796 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4797 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4798 } while (0)
4799 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4800 AVXZERO(vex_state->guest_YMM0);
4801 AVXZERO(vex_state->guest_YMM1);
4802 AVXZERO(vex_state->guest_YMM2);
4803 AVXZERO(vex_state->guest_YMM3);
4804 AVXZERO(vex_state->guest_YMM4);
4805 AVXZERO(vex_state->guest_YMM5);
4806 AVXZERO(vex_state->guest_YMM6);
4807 AVXZERO(vex_state->guest_YMM7);
4808 AVXZERO(vex_state->guest_YMM8);
4809 AVXZERO(vex_state->guest_YMM9);
4810 AVXZERO(vex_state->guest_YMM10);
4811 AVXZERO(vex_state->guest_YMM11);
4812 AVXZERO(vex_state->guest_YMM12);
4813 AVXZERO(vex_state->guest_YMM13);
4814 AVXZERO(vex_state->guest_YMM14);
4815 AVXZERO(vex_state->guest_YMM15);
4816 AVXZERO(vex_state->guest_YMM16);
4818 # undef AVXZERO
4820 vex_state->guest_EMNOTE = EmNote_NONE;
4822 vex_state->guest_SETC = 0;
4824 /* These should never be read or written, but we
4825 initialise them anyway. */
4826 vex_state->guest_CMSTART = 0;
4827 vex_state->guest_CMLEN = 0;
4829 vex_state->guest_NRADDR = 0;
4830 vex_state->guest_SC_CLASS = 0;
4831 vex_state->guest_GS_CONST = 0;
4833 vex_state->guest_IP_AT_SYSCALL = 0;
4834 vex_state->pad1 = 0;
4838 /* Figure out if any part of the guest state contained in minoff
4839 .. maxoff requires precise memory exceptions. If in doubt return
4840 True (but this generates significantly slower code).
4842 By default we enforce precise exns for guest %RSP, %RBP and %RIP
4843 only. These are the minimum needed to extract correct stack
4844 backtraces from amd64 code.
4846 Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
4848 Bool guest_amd64_state_requires_precise_mem_exns (
4849 Int minoff, Int maxoff, VexRegisterUpdates pxControl
4852 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4853 Int rbp_max = rbp_min + 8 - 1;
4854 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4855 Int rsp_max = rsp_min + 8 - 1;
4856 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4857 Int rip_max = rip_min + 8 - 1;
4859 if (maxoff < rsp_min || minoff > rsp_max) {
4860 /* no overlap with rsp */
4861 if (pxControl == VexRegUpdSpAtMemAccess)
4862 return False; // We only need to check stack pointer.
4863 } else {
4864 return True;
4867 if (maxoff < rbp_min || minoff > rbp_max) {
4868 /* no overlap with rbp */
4869 } else {
4870 return True;
4873 if (maxoff < rip_min || minoff > rip_max) {
4874 /* no overlap with rip */
4875 } else {
4876 return True;
4879 return False;
4883 #define ALWAYSDEFD(field) \
4884 { offsetof(VexGuestAMD64State, field), \
4885 (sizeof ((VexGuestAMD64State*)0)->field) }
4887 VexGuestLayout
4888 amd64guest_layout
4890 /* Total size of the guest state, in bytes. */
4891 .total_sizeB = sizeof(VexGuestAMD64State),
4893 /* Describe the stack pointer. */
4894 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4895 .sizeof_SP = 8,
4897 /* Describe the frame pointer. */
4898 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4899 .sizeof_FP = 8,
4901 /* Describe the instruction pointer. */
4902 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4903 .sizeof_IP = 8,
4905 /* Describe any sections to be regarded by Memcheck as
4906 'always-defined'. */
4907 .n_alwaysDefd = 16,
4909 /* flags thunk: OP and NDEP are always defd, whereas DEP1
4910 and DEP2 have to be tracked. See detailed comment in
4911 gdefs.h on meaning of thunk fields. */
4912 .alwaysDefd
4913 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
4914 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
4915 /* 2 */ ALWAYSDEFD(guest_DFLAG),
4916 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
4917 /* 4 */ ALWAYSDEFD(guest_RIP),
4918 /* 5 */ ALWAYSDEFD(guest_FS_CONST),
4919 /* 6 */ ALWAYSDEFD(guest_FTOP),
4920 /* 7 */ ALWAYSDEFD(guest_FPTAG),
4921 /* 8 */ ALWAYSDEFD(guest_FPROUND),
4922 /* 9 */ ALWAYSDEFD(guest_FC3210),
4923 // /* */ ALWAYSDEFD(guest_CS),
4924 // /* */ ALWAYSDEFD(guest_DS),
4925 // /* */ ALWAYSDEFD(guest_ES),
4926 // /* */ ALWAYSDEFD(guest_FS),
4927 // /* */ ALWAYSDEFD(guest_GS),
4928 // /* */ ALWAYSDEFD(guest_SS),
4929 // /* */ ALWAYSDEFD(guest_LDT),
4930 // /* */ ALWAYSDEFD(guest_GDT),
4931 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4932 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4933 /* 12 */ ALWAYSDEFD(guest_CMSTART),
4934 /* 13 */ ALWAYSDEFD(guest_CMLEN),
4935 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4936 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4941 /*---------------------------------------------------------------*/
4942 /*--- end guest_amd64_helpers.c ---*/
4943 /*---------------------------------------------------------------*/