2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
28 The GNU General Public License is contained in the file COPYING.
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
36 #include "libvex_basictypes.h"
37 #include "libvex_emnote.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
42 #include "main_util.h"
43 #include "main_globals.h"
44 #include "guest_generic_bb_to_IR.h"
45 #include "guest_amd64_defs.h"
46 #include "guest_generic_x87.h"
49 /* This file contains helper functions for amd64 guest code.
50 Calls to these functions are generated by the back end.
51 These calls are of course in the host machine code and
52 this file will be compiled to host machine code, so that
53 all makes sense.
55 Only change the signatures of these helper functions very
56 carefully. If you change the signature here, you'll have to change
57 the parameters passed to it in the IR calls constructed by
58 guest-amd64/toIR.c.
60 The convention used is that all functions called from generated
61 code are named amd64g_<something>, and any function whose name lacks
62 that prefix is not called from generated code. Note that some
63 LibVEX_* functions can however be called by VEX's client, but that
64 is not the same as calling them from VEX-generated code.
68 /* Set to 1 to get detailed profiling info about use of the flag
69 machinery. */
70 #define PROFILE_RFLAGS 0
73 /*---------------------------------------------------------------*/
74 /*--- %rflags run-time helpers. ---*/
75 /*---------------------------------------------------------------*/
77 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78 after imulq/mulq. */
80 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
82 const Long halfMask = 0xFFFFFFFFLL;
83 ULong u0, v0, w0;
84 Long u1, v1, w1, w2, t;
85 u0 = u & halfMask;
86 u1 = u >> 32;
87 v0 = v & halfMask;
88 v1 = v >> 32;
89 w0 = u0 * v0;
90 t = u1 * v0 + (w0 >> 32);
91 w1 = t & halfMask;
92 w2 = t >> 32;
93 w1 = u0 * v1 + w1;
94 *rHi = u1 * v1 + w2 + (w1 >> 32);
95 *rLo = (Long)((ULong)u * (ULong)v);
98 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
100 const ULong halfMask = 0xFFFFFFFFULL;
101 ULong u0, v0, w0;
102 ULong u1, v1, w1, w2, t;
103 u0 = u & halfMask;
104 u1 = u >> 32;
105 v0 = v & halfMask;
106 v1 = v >> 32;
107 w0 = u0 * v0;
108 t = u1 * v0 + (w0 >> 32);
109 w1 = t & halfMask;
110 w2 = t >> 32;
111 w1 = u0 * v1 + w1;
112 *rHi = u1 * v1 + w2 + (w1 >> 32);
113 *rLo = u * v;
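/* Illustrative sketch only (editorial addition, not part of the original
   source): on hosts whose compiler provides unsigned __int128, the
   splitting scheme used by mullU64 above can be cross-checked against a
   native 128-bit multiply. */
#if 0
static void check_mullU64_example ( ULong u, ULong v )
{
   ULong hi, lo;
   mullU64(u, v, &hi, &lo);
   /* Reference result computed with the compiler's 128-bit arithmetic. */
   unsigned __int128 ref = (unsigned __int128)u * (unsigned __int128)v;
   vassert(lo == (ULong)ref);
   vassert(hi == (ULong)(ref >> 64));
}
#endif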
117 static const UChar parity_table[256] = {
118 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
124 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
126 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
140 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
142 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
148 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
149 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
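/* Illustrative sketch only (editorial addition, not part of the original
   source): the table above encodes the x86 parity flag, which is set when
   the low 8 bits of a result contain an even number of 1 bits.  It could
   be regenerated along the following lines; regen_parity_table_example is
   a hypothetical helper, not used anywhere in VEX. */
#if 0
static void regen_parity_table_example ( UChar* tab /* 256 entries */ )
{
   UInt b;
   for (b = 0; b < 256; b++) {
      UInt x = b, ones = 0;
      while (x) { ones += x & 1; x >>= 1; }
      /* PF is set when the byte has even parity. */
      tab[b] = (ones % 2 == 0) ? AMD64G_CC_MASK_P : 0;
   }
}
#endif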
152 /* generalised left-shifter */
153 static inline Long lshift ( Long x, Int n )
155 if (n >= 0)
156 return (ULong)x << n;
157 else
158 return x >> (-n);
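/* Worked example (editorial note, not in the original source): because
   lshift accepts a negative count, the SF computation
   "lshift(res, 8 - DATA_BITS) & 0x80" used by the ACTIONS_* macros below
   moves the sign bit of any operand width into bit 7:
      DATA_BITS ==  8:  shift by   0, bit  7 is already the sign bit
      DATA_BITS == 32:  shift by -24, i.e. >> 24, bit 31 lands in bit 7
      DATA_BITS == 64:  shift by -56, i.e. >> 56, bit 63 lands in bit 7  */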
161 /* identity on ULong */
162 static inline ULong idULong ( ULong x )
164 return x;
168 #define PREAMBLE(__data_bits) \
169 /* const */ ULong DATA_MASK \
170 = __data_bits==8 \
171 ? 0xFFULL \
172 : (__data_bits==16 \
173 ? 0xFFFFULL \
174 : (__data_bits==32 \
175 ? 0xFFFFFFFFULL \
176 : 0xFFFFFFFFFFFFFFFFULL)); \
177 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
178 /* const */ ULong CC_DEP1 = cc_dep1_formal; \
179 /* const */ ULong CC_DEP2 = cc_dep2_formal; \
180 /* const */ ULong CC_NDEP = cc_ndep_formal; \
181 /* Four bogus assignments, which hopefully gcc can */ \
182 /* optimise away, and which stop it complaining about */ \
183 /* unused variables. */ \
184 SIGN_MASK = SIGN_MASK; \
185 DATA_MASK = DATA_MASK; \
186 CC_DEP2 = CC_DEP2; \
187 CC_NDEP = CC_NDEP;
190 /*-------------------------------------------------------------*/
192 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
194 PREAMBLE(DATA_BITS); \
195 { ULong cf, pf, af, zf, sf, of; \
196 ULong argL, argR, res; \
197 argL = CC_DEP1; \
198 argR = CC_DEP2; \
199 res = argL + argR; \
200 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
201 pf = parity_table[(UChar)res]; \
202 af = (res ^ argL ^ argR) & 0x10; \
203 zf = ((DATA_UTYPE)res == 0) << 6; \
204 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
205 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
206 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
207 return cf | pf | af | zf | sf | of; \
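/* Worked example (editorial note, not in the original source): for an
   8-bit add with CC_DEP1 = 0xFF and CC_DEP2 = 0x01, ACTIONS_ADD(8,UChar)
   computes res = 0x100, giving CF = 1 (the truncated result 0x00 is <u
   0xFF), PF set (0x00 has even parity), AF set, ZF set, SF = 0 and OF = 0;
   the helper therefore returns AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z. */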
211 /*-------------------------------------------------------------*/
213 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
215 PREAMBLE(DATA_BITS); \
216 { ULong cf, pf, af, zf, sf, of; \
217 ULong argL, argR, res; \
218 argL = CC_DEP1; \
219 argR = CC_DEP2; \
220 res = argL - argR; \
221 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
222 pf = parity_table[(UChar)res]; \
223 af = (res ^ argL ^ argR) & 0x10; \
224 zf = ((DATA_UTYPE)res == 0) << 6; \
225 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
226 of = lshift((argL ^ argR) & (argL ^ res), \
227 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
228 return cf | pf | af | zf | sf | of; \
232 /*-------------------------------------------------------------*/
234 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
236 PREAMBLE(DATA_BITS); \
237 { ULong cf, pf, af, zf, sf, of; \
238 ULong argL, argR, oldC, res; \
239 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
240 argL = CC_DEP1; \
241 argR = CC_DEP2 ^ oldC; \
242 res = (argL + argR) + oldC; \
243 if (oldC) \
244 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
245 else \
246 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
247 pf = parity_table[(UChar)res]; \
248 af = (res ^ argL ^ argR) & 0x10; \
249 zf = ((DATA_UTYPE)res == 0) << 6; \
250 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
251 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
252 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
253 return cf | pf | af | zf | sf | of; \
257 /*-------------------------------------------------------------*/
259 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
261 PREAMBLE(DATA_BITS); \
262 { ULong cf, pf, af, zf, sf, of; \
263 ULong argL, argR, oldC, res; \
264 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
265 argL = CC_DEP1; \
266 argR = CC_DEP2 ^ oldC; \
267 res = (argL - argR) - oldC; \
268 if (oldC) \
269 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
270 else \
271 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
272 pf = parity_table[(UChar)res]; \
273 af = (res ^ argL ^ argR) & 0x10; \
274 zf = ((DATA_UTYPE)res == 0) << 6; \
275 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
276 of = lshift((argL ^ argR) & (argL ^ res), \
277 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
278 return cf | pf | af | zf | sf | of; \
282 /*-------------------------------------------------------------*/
284 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
286 PREAMBLE(DATA_BITS); \
287 { ULong cf, pf, af, zf, sf, of; \
288 cf = 0; \
289 pf = parity_table[(UChar)CC_DEP1]; \
290 af = 0; \
291 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
292 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
293 of = 0; \
294 return cf | pf | af | zf | sf | of; \
298 /*-------------------------------------------------------------*/
300 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
302 PREAMBLE(DATA_BITS); \
303 { ULong cf, pf, af, zf, sf, of; \
304 ULong argL, argR, res; \
305 res = CC_DEP1; \
306 argL = res - 1; \
307 argR = 1; \
308 cf = CC_NDEP & AMD64G_CC_MASK_C; \
309 pf = parity_table[(UChar)res]; \
310 af = (res ^ argL ^ argR) & 0x10; \
311 zf = ((DATA_UTYPE)res == 0) << 6; \
312 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
313 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
314 return cf | pf | af | zf | sf | of; \
318 /*-------------------------------------------------------------*/
320 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
322 PREAMBLE(DATA_BITS); \
323 { ULong cf, pf, af, zf, sf, of; \
324 ULong argL, argR, res; \
325 res = CC_DEP1; \
326 argL = res + 1; \
327 argR = 1; \
328 cf = CC_NDEP & AMD64G_CC_MASK_C; \
329 pf = parity_table[(UChar)res]; \
330 af = (res ^ argL ^ argR) & 0x10; \
331 zf = ((DATA_UTYPE)res == 0) << 6; \
332 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
333 of = ((res & DATA_MASK) \
334 == ((ULong)SIGN_MASK - 1)) << 11; \
335 return cf | pf | af | zf | sf | of; \
339 /*-------------------------------------------------------------*/
341 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
343 PREAMBLE(DATA_BITS); \
344 { ULong cf, pf, af, zf, sf, of; \
345 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
346 pf = parity_table[(UChar)CC_DEP1]; \
347 af = 0; /* undefined */ \
348 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
349 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
350 /* of is defined if shift count == 1 */ \
351 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
352 & AMD64G_CC_MASK_O; \
353 return cf | pf | af | zf | sf | of; \
357 /*-------------------------------------------------------------*/
359 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
361 PREAMBLE(DATA_BITS); \
362 { ULong cf, pf, af, zf, sf, of; \
363 cf = CC_DEP2 & 1; \
364 pf = parity_table[(UChar)CC_DEP1]; \
365 af = 0; /* undefined */ \
366 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
367 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
368 /* of is defined if shift count == 1 */ \
369 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
370 & AMD64G_CC_MASK_O; \
371 return cf | pf | af | zf | sf | of; \
375 /*-------------------------------------------------------------*/
377 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
378 /* DEP1 = result, NDEP = old flags */
379 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
381 PREAMBLE(DATA_BITS); \
382 { ULong fl \
383 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
384 | (AMD64G_CC_MASK_C & CC_DEP1) \
385 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
386 11-(DATA_BITS-1)) \
387 ^ lshift(CC_DEP1, 11))); \
388 return fl; \
392 /*-------------------------------------------------------------*/
394 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
395 /* DEP1 = result, NDEP = old flags */
396 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
398 PREAMBLE(DATA_BITS); \
399 { ULong fl \
400 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
401 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
402 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
403 11-(DATA_BITS-1)) \
404 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
405 return fl; \
409 /*-------------------------------------------------------------*/
411 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
412 DATA_U2TYPE, NARROWto2U) \
414 PREAMBLE(DATA_BITS); \
415 { ULong cf, pf, af, zf, sf, of; \
416 DATA_UTYPE hi; \
417 DATA_UTYPE lo \
418 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
419 * ((DATA_UTYPE)CC_DEP2) ); \
420 DATA_U2TYPE rr \
421 = NARROWto2U( \
422 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
423 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
424 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
425 cf = (hi != 0); \
426 pf = parity_table[(UChar)lo]; \
427 af = 0; /* undefined */ \
428 zf = (lo == 0) << 6; \
429 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
430 of = cf << 11; \
431 return cf | pf | af | zf | sf | of; \
435 /*-------------------------------------------------------------*/
437 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
438 DATA_S2TYPE, NARROWto2S) \
440 PREAMBLE(DATA_BITS); \
441 { ULong cf, pf, af, zf, sf, of; \
442 DATA_STYPE hi; \
443 DATA_STYPE lo \
444 = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1) \
445 * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) ); \
446 DATA_S2TYPE rr \
447 = NARROWto2S( \
448 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
449 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
450 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
451 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
452 pf = parity_table[(UChar)lo]; \
453 af = 0; /* undefined */ \
454 zf = (lo == 0) << 6; \
455 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
456 of = cf << 11; \
457 return cf | pf | af | zf | sf | of; \
461 /*-------------------------------------------------------------*/
463 #define ACTIONS_UMULQ \
465 PREAMBLE(64); \
466 { ULong cf, pf, af, zf, sf, of; \
467 ULong lo, hi; \
468 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
469 cf = (hi != 0); \
470 pf = parity_table[(UChar)lo]; \
471 af = 0; /* undefined */ \
472 zf = (lo == 0) << 6; \
473 sf = lshift(lo, 8 - 64) & 0x80; \
474 of = cf << 11; \
475 return cf | pf | af | zf | sf | of; \
479 /*-------------------------------------------------------------*/
481 #define ACTIONS_SMULQ \
483 PREAMBLE(64); \
484 { ULong cf, pf, af, zf, sf, of; \
485 Long lo, hi; \
486 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
487 cf = (hi != (lo >>/*s*/ (64-1))); \
488 pf = parity_table[(UChar)lo]; \
489 af = 0; /* undefined */ \
490 zf = (lo == 0) << 6; \
491 sf = lshift(lo, 8 - 64) & 0x80; \
492 of = cf << 11; \
493 return cf | pf | af | zf | sf | of; \
497 /*-------------------------------------------------------------*/
499 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
501 PREAMBLE(DATA_BITS); \
502 { ULong cf, pf, af, zf, sf, of; \
503 cf = 0; \
504 pf = 0; \
505 af = 0; \
506 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
507 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
508 of = 0; \
509 return cf | pf | af | zf | sf | of; \
513 /*-------------------------------------------------------------*/
515 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
517 PREAMBLE(DATA_BITS); \
518 { ULong cf, pf, af, zf, sf, of; \
519 cf = ((DATA_UTYPE)CC_DEP2 != 0); \
520 pf = 0; \
521 af = 0; \
522 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
523 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
524 of = 0; \
525 return cf | pf | af | zf | sf | of; \
529 /*-------------------------------------------------------------*/
531 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
533 PREAMBLE(DATA_BITS); \
534 { Long cf, pf, af, zf, sf, of; \
535 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
536 pf = 0; \
537 af = 0; \
538 zf = 0; \
539 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
540 of = 0; \
541 return cf | pf | af | zf | sf | of; \
545 /*-------------------------------------------------------------*/
547 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
549 PREAMBLE(DATA_BITS); \
550 { ULong cf, pf, af, zf, sf, of; \
551 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
552 pf = 0; \
553 af = 0; \
554 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
555 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
556 of = 0; \
557 return cf | pf | af | zf | sf | of; \
561 /*-------------------------------------------------------------*/
563 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME) \
565 PREAMBLE(DATA_BITS); \
566 { ULong ocf; /* o or c */ \
567 ULong argL, argR, oldOC, res; \
568 oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1; \
569 argL = CC_DEP1; \
570 argR = CC_DEP2 ^ oldOC; \
571 res = (argL + argR) + oldOC; \
572 if (oldOC) \
573 ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
574 else \
575 ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
576 return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME) \
577 | (ocf << AMD64G_CC_SHIFT_##FLAGNAME); \
581 /*-------------------------------------------------------------*/
584 #if PROFILE_RFLAGS
586 static Bool initted = False;
588 /* C flag, fast route */
589 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
590 /* C flag, slow route */
591 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
592 /* table for calculate_cond */
593 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
594 /* total entry counts for calc_all, calc_c, calc_cond. */
595 static UInt n_calc_all = 0;
596 static UInt n_calc_c = 0;
597 static UInt n_calc_cond = 0;
599 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
602 static void showCounts ( void )
604 Int op, co;
605 HChar ch;
606 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
607 n_calc_all, n_calc_cond, n_calc_c);
609 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
610 " S NS P NP L NL LE NLE\n");
611 vex_printf(" -----------------------------------------------------"
612 "----------------------------------------\n");
613 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
615 ch = ' ';
616 if (op > 0 && (op-1) % 4 == 0)
617 ch = 'B';
618 if (op > 0 && (op-1) % 4 == 1)
619 ch = 'W';
620 if (op > 0 && (op-1) % 4 == 2)
621 ch = 'L';
622 if (op > 0 && (op-1) % 4 == 3)
623 ch = 'Q';
625 vex_printf("%2d%c: ", op, ch);
626 vex_printf("%6u ", tabc_slow[op]);
627 vex_printf("%6u ", tabc_fast[op]);
628 for (co = 0; co < 16; co++) {
629 Int n = tab_cond[op][co];
630 if (n >= 1000) {
631 vex_printf(" %3dK", n / 1000);
632 } else
633 if (n >= 0) {
634 vex_printf(" %3d ", n );
635 } else {
636 vex_printf(" ");
639 vex_printf("\n");
641 vex_printf("\n");
644 static void initCounts ( void )
646 Int op, co;
647 initted = True;
648 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
649 tabc_fast[op] = tabc_slow[op] = 0;
650 for (co = 0; co < 16; co++)
651 tab_cond[op][co] = 0;
655 #endif /* PROFILE_RFLAGS */
658 /* Calculate all 6 flags from the supplied thunk parameters.
659 Worker function; it is not called directly from generated code, but
660 the CLEAN HELPER wrappers below are. */
661 static
662 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
663 ULong cc_dep1_formal,
664 ULong cc_dep2_formal,
665 ULong cc_ndep_formal )
667 switch (cc_op) {
668 case AMD64G_CC_OP_COPY:
669 return cc_dep1_formal
670 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
671 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
673 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
674 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
675 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
676 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
678 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
679 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
680 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
681 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
683 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
684 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
685 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
686 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
688 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
689 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
690 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
691 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
693 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
694 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
695 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
696 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
698 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
699 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
700 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
701 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
703 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
704 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
705 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
706 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
708 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
709 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
710 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
711 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
713 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
714 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
715 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
716 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
718 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
719 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
720 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
721 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
723 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
724 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
725 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
726 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
728 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
729 UShort, toUShort );
730 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
731 UInt, toUInt );
732 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
733 ULong, idULong );
735 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
737 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
738 Short, toUShort );
739 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
740 Int, toUInt );
741 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
742 Long, idULong );
744 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
746 case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt );
747 case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );
749 case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt );
750 case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );
752 case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt );
753 case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );
755 case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt );
756 case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );
758 case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt, C );
759 case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
761 case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt, O );
762 case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
764 default:
765 /* shouldn't really make these calls from generated code */
766 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
767 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
768 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
769 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
774 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
775 /* Calculate all 6 flags from the supplied thunk parameters. */
776 ULong amd64g_calculate_rflags_all ( ULong cc_op,
777 ULong cc_dep1,
778 ULong cc_dep2,
779 ULong cc_ndep )
781 # if PROFILE_RFLAGS
782 if (!initted) initCounts();
783 n_calc_all++;
784 if (SHOW_COUNTS_NOW) showCounts();
785 # endif
786 return
787 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
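/* Worked example (editorial note, not in the original source): after an
   instruction such as "cmpq $7, %rax" with %rax == 3, the thunk describes
   the subtraction rather than the flags themselves, i.e. OP = SUBQ,
   DEP1 = 3, DEP2 = 7, NDEP = 0.  A call
      amd64g_calculate_rflags_all(AMD64G_CC_OP_SUBQ, 3, 7, 0)
   then returns a value in which CF (borrow), AF, PF and SF are set and ZF
   and OF are clear. */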
791 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
792 /* Calculate just the carry flag from the supplied thunk parameters. */
793 ULong amd64g_calculate_rflags_c ( ULong cc_op,
794 ULong cc_dep1,
795 ULong cc_dep2,
796 ULong cc_ndep )
798 # if PROFILE_RFLAGS
799 if (!initted) initCounts();
800 n_calc_c++;
801 tabc_fast[cc_op]++;
802 if (SHOW_COUNTS_NOW) showCounts();
803 # endif
805 /* Fast-case some common ones. */
806 switch (cc_op) {
807 case AMD64G_CC_OP_COPY:
808 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
809 case AMD64G_CC_OP_LOGICQ:
810 case AMD64G_CC_OP_LOGICL:
811 case AMD64G_CC_OP_LOGICW:
812 case AMD64G_CC_OP_LOGICB:
813 return 0;
814 // case AMD64G_CC_OP_SUBL:
815 // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
816 // ? AMD64G_CC_MASK_C : 0;
817 // case AMD64G_CC_OP_SUBW:
818 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
819 // ? AMD64G_CC_MASK_C : 0;
820 // case AMD64G_CC_OP_SUBB:
821 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
822 // ? AMD64G_CC_MASK_C : 0;
823 // case AMD64G_CC_OP_INCL:
824 // case AMD64G_CC_OP_DECL:
825 // return cc_ndep & AMD64G_CC_MASK_C;
826 default:
827 break;
830 # if PROFILE_RFLAGS
831 tabc_fast[cc_op]--;
832 tabc_slow[cc_op]++;
833 # endif
835 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
836 & AMD64G_CC_MASK_C;
840 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
841 /* returns 1 or 0 */
842 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
843 ULong cc_op,
844 ULong cc_dep1,
845 ULong cc_dep2,
846 ULong cc_ndep )
848 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
849 cc_dep2, cc_ndep);
850 ULong of,sf,zf,cf,pf;
851 ULong inv = cond & 1;
853 # if PROFILE_RFLAGS
854 if (!initted) initCounts();
855 tab_cond[cc_op][cond]++;
856 n_calc_cond++;
857 if (SHOW_COUNTS_NOW) showCounts();
858 # endif
860 switch (cond) {
861 case AMD64CondNO:
862 case AMD64CondO: /* OF == 1 */
863 of = rflags >> AMD64G_CC_SHIFT_O;
864 return 1 & (inv ^ of);
866 case AMD64CondNZ:
867 case AMD64CondZ: /* ZF == 1 */
868 zf = rflags >> AMD64G_CC_SHIFT_Z;
869 return 1 & (inv ^ zf);
871 case AMD64CondNB:
872 case AMD64CondB: /* CF == 1 */
873 cf = rflags >> AMD64G_CC_SHIFT_C;
874 return 1 & (inv ^ cf);
875 break;
877 case AMD64CondNBE:
878 case AMD64CondBE: /* (CF or ZF) == 1 */
879 cf = rflags >> AMD64G_CC_SHIFT_C;
880 zf = rflags >> AMD64G_CC_SHIFT_Z;
881 return 1 & (inv ^ (cf | zf));
882 break;
884 case AMD64CondNS:
885 case AMD64CondS: /* SF == 1 */
886 sf = rflags >> AMD64G_CC_SHIFT_S;
887 return 1 & (inv ^ sf);
889 case AMD64CondNP:
890 case AMD64CondP: /* PF == 1 */
891 pf = rflags >> AMD64G_CC_SHIFT_P;
892 return 1 & (inv ^ pf);
894 case AMD64CondNL:
895 case AMD64CondL: /* (SF xor OF) == 1 */
896 sf = rflags >> AMD64G_CC_SHIFT_S;
897 of = rflags >> AMD64G_CC_SHIFT_O;
898 return 1 & (inv ^ (sf ^ of));
899 break;
901 case AMD64CondNLE:
902 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
903 sf = rflags >> AMD64G_CC_SHIFT_S;
904 of = rflags >> AMD64G_CC_SHIFT_O;
905 zf = rflags >> AMD64G_CC_SHIFT_Z;
906 return 1 & (inv ^ ((sf ^ of) | zf));
907 break;
909 default:
910 /* shouldn't really make these calls from generated code */
911 vex_printf("amd64g_calculate_condition"
912 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
913 cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
914 vpanic("amd64g_calculate_condition");
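/* Editorial note, not in the original source: the AMD64Cond encoding places
   each negated condition next to its positive form, so bit 0 of 'cond'
   selects negation.  That is why the cases above fall through in pairs
   (e.g. AMD64CondNZ / AMD64CondZ) and a single "1 & (inv ^ flag)" serves
   both members of each pair. */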
919 /* VISIBLE TO LIBVEX CLIENT */
920 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
922 ULong rflags = amd64g_calculate_rflags_all_WRK(
923 vex_state->guest_CC_OP,
924 vex_state->guest_CC_DEP1,
925 vex_state->guest_CC_DEP2,
926 vex_state->guest_CC_NDEP
928 Long dflag = vex_state->guest_DFLAG;
929 vassert(dflag == 1 || dflag == -1);
930 if (dflag == -1)
931 rflags |= (1<<10);
932 if (vex_state->guest_IDFLAG == 1)
933 rflags |= (1<<21);
934 if (vex_state->guest_ACFLAG == 1)
935 rflags |= (1<<18);
937 return rflags;
940 /* VISIBLE TO LIBVEX CLIENT */
941 void
942 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
943 /*MOD*/VexGuestAMD64State* vex_state )
945 /* D flag */
946 if (rflags & AMD64G_CC_MASK_D) {
947 vex_state->guest_DFLAG = -1;
948 rflags &= ~AMD64G_CC_MASK_D;
950 else
951 vex_state->guest_DFLAG = 1;
953 /* ID flag */
954 if (rflags & AMD64G_CC_MASK_ID) {
955 vex_state->guest_IDFLAG = 1;
956 rflags &= ~AMD64G_CC_MASK_ID;
958 else
959 vex_state->guest_IDFLAG = 0;
961 /* AC flag */
962 if (rflags & AMD64G_CC_MASK_AC) {
963 vex_state->guest_ACFLAG = 1;
964 rflags &= ~AMD64G_CC_MASK_AC;
966 else
967 vex_state->guest_ACFLAG = 0;
969 UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
970 AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
971 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
972 vex_state->guest_CC_DEP1 = rflags & cc_mask;
973 vex_state->guest_CC_DEP2 = 0;
974 vex_state->guest_CC_NDEP = 0;
977 /* VISIBLE TO LIBVEX CLIENT */
978 void
979 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
980 /*MOD*/VexGuestAMD64State* vex_state )
982 ULong oszacp = amd64g_calculate_rflags_all_WRK(
983 vex_state->guest_CC_OP,
984 vex_state->guest_CC_DEP1,
985 vex_state->guest_CC_DEP2,
986 vex_state->guest_CC_NDEP
988 if (new_carry_flag & 1) {
989 oszacp |= AMD64G_CC_MASK_C;
990 } else {
991 oszacp &= ~AMD64G_CC_MASK_C;
993 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
994 vex_state->guest_CC_DEP1 = oszacp;
995 vex_state->guest_CC_DEP2 = 0;
996 vex_state->guest_CC_NDEP = 0;
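/* Illustrative sketch only (editorial addition, not part of the original
   source): forcing the carry flag in a fully initialised guest state 'st'
   (a hypothetical variable) and reading it back through the COPY thunk
   that the function above installs. */
#if 0
static void rflag_c_roundtrip_example ( VexGuestAMD64State* st )
{
   LibVEX_GuestAMD64_put_rflag_c(1, st);
   vassert( (LibVEX_GuestAMD64_get_rflags(st) & AMD64G_CC_MASK_C) != 0 );
   LibVEX_GuestAMD64_put_rflag_c(0, st);
   vassert( (LibVEX_GuestAMD64_get_rflags(st) & AMD64G_CC_MASK_C) == 0 );
}
#endif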
1000 /*---------------------------------------------------------------*/
1001 /*--- %rflags translation-time function specialisers. ---*/
1002 /*--- These help iropt specialise calls the above run-time ---*/
1003 /*--- %rflags functions. ---*/
1004 /*---------------------------------------------------------------*/
1006 /* Used by the optimiser to try specialisations. Returns an
1007 equivalent expression, or NULL if none. */
1009 static inline Bool isU64 ( IRExpr* e, ULong n )
1011 return e->tag == Iex_Const
1012 && e->Iex.Const.con->tag == Ico_U64
1013 && e->Iex.Const.con->Ico.U64 == n;
1016 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1017 and zero in any other case. */
1018 static Int isU64_1_shl_N ( IRExpr* e )
1020 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1021 return 0;
1022 ULong w64 = e->Iex.Const.con->Ico.U64;
1023 if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1024 return 0;
1025 if ((w64 & (w64 - 1)) != 0)
1026 return 0;
1027 /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1028 and we only need to find out which one it is. */
1029 for (Int n = 1; n <= 31; n++) {
1030 if (w64 == (1ULL << n))
1031 return n;
1033 /* Consequently we should never get here. */
1034 /*UNREACHED*/
1035 vassert(0);
1036 return 0;
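/* For example (editorial note, not in the original source): applied to a
   U64 constant of value 0x1000 this returns 12, while constants that are
   not a power of two in the range 2^1 .. 2^31 (e.g. 0, 1, 3, 1ULL << 32)
   all return 0. */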
1039 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1040 IRExpr** args,
1041 IRStmt** precedingStmts,
1042 Int n_precedingStmts )
1044 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1045 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1046 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1047 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1048 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
1050 Int i, arity = 0;
1051 for (i = 0; args[i]; i++)
1052 arity++;
1053 # if 0
1054 vex_printf("spec request:\n");
1055 vex_printf(" %s ", function_name);
1056 for (i = 0; i < arity; i++) {
1057 vex_printf(" ");
1058 ppIRExpr(args[i]);
1060 vex_printf("\n");
1061 # endif
1063 /* --------- specialising "amd64g_calculate_condition" --------- */
1065 if (vex_streq(function_name, "amd64g_calculate_condition")) {
1066 /* specialise calls to above "calculate condition" function */
1067 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1068 vassert(arity == 5);
1069 cond = args[0];
1070 cc_op = args[1];
1071 cc_dep1 = args[2];
1072 cc_dep2 = args[3];
1074 /*---------------- ADDQ ----------------*/
1076 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1077 /* long long add, then Z --> test (dst+src == 0) */
1078 return unop(Iop_1Uto64,
1079 binop(Iop_CmpEQ64,
1080 binop(Iop_Add64, cc_dep1, cc_dep2),
1081 mkU64(0)));
1084 /*---------------- ADDL ----------------*/
1086 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1087 /* This is very commonly generated by Javascript JITs, for
1088 the idiom "do a 32-bit add and jump to out-of-line code if
1089 an overflow occurs". */
1090 /* long add, then O (overflow)
1091 --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1092 --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1093 --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1095 vassert(isIRAtom(cc_dep1));
1096 vassert(isIRAtom(cc_dep2));
1097 return
1098 binop(Iop_And64,
1099 binop(Iop_Shr64,
1100 binop(Iop_And64,
1101 unop(Iop_Not64,
1102 binop(Iop_Xor64, cc_dep1, cc_dep2)),
1103 binop(Iop_Xor64,
1104 cc_dep1,
1105 binop(Iop_Add64, cc_dep1, cc_dep2))),
1106 mkU8(31)),
1107 mkU64(1));
1111 /*---------------- SUBQ ----------------*/
1113 /* 0, */
1114 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1115 /* long long sub/cmp, then O (overflow)
1116 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1117 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1119 vassert(isIRAtom(cc_dep1));
1120 vassert(isIRAtom(cc_dep2));
1121 return binop(Iop_Shr64,
1122 binop(Iop_And64,
1123 binop(Iop_Xor64, cc_dep1, cc_dep2),
1124 binop(Iop_Xor64,
1125 cc_dep1,
1126 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1127 mkU8(63));
1129 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1130 /* No action. Never yet found a test case. */
1133 /* 2, 3 */
1134 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1135 /* long long sub/cmp, then B (unsigned less than)
1136 --> test dst <u src */
1137 return unop(Iop_1Uto64,
1138 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1140 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1141 /* long long sub/cmp, then NB (unsigned greater than or equal)
1142 --> test src <=u dst */
1143 /* Note, args are opposite way round from the usual */
1144 return unop(Iop_1Uto64,
1145 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1148 /* 4, 5 */
1149 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1150 /* long long sub/cmp, then Z --> test dst==src */
1151 return unop(Iop_1Uto64,
1152 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1154 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1155 /* long long sub/cmp, then NZ --> test dst!=src */
1156 return unop(Iop_1Uto64,
1157 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1160 /* 6, 7 */
1161 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1162 /* long long sub/cmp, then BE (unsigned less than or equal)
1163 --> test dst <=u src */
1164 return unop(Iop_1Uto64,
1165 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1167 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1168 /* long long sub/cmp, then NBE (unsigned greater than)
1169 --> test !(dst <=u src) */
1170 return binop(Iop_Xor64,
1171 unop(Iop_1Uto64,
1172 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1173 mkU64(1));
1176 /* 8, 9 */
1177 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1178 /* long long sub/cmp, then S (negative)
1179 --> (dst-src)[63]
1180 --> (dst-src) >>u 63 */
1181 return binop(Iop_Shr64,
1182 binop(Iop_Sub64, cc_dep1, cc_dep2),
1183 mkU8(63));
1185 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1186 /* long long sub/cmp, then NS (not negative)
1187 --> (dst-src)[63] ^ 1
1188 --> ((dst-src) >>u 63) ^ 1 */
1189 return binop(Iop_Xor64,
1190 binop(Iop_Shr64,
1191 binop(Iop_Sub64, cc_dep1, cc_dep2),
1192 mkU8(63)),
1193 mkU64(1));
1196 /* 12, 13 */
1197 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1198 /* long long sub/cmp, then L (signed less than)
1199 --> test dst <s src */
1200 return unop(Iop_1Uto64,
1201 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1203 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1204 /* long long sub/cmp, then NL (signed greater than or equal)
1205 --> test dst >=s src
1206 --> test src <=s dst */
1207 return unop(Iop_1Uto64,
1208 binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1211 /* 14, 15 */
1212 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1213 /* long long sub/cmp, then LE (signed less than or equal)
1214 --> test dst <=s src */
1215 return unop(Iop_1Uto64,
1216 binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1218 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1219 /* long sub/cmp, then NLE (signed greater than)
1220 --> test !(dst <=s src)
1221 --> test (dst >s src)
1222 --> test (src <s dst) */
1223 return unop(Iop_1Uto64,
1224 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1228 /*---------------- SUBL ----------------*/
1230 /* 0, */
1231 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1232 /* This is very commonly generated by Javascript JITs, for
1233 the idiom "do a 32-bit subtract and jump to out-of-line
1234 code if an overflow occurs". */
1235 /* long sub/cmp, then O (overflow)
1236 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1237 --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1239 vassert(isIRAtom(cc_dep1));
1240 vassert(isIRAtom(cc_dep2));
1241 return
1242 binop(Iop_And64,
1243 binop(Iop_Shr64,
1244 binop(Iop_And64,
1245 binop(Iop_Xor64, cc_dep1, cc_dep2),
1246 binop(Iop_Xor64,
1247 cc_dep1,
1248 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1249 mkU8(31)),
1250 mkU64(1));
1252 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1253 /* No action. Never yet found a test case. */
1256 /* 2, 3 */
1258 /* It appears that LLVM 5.0 and later have a new way to find out
1259 whether the top N bits of a word W are all zero, by computing
1261 W <u 0---(N-1)---0 1 0---0
1263 In particular, the result will be defined if the top N bits of W
1264 are defined, even if the trailing bits -- those corresponding to
1265 the 0---0 section -- are undefined. Rather than make Memcheck
1266 more complex, we detect this case where we can and shift out the
1267 irrelevant and potentially undefined bits. */
1268 Int n = 0;
1269 if (isU64(cc_op, AMD64G_CC_OP_SUBL)
1270 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))
1271 && (n = isU64_1_shl_N(cc_dep2)) > 0) {
1272 /* long sub/cmp, then B (unsigned less than),
1273 where dep2 is a power of 2:
1274 -> CmpLT32(dep1, 1 << N)
1275 -> CmpEQ32(dep1 >>u N, 0)
1277 long sub/cmp, then NB (unsigned greater than or equal),
1278 where dep2 is a power of 2:
1279 -> CmpGE32(dep1, 1 << N)
1280 -> CmpNE32(dep1 >>u N, 0)
1281 This avoids CmpLT32U/CmpGE32U being applied to potentially
1282 uninitialised bits in the area being shifted out. */
1283 vassert(n >= 1 && n <= 31);
1284 Bool isNB = isU64(cond, AMD64CondNB);
1285 return unop(Iop_1Uto64,
1286 binop(isNB ? Iop_CmpNE32 : Iop_CmpEQ32,
1287 binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1288 mkU8(n)),
1289 mkU32(0)));
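/* Worked example (editorial note, not in the original source): for the
   idiom described above, e.g. "cmpl $0x10000, %eax ; jb ..." testing
   whether the top 16 bits of %eax are zero, dep2 is 1 << 16, so the B
   condition is rewritten as CmpEQ32(%eax >>u 16, 0); Memcheck then never
   sees the possibly undefined low 16 bits. */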
1292 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1293 /* long sub/cmp, then B (unsigned less than)
1294 --> test dst <u src */
1295 return unop(Iop_1Uto64,
1296 binop(Iop_CmpLT32U,
1297 unop(Iop_64to32, cc_dep1),
1298 unop(Iop_64to32, cc_dep2)));
1300 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1301 /* long sub/cmp, then NB (unsigned greater than or equal)
1302 --> test src <=u dst */
1303 /* Note, args are opposite way round from the usual */
1304 return unop(Iop_1Uto64,
1305 binop(Iop_CmpLE32U,
1306 unop(Iop_64to32, cc_dep2),
1307 unop(Iop_64to32, cc_dep1)));
1310 /* 4, 5 */
1311 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1312 /* long sub/cmp, then Z --> test dst==src */
1313 return unop(Iop_1Uto64,
1314 binop(Iop_CmpEQ32,
1315 unop(Iop_64to32, cc_dep1),
1316 unop(Iop_64to32, cc_dep2)));
1318 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1319 /* long sub/cmp, then NZ --> test dst!=src */
1320 return unop(Iop_1Uto64,
1321 binop(Iop_CmpNE32,
1322 unop(Iop_64to32, cc_dep1),
1323 unop(Iop_64to32, cc_dep2)));
1326 /* 6, 7 */
1327 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1328 /* long sub/cmp, then BE (unsigned less than or equal)
1329 --> test dst <=u src */
1330 return unop(Iop_1Uto64,
1331 binop(Iop_CmpLE32U,
1332 unop(Iop_64to32, cc_dep1),
1333 unop(Iop_64to32, cc_dep2)));
1335 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1336 /* long sub/cmp, then NBE (unsigned greater than)
1337 --> test src <u dst */
1338 /* Note, args are opposite way round from the usual */
1339 return unop(Iop_1Uto64,
1340 binop(Iop_CmpLT32U,
1341 unop(Iop_64to32, cc_dep2),
1342 unop(Iop_64to32, cc_dep1)));
1345 /* 8, 9 */
1346 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1347 /* long sub/cmp, then S (negative)
1348 --> (dst-src)[31]
1349 --> ((dst -64 src) >>u 31) & 1
1350 Pointless to narrow the args to 32 bit before the subtract. */
1351 return binop(Iop_And64,
1352 binop(Iop_Shr64,
1353 binop(Iop_Sub64, cc_dep1, cc_dep2),
1354 mkU8(31)),
1355 mkU64(1));
1357 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1358 /* long sub/cmp, then NS (not negative)
1359 --> (dst-src)[31] ^ 1
1360 --> (((dst -64 src) >>u 31) & 1) ^ 1
1361 Pointless to narrow the args to 32 bit before the subtract. */
1362 return binop(Iop_Xor64,
1363 binop(Iop_And64,
1364 binop(Iop_Shr64,
1365 binop(Iop_Sub64, cc_dep1, cc_dep2),
1366 mkU8(31)),
1367 mkU64(1)),
1368 mkU64(1));
1371 /* 12, 13 */
1372 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1373 /* long sub/cmp, then L (signed less than)
1374 --> test dst <s src */
1375 return unop(Iop_1Uto64,
1376 binop(Iop_CmpLT32S,
1377 unop(Iop_64to32, cc_dep1),
1378 unop(Iop_64to32, cc_dep2)));
1380 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1381 /* long sub/cmp, then NL (signed greater than or equal)
1382 --> test dst >=s src
1383 --> test src <=s dst */
1384 return unop(Iop_1Uto64,
1385 binop(Iop_CmpLE32S,
1386 unop(Iop_64to32, cc_dep2),
1387 unop(Iop_64to32, cc_dep1)));
1390 /* 14, 15 */
1391 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1392 /* long sub/cmp, then LE (signed less than or equal)
1393 --> test dst <=s src */
1394 return unop(Iop_1Uto64,
1395 binop(Iop_CmpLE32S,
1396 unop(Iop_64to32, cc_dep1),
1397 unop(Iop_64to32, cc_dep2)));
1400 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1401 /* long sub/cmp, then NLE (signed greater than)
1402 --> test !(dst <=s src)
1403 --> test (dst >s src)
1404 --> test (src <s dst) */
1405 return unop(Iop_1Uto64,
1406 binop(Iop_CmpLT32S,
1407 unop(Iop_64to32, cc_dep2),
1408 unop(Iop_64to32, cc_dep1)));
1412 /*---------------- SUBW ----------------*/
1414 /* 4, 5 */
1415 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1416 /* word sub/cmp, then Z --> test dst==src */
1417 return unop(Iop_1Uto64,
1418 binop(Iop_CmpEQ16,
1419 unop(Iop_64to16,cc_dep1),
1420 unop(Iop_64to16,cc_dep2)));
1422 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1423 /* word sub/cmp, then NZ --> test dst!=src */
1424 return unop(Iop_1Uto64,
1425 binop(Iop_CmpNE16,
1426 unop(Iop_64to16,cc_dep1),
1427 unop(Iop_64to16,cc_dep2)));
1430 /* 6, */
1431 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1432 /* word sub/cmp, then BE (unsigned less than or equal)
1433 --> test dst <=u src */
1434 return unop(Iop_1Uto64,
1435 binop(Iop_CmpLE64U,
1436 binop(Iop_Shl64, cc_dep1, mkU8(48)),
1437 binop(Iop_Shl64, cc_dep2, mkU8(48))));
1440 /* 8, 9 */
1441 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1442 && isU64(cc_dep2, 0)) {
1443 /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1444 --> test dst <s 0
1445 --> (ULong)dst[15]
1446 This is yet another scheme by which clang figures out if the
1447 top bit of a word is 1 or 0. See also LOGICB/CondS below. */
1448 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1449 for a 16-bit comparison, since the args to the helper
1450 function are always U64s. */
1451 return binop(Iop_And64,
1452 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1453 mkU64(1));
1455 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1456 && isU64(cc_dep2, 0)) {
1457 /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1458 --> test !(dst <s 0)
1459 --> (ULong) !dst[15]
1461 return binop(Iop_Xor64,
1462 binop(Iop_And64,
1463 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1464 mkU64(1)),
1465 mkU64(1));
1468 /* 14, */
1469 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1470 /* word sub/cmp, then LE (signed less than or equal)
1471 --> test dst <=s src */
1472 return unop(Iop_1Uto64,
1473 binop(Iop_CmpLE64S,
1474 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1475 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1479 /*---------------- SUBB ----------------*/
1481 /* 2, 3 */
1482 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1483 /* byte sub/cmp, then B (unsigned less than)
1484 --> test dst <u src */
1485 return unop(Iop_1Uto64,
1486 binop(Iop_CmpLT64U,
1487 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1488 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1490 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1491 /* byte sub/cmp, then NB (unsigned greater than or equal)
1492 --> test src <=u dst */
1493 /* Note, args are opposite way round from the usual */
1494 return unop(Iop_1Uto64,
1495 binop(Iop_CmpLE64U,
1496 binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1497 binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1500 /* 4, 5 */
1501 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1502 /* byte sub/cmp, then Z --> test dst==src */
1503 return unop(Iop_1Uto64,
1504 binop(Iop_CmpEQ8,
1505 unop(Iop_64to8,cc_dep1),
1506 unop(Iop_64to8,cc_dep2)));
1508 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1509 /* byte sub/cmp, then NZ --> test dst!=src */
1510 return unop(Iop_1Uto64,
1511 binop(Iop_CmpNE8,
1512 unop(Iop_64to8,cc_dep1),
1513 unop(Iop_64to8,cc_dep2)));
1516 /* 6, */
1517 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1518 /* byte sub/cmp, then BE (unsigned less than or equal)
1519 --> test dst <=u src */
1520 return unop(Iop_1Uto64,
1521 binop(Iop_CmpLE64U,
1522 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1523 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1526 /* 8, 9 */
1527 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1528 && isU64(cc_dep2, 0)) {
1529 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1530 --> test dst <s 0
1531 --> (ULong)dst[7]
1532 This is yet another scheme by which gcc figures out if the
1533 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1534 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1535 for an 8-bit comparison, since the args to the helper
1536 function are always U64s. */
1537 return binop(Iop_And64,
1538 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1539 mkU64(1));
1541 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1542 && isU64(cc_dep2, 0)) {
1543 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1544 --> test !(dst <s 0)
1545 --> (ULong) !dst[7]
1547 return binop(Iop_Xor64,
1548 binop(Iop_And64,
1549 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1550 mkU64(1)),
1551 mkU64(1));
1554 /*---------------- LOGICQ ----------------*/
1556 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1557 /* long long and/or/xor, then Z --> test dst==0 */
1558 return unop(Iop_1Uto64,
1559 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1561 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1562 /* long long and/or/xor, then NZ --> test dst!=0 */
1563 return unop(Iop_1Uto64,
1564 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1567 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1568 /* long long and/or/xor, then L
1569 LOGIC sets SF and ZF according to the
1570 result and makes OF be zero. L computes SF ^ OF, but
1571 OF is zero, so this reduces to SF -- which will be 1 iff
1572 the result is < signed 0. Hence ...
1574 return unop(Iop_1Uto64,
1575 binop(Iop_CmpLT64S,
1576 cc_dep1,
1577 mkU64(0)));
1580 /*---------------- LOGICL ----------------*/
1582 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1583 /* long and/or/xor, then Z --> test dst==0 */
1584 return unop(Iop_1Uto64,
1585 binop(Iop_CmpEQ32,
1586 unop(Iop_64to32, cc_dep1),
1587 mkU32(0)));
1589 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1590 /* long and/or/xor, then NZ --> test dst!=0 */
1591 return unop(Iop_1Uto64,
1592 binop(Iop_CmpNE32,
1593 unop(Iop_64to32, cc_dep1),
1594 mkU32(0)));
1597 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1598 /* long and/or/xor, then LE
1599 This is pretty subtle. LOGIC sets SF and ZF according to the
1600 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1601 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1602 the result is <=signed 0. Hence ...
1604 return unop(Iop_1Uto64,
1605 binop(Iop_CmpLE32S,
1606 unop(Iop_64to32, cc_dep1),
1607 mkU32(0)));
1610 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1611 /* long and/or/xor, then S --> (ULong)result[31] */
1612 return binop(Iop_And64,
1613 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1614 mkU64(1));
1616 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1617 /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1618 return binop(Iop_Xor64,
1619 binop(Iop_And64,
1620 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1621 mkU64(1)),
1622 mkU64(1));
1625 /*---------------- LOGICW ----------------*/
1627 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1628 /* word and/or/xor, then Z --> test dst==0 */
1629 return unop(Iop_1Uto64,
1630 binop(Iop_CmpEQ64,
1631 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1632 mkU64(0)));
1634 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1635 /* word and/or/xor, then NZ --> test dst!=0 */
1636 return unop(Iop_1Uto64,
1637 binop(Iop_CmpNE64,
1638 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1639 mkU64(0)));
1642 /*---------------- LOGICB ----------------*/
1644 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1645 /* byte and/or/xor, then Z --> test dst==0 */
1646 return unop(Iop_1Uto64,
1647 binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
1648 mkU64(0)));
1650 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1651 /* byte and/or/xor, then NZ --> test dst!=0 */
1652 return unop(Iop_1Uto64,
1653 binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
1654 mkU64(0)));
1657 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1658 /* this is an idiom gcc sometimes uses to find out if the top
1659 bit of a byte register is set: eg testb %al,%al; js ..
1660 Since it just depends on the top bit of the byte, extract
1661 that bit and explicitly get rid of all the rest. This
1662 helps memcheck avoid false positives in the case where any
1663 of the other bits in the byte are undefined. */
1664 /* byte and/or/xor, then S --> (UInt)result[7] */
1665 return binop(Iop_And64,
1666 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1667 mkU64(1));
1669 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1670 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1671 return binop(Iop_Xor64,
1672 binop(Iop_And64,
1673 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1674 mkU64(1)),
1675 mkU64(1));
1678 /*---------------- INCB ----------------*/
1680 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1681 /* 8-bit inc, then LE --> sign bit of the arg */
1682 return binop(Iop_And64,
1683 binop(Iop_Shr64,
1684 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1685 mkU8(7)),
1686 mkU64(1));
1689 /*---------------- INCW ----------------*/
1691 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1692 /* 16-bit inc, then Z --> test dst == 0 */
1693 return unop(Iop_1Uto64,
1694 binop(Iop_CmpEQ64,
1695 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1696 mkU64(0)));
1699 /*---------------- DECL ----------------*/
1701 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1702 /* dec L, then Z --> test dst == 0 */
1703 return unop(Iop_1Uto64,
1704 binop(Iop_CmpEQ32,
1705 unop(Iop_64to32, cc_dep1),
1706 mkU32(0)));
1709 /*---------------- DECW ----------------*/
1711 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1712 /* 16-bit dec, then NZ --> test dst != 0 */
1713 return unop(Iop_1Uto64,
1714 binop(Iop_CmpNE64,
1715 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1716 mkU64(0)));
1719 /*---------------- SHRQ ----------------*/
1721 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1722 /* SHRQ, then Z --> test dep1 == 0 */
1723 return unop(Iop_1Uto64,
1724 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1726 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1727 /* SHRQ, then NZ --> test dep1 != 0 */
1728 return unop(Iop_1Uto64,
1729 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1732 /*---------------- SHRL ----------------*/
1734 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1735 /* SHRL, then Z --> test dep1 == 0 */
1736 return unop(Iop_1Uto64,
1737 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1738 mkU32(0)));
1740 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1741 /* SHRL, then NZ --> test dep1 != 0 */
1742 return unop(Iop_1Uto64,
1743 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1744 mkU32(0)));
1747 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1748 /* SHRL/SARL, then S --> (ULong)result[31] */
1749 return binop(Iop_And64,
1750 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1751 mkU64(1));
1753 // The following looks correct to me, but never seems to happen because
1754 // the front end converts jns to js by switching the fallthrough vs
1755 // taken addresses. See jcc_01(). But then why do other conditions
1756 // considered by this function show up in both variants (xx and Nxx) ?
1757 //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1758 // /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1759 // vassert(0);
1760 // return binop(Iop_Xor64,
1761 // binop(Iop_And64,
1762 // binop(Iop_Shr64, cc_dep1, mkU8(31)),
1763 // mkU64(1)),
1764 // mkU64(1));
1767 /*---------------- COPY ----------------*/
1768 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1769 jbe" for example. */
1771 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1772 && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1773 /* COPY, then BE --> extract C and Z from dep1, and test (C
1774 or Z == 1). */
1775 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1776 or Z == 0). */
1777 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1778 return
1779 unop(
1780 Iop_1Uto64,
1781 binop(
1782 Iop_CmpEQ64,
1783 binop(
1784 Iop_And64,
1785 binop(
1786 Iop_Or64,
1787 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1788 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1790 mkU64(1)
1792 mkU64(nnn)
1797 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1798 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
1799 /* COPY, then B --> extract C from dep1, and test (C == 1). */
1800 /* COPY, then NB --> extract C from dep1, and test (C == 0). */
1801 ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
1802 return
1803 unop(
1804 Iop_1Uto64,
1805 binop(
1806 Iop_CmpEQ64,
1807 binop(
1808 Iop_And64,
1809 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1810 mkU64(1)
1812 mkU64(nnn)
1817 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1818 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1819 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1820 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1821 ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1822 return
1823 unop(
1824 Iop_1Uto64,
1825 binop(
1826 Iop_CmpEQ64,
1827 binop(
1828 Iop_And64,
1829 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1830 mkU64(1)
1831 ),
1832 mkU64(nnn)
1833 )
1834 );
1835 }
1837 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1838 && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
1839 /* COPY, then P --> extract P from dep1, and test (P == 1). */
1840 /* COPY, then NP --> extract P from dep1, and test (P == 0). */
1841 ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
1842 return
1843 unop(
1844 Iop_1Uto64,
1845 binop(
1846 Iop_CmpEQ64,
1847 binop(
1848 Iop_And64,
1849 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1850 mkU64(1)
1851 ),
1852 mkU64(nnn)
1853 )
1854 );
1855 }
1857 return NULL;
1860 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1862 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1863 /* specialise calls to above "calculate_rflags_c" function */
1864 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1865 vassert(arity == 4);
1866 cc_op = args[0];
1867 cc_dep1 = args[1];
1868 cc_dep2 = args[2];
1869 cc_ndep = args[3];
1871 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1872 /* C after sub denotes unsigned less than */
1873 return unop(Iop_1Uto64,
1874 binop(Iop_CmpLT64U,
1875 cc_dep1,
1876 cc_dep2));
1878 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1879 /* C after sub denotes unsigned less than */
1880 return unop(Iop_1Uto64,
1881 binop(Iop_CmpLT32U,
1882 unop(Iop_64to32, cc_dep1),
1883 unop(Iop_64to32, cc_dep2)));
1885 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1886 /* C after sub denotes unsigned less than */
1887 return unop(Iop_1Uto64,
1888 binop(Iop_CmpLT64U,
1889 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1890 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1892 if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1893 /* C after add denotes sum <u either arg */
1894 return unop(Iop_1Uto64,
1895 binop(Iop_CmpLT64U,
1896 binop(Iop_Add64, cc_dep1, cc_dep2),
1897 cc_dep1));
1899 if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1900 /* C after add denotes sum <u either arg */
1901 return unop(Iop_1Uto64,
1902 binop(Iop_CmpLT32U,
1903 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1904 unop(Iop_64to32, cc_dep1)));
1906 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1907 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1908 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1909 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1910 /* cflag after logic is zero */
1911 return mkU64(0);
1913 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1914 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1915 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1916 return cc_ndep;
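/* (inc and dec leave the carry flag untouched, so the front end
   stashes the pre-existing C bit in CC_NDEP for these thunks; handing
   it straight back here is exact, not an approximation.) */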
1919 # if 0
1920 if (cc_op->tag == Iex_Const) {
1921 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1923 # endif
1925 return NULL;
1928 # undef unop
1929 # undef binop
1930 # undef mkU64
1931 # undef mkU32
1932 # undef mkU8
1934 return NULL;
1938 /*---------------------------------------------------------------*/
1939 /*--- Supporting functions for x87 FPU activities. ---*/
1940 /*---------------------------------------------------------------*/
1942 static inline Bool host_is_little_endian ( void )
1944 UInt x = 0x76543210;
1945 UChar* p = (UChar*)(&x);
1946 return toBool(*p == 0x10);
1949 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1950 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
1951 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1953 Bool mantissaIsZero;
1954 Int bexp;
1955 UChar sign;
1956 UChar* f64;
1958 vassert(host_is_little_endian());
1960 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1962 f64 = (UChar*)(&dbl);
1963 sign = toUChar( (f64[7] >> 7) & 1 );
1965 /* First off, if the tag indicates the register was empty,
1966 return 1,0,sign,1 */
1967 if (tag == 0) {
1968 /* vex_printf("Empty\n"); */
1969 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1970 | AMD64G_FC_MASK_C0;
1973 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1974 bexp &= 0x7FF;
1976 mantissaIsZero
1977 = toBool(
1978 (f64[6] & 0x0F) == 0
1979 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1982 /* If both exponent and mantissa are zero, the value is zero.
1983 Return 1,0,sign,0. */
1984 if (bexp == 0 && mantissaIsZero) {
1985 /* vex_printf("Zero\n"); */
1986 return AMD64G_FC_MASK_C3 | 0
1987 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1990 /* If exponent is zero but mantissa isn't, it's a denormal.
1991 Return 1,1,sign,0. */
1992 if (bexp == 0 && !mantissaIsZero) {
1993 /* vex_printf("Denormal\n"); */
1994 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1995 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1998 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1999 Return 0,1,sign,1. */
2000 if (bexp == 0x7FF && mantissaIsZero) {
2001 /* vex_printf("Inf\n"); */
2002 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2003 | AMD64G_FC_MASK_C0;
2006 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2007 Return 0,0,sign,1. */
2008 if (bexp == 0x7FF && !mantissaIsZero) {
2009 /* vex_printf("NaN\n"); */
2010 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2013 /* Uh, ok, we give up. It must be a normal finite number.
2014 Return 0,1,sign,0.
2015 */
2016 /* vex_printf("normal\n"); */
2017 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
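/* For reference, the (C3,C2,C0) encodings produced above, with C1
   holding the sign bit in every case: empty 1,0,1; zero 1,0,0;
   denormal 1,1,0; infinity 0,1,1; NaN 0,0,1; normal finite 0,1,0. */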
2021 /* This is used to implement both 'frstor' and 'fldenv'. The latter
2022 appears to differ from the former only in that the 8 FP registers
2023 themselves are not transferred into the guest state. */
2024 static
2025 VexEmNote do_put_x87 ( Bool moveRegs,
2026 /*IN*/Fpu_State* x87_state,
2027 /*OUT*/VexGuestAMD64State* vex_state )
2029 Int stno, preg;
2030 UInt tag;
2031 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2032 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2033 UInt ftop = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2034 UInt tagw = x87_state->env[FP_ENV_TAG];
2035 UInt fpucw = x87_state->env[FP_ENV_CTRL];
2036 UInt c3210 = x87_state->env[FP_ENV_STAT] & 0x4700;
2037 VexEmNote ew;
2038 UInt fpround;
2039 ULong pair;
2041 /* Copy registers and tags */
2042 for (stno = 0; stno < 8; stno++) {
2043 preg = (stno + ftop) & 7;
2044 tag = (tagw >> (2*preg)) & 3;
2045 if (tag == 3) {
2046 /* register is empty */
2047 /* hmm, if it's empty, does it still get written? Probably
2048 safer to say it does. If we don't, memcheck could get out
2049 of sync, in that it thinks all FP registers are defined by
2050 this helper, but in reality some have not been updated. */
2051 if (moveRegs)
2052 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2053 vexTags[preg] = 0;
2054 } else {
2055 /* register is non-empty */
2056 if (moveRegs)
2057 convert_f80le_to_f64le( &x87_state->reg[10*stno],
2058 (UChar*)&vexRegs[preg] );
2059 vexTags[preg] = 1;
2063 /* stack pointer */
2064 vex_state->guest_FTOP = ftop;
2066 /* status word */
2067 vex_state->guest_FC3210 = c3210;
2069 /* handle the control word, setting FPROUND and detecting any
2070 emulation warnings. */
2071 pair = amd64g_check_fldcw ( (ULong)fpucw );
2072 fpround = (UInt)pair & 0xFFFFFFFFULL;
2073 ew = (VexEmNote)(pair >> 32);
2075 vex_state->guest_FPROUND = fpround & 3;
2077 /* emulation warnings --> caller */
2078 return ew;
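/* Note on the tag conversion above: the x87 tag word carries two bits
   per physical register (00 = valid, 01 = zero, 10 = special,
   11 = empty), whereas the VEX guest state keeps a single byte per
   register (0 = empty, 1 = full), so all three non-empty encodings
   collapse to 1. */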
2082 /* Create an x87 FPU state from the guest state, as close as
2083 we can approximate it. */
2084 static
2085 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2086 /*OUT*/Fpu_State* x87_state )
2088 Int i, stno, preg;
2089 UInt tagw;
2090 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2091 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2092 UInt ftop = vex_state->guest_FTOP;
2093 UInt c3210 = vex_state->guest_FC3210;
2095 for (i = 0; i < 14; i++)
2096 x87_state->env[i] = 0;
2098 x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2099 = x87_state->env[13] = 0xFFFF;
2100 x87_state->env[FP_ENV_STAT]
2101 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2102 x87_state->env[FP_ENV_CTRL]
2103 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2105 /* Dump the register stack in ST order. */
2106 tagw = 0;
2107 for (stno = 0; stno < 8; stno++) {
2108 preg = (stno + ftop) & 7;
2109 if (vexTags[preg] == 0) {
2110 /* register is empty */
2111 tagw |= (3 << (2*preg));
2112 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2113 &x87_state->reg[10*stno] );
2114 } else {
2115 /* register is full. */
2116 tagw |= (0 << (2*preg));
2117 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2118 &x87_state->reg[10*stno] );
2121 x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2125 /*---------------------------------------------------------------*/
2126 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2127 /*---------------------------------------------------------------*/
2129 /* CALLED FROM GENERATED CODE */
2130 /* DIRTY HELPER (reads guest state, writes guest mem) */
2131 /* XSAVE component 0 is the x87 FPU state. */
2132 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2133 ( VexGuestAMD64State* gst, HWord addr )
2135 /* Derived from values obtained from
2136 vendor_id : AuthenticAMD
2137 cpu family : 15
2138 model : 12
2139 model name : AMD Athlon(tm) 64 Processor 3200+
2140 stepping : 0
2141 cpu MHz : 2200.000
2142 cache size : 512 KB
2143 */
2144 /* Somewhat roundabout, but at least it's simple. */
2145 Fpu_State tmp;
2146 UShort* addrS = (UShort*)addr;
2147 UChar* addrC = (UChar*)addr;
2148 UShort fp_tags;
2149 UInt summary_tags;
2150 Int r, stno;
2151 UShort *srcS, *dstS;
2153 do_get_x87( gst, &tmp );
2155 /* Now build the proper fxsave x87 image from the fsave x87 image
2156 we just made. */
2158 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2159 addrS[1] = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2161 /* set addrS[2] in an endian-independent way */
2162 summary_tags = 0;
2163 fp_tags = tmp.env[FP_ENV_TAG];
2164 for (r = 0; r < 8; r++) {
2165 if ( ((fp_tags >> (2*r)) & 3) != 3 )
2166 summary_tags |= (1 << r);
2168 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
2169 addrC[5] = 0; /* pad */
2171 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2172 does not write this field. (?!) */
2173 addrS[3] = 0; /* BOGUS */
2175 /* RIP (Last x87 instruction pointer). From experimentation, the
2176 real CPU does not write this field. (?!) */
2177 addrS[4] = 0; /* BOGUS */
2178 addrS[5] = 0; /* BOGUS */
2179 addrS[6] = 0; /* BOGUS */
2180 addrS[7] = 0; /* BOGUS */
2182 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2183 does not write this field. (?!) */
2184 addrS[8] = 0; /* BOGUS */
2185 addrS[9] = 0; /* BOGUS */
2186 addrS[10] = 0; /* BOGUS */
2187 addrS[11] = 0; /* BOGUS */
2189 /* addrS[13,12] are MXCSR -- not written */
2190 /* addrS[15,14] are MXCSR_MASK -- not written */
2192 /* Copy in the FP registers, in ST order. */
2193 for (stno = 0; stno < 8; stno++) {
2194 srcS = (UShort*)(&tmp.reg[10*stno]);
2195 dstS = (UShort*)(&addrS[16 + 8*stno]);
2196 dstS[0] = srcS[0];
2197 dstS[1] = srcS[1];
2198 dstS[2] = srcS[2];
2199 dstS[3] = srcS[3];
2200 dstS[4] = srcS[4];
2201 dstS[5] = 0;
2202 dstS[6] = 0;
2203 dstS[7] = 0;
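/* Byte offsets of the FXSAVE-style image written above: 0 FCW, 2 FSW,
   4 FTW (abridged tag byte), 6 FOP, 8..15 RIP, 16..23 RDP,
   24..27 MXCSR, 28..31 MXCSR_MASK, then eight 16-byte slots starting
   at offset 32 holding ST(0)..ST(7) as 80-bit values zero-padded to
   16 bytes each. */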
2208 /* CALLED FROM GENERATED CODE */
2209 /* DIRTY HELPER (reads guest state, writes guest mem) */
2210 /* XSAVE component 1 is the SSE state. */
2211 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2212 ( VexGuestAMD64State* gst, HWord addr )
2214 UShort* addrS = (UShort*)addr;
2215 UInt mxcsr;
2217 /* The only non-register parts of the SSE state are MXCSR and
2218 MXCSR_MASK. */
2219 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2221 addrS[12] = toUShort(mxcsr); /* MXCSR */
2222 addrS[13] = toUShort(mxcsr >> 16);
2224 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2225 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2229 /* VISIBLE TO LIBVEX CLIENT */
2230 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2231 the result at the given address which represents a buffer of at
2232 least 416 bytes.
2234 This function is not called from generated code. FXSAVE is dealt
2235 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2236 functions above plus some in-line IR. This function is merely a
2237 convenience function for VEX's users.
2238 */
2239 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2240 /*OUT*/HWord fp_state )
2242 /* Do the x87 part */
2243 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2245 /* And now the SSE part, except for the registers themselves. */
2246 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2248 /* That's the first 160 bytes of the image done. */
2249 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2250 big-endian, these need to be byte-swapped. */
2251 U128 *xmm = (U128 *)(fp_state + 160);
2252 vassert(host_is_little_endian());
2254 # define COPY_U128(_dst,_src) \
2255 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2256 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2257 while (0)
2259 COPY_U128( xmm[0], gst->guest_YMM0 );
2260 COPY_U128( xmm[1], gst->guest_YMM1 );
2261 COPY_U128( xmm[2], gst->guest_YMM2 );
2262 COPY_U128( xmm[3], gst->guest_YMM3 );
2263 COPY_U128( xmm[4], gst->guest_YMM4 );
2264 COPY_U128( xmm[5], gst->guest_YMM5 );
2265 COPY_U128( xmm[6], gst->guest_YMM6 );
2266 COPY_U128( xmm[7], gst->guest_YMM7 );
2267 COPY_U128( xmm[8], gst->guest_YMM8 );
2268 COPY_U128( xmm[9], gst->guest_YMM9 );
2269 COPY_U128( xmm[10], gst->guest_YMM10 );
2270 COPY_U128( xmm[11], gst->guest_YMM11 );
2271 COPY_U128( xmm[12], gst->guest_YMM12 );
2272 COPY_U128( xmm[13], gst->guest_YMM13 );
2273 COPY_U128( xmm[14], gst->guest_YMM14 );
2274 COPY_U128( xmm[15], gst->guest_YMM15 );
2275 # undef COPY_U128
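/* A minimal usage sketch for a VEX client; the names "gstate" and
   "fpimg" are illustrative only, and the only stated requirement is a
   buffer of at least 416 bytes:

      VexGuestAMD64State gstate;
      UChar fpimg[512];
      LibVEX_GuestAMD64_fxsave ( &gstate, (HWord)fpimg );
      ...
      VexEmNote ew = LibVEX_GuestAMD64_fxrstor ( (HWord)fpimg, &gstate );
*/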
2279 /*---------------------------------------------------------------*/
2280 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2281 /*---------------------------------------------------------------*/
2283 /* CALLED FROM GENERATED CODE */
2284 /* DIRTY HELPER (writes guest state, reads guest mem) */
2285 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2286 ( VexGuestAMD64State* gst, HWord addr )
2288 Fpu_State tmp;
2289 UShort* addrS = (UShort*)addr;
2290 UChar* addrC = (UChar*)addr;
2291 UShort fp_tags;
2292 Int r, stno, i;
2294 /* Copy the x87 registers out of the image, into a temporary
2295 Fpu_State struct. */
2296 for (i = 0; i < 14; i++) tmp.env[i] = 0;
2297 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2298 /* fill in tmp.reg[0..7] */
2299 for (stno = 0; stno < 8; stno++) {
2300 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2301 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2302 dstS[0] = srcS[0];
2303 dstS[1] = srcS[1];
2304 dstS[2] = srcS[2];
2305 dstS[3] = srcS[3];
2306 dstS[4] = srcS[4];
2308 /* fill in tmp.env[0..13] */
2309 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2310 tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2312 fp_tags = 0;
2313 for (r = 0; r < 8; r++) {
2314 if (addrC[4] & (1<<r))
2315 fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2316 else
2317 fp_tags |= (3 << (2*r)); /* EMPTY */
2319 tmp.env[FP_ENV_TAG] = fp_tags;
2321 /* Now write 'tmp' into the guest state. */
2322 VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2324 return warnX87;
2328 /* CALLED FROM GENERATED CODE */
2329 /* DIRTY HELPER (writes guest state, reads guest mem) */
2330 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2331 ( VexGuestAMD64State* gst, HWord addr )
2333 UShort* addrS = (UShort*)addr;
2334 UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
2335 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2336 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
2338 VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2340 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2341 return warnXMM;
2345 /* VISIBLE TO LIBVEX CLIENT */
2346 /* Do FXRSTOR from the supplied address and store read values to the given
2347 VexGuestAMD64State structure.
2349 This function is not called from generated code. FXRSTOR is dealt
2350 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2351 functions above plus some in-line IR. This function is merely a
2352 convenience function for VEX's users.
2353 */
2354 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2355 /*MOD*/VexGuestAMD64State* gst )
2357 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2358 to be byte-swapped. */
2359 U128 *xmm = (U128 *)(fp_state + 160);
2361 vassert(host_is_little_endian());
2363 # define COPY_U128(_dst,_src) \
2364 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2365 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2366 while (0)
2368 COPY_U128( gst->guest_YMM0, xmm[0] );
2369 COPY_U128( gst->guest_YMM1, xmm[1] );
2370 COPY_U128( gst->guest_YMM2, xmm[2] );
2371 COPY_U128( gst->guest_YMM3, xmm[3] );
2372 COPY_U128( gst->guest_YMM4, xmm[4] );
2373 COPY_U128( gst->guest_YMM5, xmm[5] );
2374 COPY_U128( gst->guest_YMM6, xmm[6] );
2375 COPY_U128( gst->guest_YMM7, xmm[7] );
2376 COPY_U128( gst->guest_YMM8, xmm[8] );
2377 COPY_U128( gst->guest_YMM9, xmm[9] );
2378 COPY_U128( gst->guest_YMM10, xmm[10] );
2379 COPY_U128( gst->guest_YMM11, xmm[11] );
2380 COPY_U128( gst->guest_YMM12, xmm[12] );
2381 COPY_U128( gst->guest_YMM13, xmm[13] );
2382 COPY_U128( gst->guest_YMM14, xmm[14] );
2383 COPY_U128( gst->guest_YMM15, xmm[15] );
2385 # undef COPY_U128
2387 VexEmNote warnXMM
2388 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2389 VexEmNote warnX87
2390 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2392 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2393 if (warnX87 != EmNote_NONE)
2394 return warnX87;
2395 else
2396 return warnXMM;
2400 /*---------------------------------------------------------------*/
2401 /*--- Supporting functions for FSAVE/FRSTOR ---*/
2402 /*---------------------------------------------------------------*/
2404 /* DIRTY HELPER (writes guest state) */
2405 /* Initialise the x87 FPU state as per 'finit'. */
2406 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2408 Int i;
2409 gst->guest_FTOP = 0;
2410 for (i = 0; i < 8; i++) {
2411 gst->guest_FPTAG[i] = 0; /* empty */
2412 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2414 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2415 gst->guest_FC3210 = 0;
2419 /* CALLED FROM GENERATED CODE */
2420 /* DIRTY HELPER (reads guest memory) */
2421 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2423 ULong f64;
2424 convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2425 return f64;
2428 /* CALLED FROM GENERATED CODE */
2429 /* DIRTY HELPER (writes guest memory) */
2430 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2432 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2436 /* CALLED FROM GENERATED CODE */
2437 /* CLEAN HELPER */
2438 /* mxcsr[15:0] contains a SSE native format MXCSR value.
2439 Extract from it the required SSEROUND value and any resulting
2440 emulation warning, and return (warn << 32) | sseround value.
2441 */
2442 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2444 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
2445 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2446 ULong rmode = (mxcsr >> 13) & 3;
2448 /* Detect any required emulation warnings. */
2449 VexEmNote ew = EmNote_NONE;
2451 if ((mxcsr & 0x1F80) != 0x1F80) {
2452 /* unmasked exceptions! */
2453 ew = EmWarn_X86_sseExns;
2455 else
2456 if (mxcsr & (1<<15)) {
2457 /* FZ is set */
2458 ew = EmWarn_X86_fz;
2460 else
2461 if (mxcsr & (1<<6)) {
2462 /* DAZ is set */
2463 ew = EmWarn_X86_daz;
2466 return (((ULong)ew) << 32) | ((ULong)rmode);
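/* Sanity check: the power-on MXCSR value 0x1F80 (all exceptions
   masked, round-to-nearest) gives rmode 0 and no warning, i.e. a
   result of 0; 0x7F80 (the same, but RC = 11b, round-toward-zero)
   gives a result of 3. */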
2470 /* CALLED FROM GENERATED CODE */
2471 /* CLEAN HELPER */
2472 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2473 native format MXCSR value. */
2474 ULong amd64g_create_mxcsr ( ULong sseround )
2476 sseround &= 3;
2477 return 0x1F80 | (sseround << 13);
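/* E.g. sseround 0 (round-nearest) maps back to 0x1F80, the power-on
   default, and sseround 3 (round-toward-zero) maps to 0x7F80. */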
2481 /* CLEAN HELPER */
2482 /* fpucw[15:0] contains a x87 native format FPU control word.
2483 Extract from it the required FPROUND value and any resulting
2484 emulation warning, and return (warn << 32) | fpround value.
2485 */
2486 ULong amd64g_check_fldcw ( ULong fpucw )
2488 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2489 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2490 ULong rmode = (fpucw >> 10) & 3;
2492 /* Detect any required emulation warnings. */
2493 VexEmNote ew = EmNote_NONE;
2495 if ((fpucw & 0x3F) != 0x3F) {
2496 /* unmasked exceptions! */
2497 ew = EmWarn_X86_x87exns;
2499 else
2500 if (((fpucw >> 8) & 3) != 3) {
2501 /* unsupported precision */
2502 ew = EmWarn_X86_x87precision;
2505 return (((ULong)ew) << 32) | ((ULong)rmode);
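/* Sanity check: the 'finit' default control word 0x037F (all
   exceptions masked, 64-bit precision, round-to-nearest) gives rmode 0
   and no warning; 0x0F7F (the same, but RC = 11b) gives rmode 3, again
   with no warning. */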
2509 /* CLEAN HELPER */
2510 /* Given fpround as an IRRoundingMode value, create a suitable x87
2511 native format FPU control word. */
2512 ULong amd64g_create_fpucw ( ULong fpround )
2514 fpround &= 3;
2515 return 0x037F | (fpround << 10);
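/* E.g. fpround 0 gives the 'finit' default 0x037F, and fpround 3
   (round-toward-zero) gives 0x0F7F. */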
2519 /* This is used to implement 'fldenv'.
2520 Reads 28 bytes at x87_state[0 .. 27]. */
2521 /* CALLED FROM GENERATED CODE */
2522 /* DIRTY HELPER */
2523 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2524 /*IN*/HWord x87_state)
2526 return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2530 /* CALLED FROM GENERATED CODE */
2531 /* DIRTY HELPER */
2532 /* Create an x87 FPU env from the guest state, as close as we can
2533 approximate it. Writes 28 bytes at x87_state[0..27]. */
2534 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2535 /*OUT*/HWord x87_state )
2537 Int i, stno, preg;
2538 UInt tagw;
2539 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2540 Fpu_State* x87 = (Fpu_State*)x87_state;
2541 UInt ftop = vex_state->guest_FTOP;
2542 ULong c3210 = vex_state->guest_FC3210;
2544 for (i = 0; i < 14; i++)
2545 x87->env[i] = 0;
2547 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2548 x87->env[FP_ENV_STAT]
2549 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2550 x87->env[FP_ENV_CTRL]
2551 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2553 /* Compute the x87 tag word. */
2554 tagw = 0;
2555 for (stno = 0; stno < 8; stno++) {
2556 preg = (stno + ftop) & 7;
2557 if (vexTags[preg] == 0) {
2558 /* register is empty */
2559 tagw |= (3 << (2*preg));
2560 } else {
2561 /* register is full. */
2562 tagw |= (0 << (2*preg));
2565 x87->env[FP_ENV_TAG] = toUShort(tagw);
2567 /* We don't dump the x87 registers, tho. */
2571 /* This is used to implement 'fnsave'.
2572 Writes 108 bytes at x87_state[0 .. 107]. */
2573 /* CALLED FROM GENERATED CODE */
2574 /* DIRTY HELPER */
2575 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2576 /*OUT*/HWord x87_state)
2578 do_get_x87( vex_state, (Fpu_State*)x87_state );
2582 /* This is used to implement 'fnsaves'.
2583 Writes 94 bytes at x87_state[0 .. 93]. */
2584 /* CALLED FROM GENERATED CODE */
2585 /* DIRTY HELPER */
2586 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2587 /*OUT*/HWord x87_state)
2589 Int i, stno, preg;
2590 UInt tagw;
2591 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2592 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2593 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2594 UInt ftop = vex_state->guest_FTOP;
2595 UInt c3210 = vex_state->guest_FC3210;
2597 for (i = 0; i < 7; i++)
2598 x87->env[i] = 0;
2600 x87->env[FPS_ENV_STAT]
2601 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2602 x87->env[FPS_ENV_CTRL]
2603 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2605 /* Dump the register stack in ST order. */
2606 tagw = 0;
2607 for (stno = 0; stno < 8; stno++) {
2608 preg = (stno + ftop) & 7;
2609 if (vexTags[preg] == 0) {
2610 /* register is empty */
2611 tagw |= (3 << (2*preg));
2612 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2613 &x87->reg[10*stno] );
2614 } else {
2615 /* register is full. */
2616 tagw |= (0 << (2*preg));
2617 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2618 &x87->reg[10*stno] );
2621 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2625 /* This is used to implement 'frstor'.
2626 Reads 108 bytes at x87_state[0 .. 107]. */
2627 /* CALLED FROM GENERATED CODE */
2628 /* DIRTY HELPER */
2629 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2630 /*IN*/HWord x87_state)
2632 return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2636 /* This is used to implement 'frstors'.
2637 Reads 94 bytes at x87_state[0 .. 93]. */
2638 /* CALLED FROM GENERATED CODE */
2639 /* DIRTY HELPER */
2640 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2641 /*IN*/HWord x87_state)
2643 Int stno, preg;
2644 UInt tag;
2645 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2646 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2647 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2648 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2649 UInt tagw = x87->env[FPS_ENV_TAG];
2650 UInt fpucw = x87->env[FPS_ENV_CTRL];
2651 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2652 VexEmNote ew;
2653 UInt fpround;
2654 ULong pair;
2656 /* Copy registers and tags */
2657 for (stno = 0; stno < 8; stno++) {
2658 preg = (stno + ftop) & 7;
2659 tag = (tagw >> (2*preg)) & 3;
2660 if (tag == 3) {
2661 /* register is empty */
2662 /* hmm, if it's empty, does it still get written? Probably
2663 safer to say it does. If we don't, memcheck could get out
2664 of sync, in that it thinks all FP registers are defined by
2665 this helper, but in reality some have not been updated. */
2666 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2667 vexTags[preg] = 0;
2668 } else {
2669 /* register is non-empty */
2670 convert_f80le_to_f64le( &x87->reg[10*stno],
2671 (UChar*)&vexRegs[preg] );
2672 vexTags[preg] = 1;
2676 /* stack pointer */
2677 vex_state->guest_FTOP = ftop;
2679 /* status word */
2680 vex_state->guest_FC3210 = c3210;
2682 /* handle the control word, setting FPROUND and detecting any
2683 emulation warnings. */
2684 pair = amd64g_check_fldcw ( (ULong)fpucw );
2685 fpround = (UInt)pair & 0xFFFFFFFFULL;
2686 ew = (VexEmNote)(pair >> 32);
2688 vex_state->guest_FPROUND = fpround & 3;
2690 /* emulation warnings --> caller */
2691 return ew;
2695 /*---------------------------------------------------------------*/
2696 /*--- CPUID helpers. ---*/
2697 /*---------------------------------------------------------------*/
2699 /* Claim to be the following CPU, which is probably representative of
2700 the lowliest (earliest) amd64 offerings. It can do neither sse3
2701 nor cx16.
2703 vendor_id : AuthenticAMD
2704 cpu family : 15
2705 model : 5
2706 model name : AMD Opteron (tm) Processor 848
2707 stepping : 10
2708 cpu MHz : 1797.682
2709 cache size : 1024 KB
2710 fpu : yes
2711 fpu_exception : yes
2712 cpuid level : 1
2713 wp : yes
2714 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2715 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2716 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2717 bogomips : 3600.62
2718 TLB size : 1088 4K pages
2719 clflush size : 64
2720 cache_alignment : 64
2721 address sizes : 40 bits physical, 48 bits virtual
2722 power management: ts fid vid ttp
2724 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2725 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2726 and 3dnowext is 80000001.EDX.30.
2727 */
2728 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2730 # define SET_ABCD(_a,_b,_c,_d) \
2731 do { st->guest_RAX = (ULong)(_a); \
2732 st->guest_RBX = (ULong)(_b); \
2733 st->guest_RCX = (ULong)(_c); \
2734 st->guest_RDX = (ULong)(_d); \
2735 } while (0)
2737 switch (0xFFFFFFFF & st->guest_RAX) {
2738 case 0x00000000:
2739 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2740 break;
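/* (Leaf 0 spells out the vendor string: %ebx:%edx:%ecx =
   0x68747541 : 0x69746e65 : 0x444d4163 = "Auth" "enti" "cAMD".) */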
2741 case 0x00000001:
2742 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2743 break;
2744 case 0x80000000:
2745 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2746 break;
2747 case 0x80000001:
2748 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
2749 the original it-is-supported value that the h/w provides.
2750 See #291568. */
2751 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2752 0x21d3fbff);
2753 break;
2754 case 0x80000002:
2755 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2756 break;
2757 case 0x80000003:
2758 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2759 break;
2760 case 0x80000004:
2761 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2762 break;
2763 case 0x80000005:
2764 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2765 break;
2766 case 0x80000006:
2767 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2768 break;
2769 case 0x80000007:
2770 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2771 break;
2772 case 0x80000008:
2773 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2774 break;
2775 default:
2776 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2777 break;
2779 # undef SET_ABCD
2783 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2784 capable.
2786 vendor_id : GenuineIntel
2787 cpu family : 6
2788 model : 15
2789 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2790 stepping : 6
2791 cpu MHz : 2394.000
2792 cache size : 4096 KB
2793 physical id : 0
2794 siblings : 2
2795 core id : 0
2796 cpu cores : 2
2797 fpu : yes
2798 fpu_exception : yes
2799 cpuid level : 10
2800 wp : yes
2801 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2802 mtrr pge mca cmov pat pse36 clflush dts acpi
2803 mmx fxsr sse sse2 ss ht tm syscall nx lm
2804 constant_tsc pni monitor ds_cpl vmx est tm2
2805 cx16 xtpr lahf_lm
2806 bogomips : 4798.78
2807 clflush size : 64
2808 cache_alignment : 64
2809 address sizes : 36 bits physical, 48 bits virtual
2810 power management:
2811 */
2812 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2814 # define SET_ABCD(_a,_b,_c,_d) \
2815 do { st->guest_RAX = (ULong)(_a); \
2816 st->guest_RBX = (ULong)(_b); \
2817 st->guest_RCX = (ULong)(_c); \
2818 st->guest_RDX = (ULong)(_d); \
2819 } while (0)
2821 switch (0xFFFFFFFF & st->guest_RAX) {
2822 case 0x00000000:
2823 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2824 break;
2825 case 0x00000001:
2826 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2827 break;
2828 case 0x00000002:
2829 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2830 break;
2831 case 0x00000003:
2832 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2833 break;
2834 case 0x00000004: {
2835 switch (0xFFFFFFFF & st->guest_RCX) {
2836 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2837 0x0000003f, 0x00000001); break;
2838 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2839 0x0000003f, 0x00000001); break;
2840 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2841 0x00000fff, 0x00000001); break;
2842 default: SET_ABCD(0x00000000, 0x00000000,
2843 0x00000000, 0x00000000); break;
2845 break;
2847 case 0x00000005:
2848 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2849 break;
2850 case 0x00000006:
2851 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2852 break;
2853 case 0x00000007:
2854 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2855 break;
2856 case 0x00000008:
2857 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2858 break;
2859 case 0x00000009:
2860 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2861 break;
2862 case 0x0000000a:
2863 unhandled_eax_value:
2864 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2865 break;
2866 case 0x80000000:
2867 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2868 break;
2869 case 0x80000001:
2870 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2871 break;
2872 case 0x80000002:
2873 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2874 break;
2875 case 0x80000003:
2876 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2877 break;
2878 case 0x80000004:
2879 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2880 break;
2881 case 0x80000005:
2882 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2883 break;
2884 case 0x80000006:
2885 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2886 break;
2887 case 0x80000007:
2888 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2889 break;
2890 case 0x80000008:
2891 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2892 break;
2893 default:
2894 goto unhandled_eax_value;
2896 # undef SET_ABCD
2900 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2901 capable.
2903 vendor_id : GenuineIntel
2904 cpu family : 6
2905 model : 37
2906 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2907 stepping : 2
2908 cpu MHz : 3334.000
2909 cache size : 4096 KB
2910 physical id : 0
2911 siblings : 4
2912 core id : 0
2913 cpu cores : 2
2914 apicid : 0
2915 initial apicid : 0
2916 fpu : yes
2917 fpu_exception : yes
2918 cpuid level : 11
2919 wp : yes
2920 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2921 mtrr pge mca cmov pat pse36 clflush dts acpi
2922 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2923 lm constant_tsc arch_perfmon pebs bts rep_good
2924 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2925 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2926 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2927 arat tpr_shadow vnmi flexpriority ept vpid
2928 bogomips : 6957.57
2929 clflush size : 64
2930 cache_alignment : 64
2931 address sizes : 36 bits physical, 48 bits virtual
2932 power management:
2933 */
2934 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2936 # define SET_ABCD(_a,_b,_c,_d) \
2937 do { st->guest_RAX = (ULong)(_a); \
2938 st->guest_RBX = (ULong)(_b); \
2939 st->guest_RCX = (ULong)(_c); \
2940 st->guest_RDX = (ULong)(_d); \
2941 } while (0)
2943 UInt old_eax = (UInt)st->guest_RAX;
2944 UInt old_ecx = (UInt)st->guest_RCX;
2946 switch (old_eax) {
2947 case 0x00000000:
2948 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2949 break;
2950 case 0x00000001:
2951 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2952 break;
2953 case 0x00000002:
2954 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2955 break;
2956 case 0x00000003:
2957 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2958 break;
2959 case 0x00000004:
2960 switch (old_ecx) {
2961 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2962 0x0000003f, 0x00000000); break;
2963 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2964 0x0000007f, 0x00000000); break;
2965 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2966 0x000001ff, 0x00000000); break;
2967 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2968 0x00000fff, 0x00000002); break;
2969 default: SET_ABCD(0x00000000, 0x00000000,
2970 0x00000000, 0x00000000); break;
2972 break;
2973 case 0x00000005:
2974 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2975 break;
2976 case 0x00000006:
2977 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2978 break;
2979 case 0x00000007:
2980 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2981 break;
2982 case 0x00000008:
2983 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2984 break;
2985 case 0x00000009:
2986 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2987 break;
2988 case 0x0000000a:
2989 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2990 break;
2991 case 0x0000000b:
2992 switch (old_ecx) {
2993 case 0x00000000:
2994 SET_ABCD(0x00000001, 0x00000002,
2995 0x00000100, 0x00000000); break;
2996 case 0x00000001:
2997 SET_ABCD(0x00000004, 0x00000004,
2998 0x00000201, 0x00000000); break;
2999 default:
3000 SET_ABCD(0x00000000, 0x00000000,
3001 old_ecx, 0x00000000); break;
3003 break;
3004 case 0x0000000c:
3005 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3006 break;
3007 case 0x0000000d:
3008 switch (old_ecx) {
3009 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3010 0x00000100, 0x00000000); break;
3011 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3012 0x00000201, 0x00000000); break;
3013 default: SET_ABCD(0x00000000, 0x00000000,
3014 old_ecx, 0x00000000); break;
3016 break;
3017 case 0x80000000:
3018 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3019 break;
3020 case 0x80000001:
3021 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3022 break;
3023 case 0x80000002:
3024 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3025 break;
3026 case 0x80000003:
3027 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3028 break;
3029 case 0x80000004:
3030 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3031 break;
3032 case 0x80000005:
3033 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3034 break;
3035 case 0x80000006:
3036 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3037 break;
3038 case 0x80000007:
3039 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3040 break;
3041 case 0x80000008:
3042 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3043 break;
3044 default:
3045 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3046 break;
3048 # undef SET_ABCD
3052 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3053 capable. Plus (kludge!) it "supports" HTM.
3055 Also with the following change: claim that XSaveOpt is not
3056 available: cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
3057 on the real CPU. Consequently, programs that correctly observe
3058 these CPUID values should only try to use 3 of the 8 XSave-family
3059 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3060 having to implement the compacted or optimised save/restore
3061 variants.
3063 vendor_id : GenuineIntel
3064 cpu family : 6
3065 model : 42
3066 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3067 stepping : 7
3068 cpu MHz : 1600.000
3069 cache size : 6144 KB
3070 physical id : 0
3071 siblings : 4
3072 core id : 3
3073 cpu cores : 4
3074 apicid : 6
3075 initial apicid : 6
3076 fpu : yes
3077 fpu_exception : yes
3078 cpuid level : 13
3079 wp : yes
3080 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3081 mtrr pge mca cmov pat pse36 clflush dts acpi
3082 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3083 lm constant_tsc arch_perfmon pebs bts rep_good
3084 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3085 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3086 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3087 lahf_lm ida arat epb xsaveopt pln pts dts
3088 tpr_shadow vnmi flexpriority ept vpid
3090 bogomips : 5768.94
3091 clflush size : 64
3092 cache_alignment : 64
3093 address sizes : 36 bits physical, 48 bits virtual
3094 power management:
3095 */
3096 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3098 # define SET_ABCD(_a,_b,_c,_d) \
3099 do { st->guest_RAX = (ULong)(_a); \
3100 st->guest_RBX = (ULong)(_b); \
3101 st->guest_RCX = (ULong)(_c); \
3102 st->guest_RDX = (ULong)(_d); \
3103 } while (0)
3105 UInt old_eax = (UInt)st->guest_RAX;
3106 UInt old_ecx = (UInt)st->guest_RCX;
3108 switch (old_eax) {
3109 case 0x00000000:
3110 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3111 break;
3112 case 0x00000001:
3113 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3114 break;
3115 case 0x00000002:
3116 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3117 break;
3118 case 0x00000003:
3119 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3120 break;
3121 case 0x00000004:
3122 switch (old_ecx) {
3123 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3124 0x0000003f, 0x00000000); break;
3125 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3126 0x0000003f, 0x00000000); break;
3127 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3128 0x000001ff, 0x00000000); break;
3129 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3130 0x00001fff, 0x00000006); break;
3131 default: SET_ABCD(0x00000000, 0x00000000,
3132 0x00000000, 0x00000000); break;
3134 break;
3135 case 0x00000005:
3136 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3137 break;
3138 case 0x00000006:
3139 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3140 break;
3141 case 0x00000007:
3142 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3143 break;
3144 case 0x00000008:
3145 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3146 break;
3147 case 0x00000009:
3148 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3149 break;
3150 case 0x0000000a:
3151 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3152 break;
3153 case 0x0000000b:
3154 switch (old_ecx) {
3155 case 0x00000000:
3156 SET_ABCD(0x00000001, 0x00000001,
3157 0x00000100, 0x00000000); break;
3158 case 0x00000001:
3159 SET_ABCD(0x00000004, 0x00000004,
3160 0x00000201, 0x00000000); break;
3161 default:
3162 SET_ABCD(0x00000000, 0x00000000,
3163 old_ecx, 0x00000000); break;
3165 break;
3166 case 0x0000000c:
3167 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3168 break;
3169 case 0x0000000d:
3170 switch (old_ecx) {
3171 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3172 0x00000340, 0x00000000); break;
3173 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3174 0x00000000, 0x00000000); break;
3175 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3176 0x00000000, 0x00000000); break;
3177 default: SET_ABCD(0x00000000, 0x00000000,
3178 0x00000000, 0x00000000); break;
3180 break;
3181 case 0x0000000e:
3182 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3183 break;
3184 case 0x0000000f:
3185 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3186 break;
3187 case 0x80000000:
3188 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3189 break;
3190 case 0x80000001:
3191 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3192 break;
3193 case 0x80000002:
3194 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3195 break;
3196 case 0x80000003:
3197 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3198 break;
3199 case 0x80000004:
3200 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3201 break;
3202 case 0x80000005:
3203 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3204 break;
3205 case 0x80000006:
3206 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3207 break;
3208 case 0x80000007:
3209 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3210 break;
3211 case 0x80000008:
3212 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3213 break;
3214 default:
3215 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3216 break;
3218 # undef SET_ABCD
3222 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3224 With the following change: claim that XSaveOpt is not available:
3225 cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
3226 CPU. Consequently, programs that correctly observe these CPUID
3227 values should only try to use 3 of the 8 XSave-family instructions:
3228 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3229 implement the compacted or optimised save/restore variants.
3231 vendor_id : GenuineIntel
3232 cpu family : 6
3233 model : 60
3234 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3235 stepping : 3
3236 microcode : 0x1c
3237 cpu MHz : 919.957
3238 cache size : 8192 KB
3239 physical id : 0
3240 siblings : 4
3241 core id : 3
3242 cpu cores : 4
3243 apicid : 6
3244 initial apicid : 6
3245 fpu : yes
3246 fpu_exception : yes
3247 cpuid level : 13
3248 wp : yes
3249 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3250 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3251 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3252 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3253 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3254 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3255 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3256 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3257 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3258 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3259 bugs :
3260 bogomips : 5786.68
3261 clflush size : 64
3262 cache_alignment : 64
3263 address sizes : 39 bits physical, 48 bits virtual
3264 power management:
3265 */
3266 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3268 # define SET_ABCD(_a,_b,_c,_d) \
3269 do { st->guest_RAX = (ULong)(_a); \
3270 st->guest_RBX = (ULong)(_b); \
3271 st->guest_RCX = (ULong)(_c); \
3272 st->guest_RDX = (ULong)(_d); \
3273 } while (0)
3275 UInt old_eax = (UInt)st->guest_RAX;
3276 UInt old_ecx = (UInt)st->guest_RCX;
3278 switch (old_eax) {
3279 case 0x00000000:
3280 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3281 break;
3282 case 0x00000001:
3283 /* Don't advertise RDRAND support, bit 30 in ECX. */
3284 SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3285 break;
3286 case 0x00000002:
3287 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3288 break;
3289 case 0x00000003:
3290 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3291 break;
3292 case 0x00000004:
3293 switch (old_ecx) {
3294 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3295 0x0000003f, 0x00000000); break;
3296 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3297 0x0000003f, 0x00000000); break;
3298 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3299 0x000001ff, 0x00000000); break;
3300 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3301 0x00001fff, 0x00000006); break;
3302 default: SET_ABCD(0x00000000, 0x00000000,
3303 0x00000000, 0x00000000); break;
3305 break;
3306 case 0x00000005:
3307 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3308 break;
3309 case 0x00000006:
3310 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3311 break;
3312 case 0x00000007:
3313 switch (old_ecx) {
3314 case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3315 0x00000000, 0x00000000); break;
3316 default: SET_ABCD(0x00000000, 0x00000000,
3317 0x00000000, 0x00000000); break;
3319 break;
3320 case 0x00000008:
3321 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3322 break;
3323 case 0x00000009:
3324 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3325 break;
3326 case 0x0000000a:
3327 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3328 break;
3329 case 0x0000000b:
3330 switch (old_ecx) {
3331 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3332 0x00000100, 0x00000002); break;
3333 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3334 0x00000201, 0x00000002); break;
3335 default: SET_ABCD(0x00000000, 0x00000000,
3336 old_ecx, 0x00000002); break;
3338 break;
3339 case 0x0000000c:
3340 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3341 break;
3342 case 0x0000000d:
3343 switch (old_ecx) {
3344 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3345 0x00000340, 0x00000000); break;
3346 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3347 0x00000000, 0x00000000); break;
3348 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3349 0x00000000, 0x00000000); break;
3350 default: SET_ABCD(0x00000000, 0x00000000,
3351 0x00000000, 0x00000000); break;
3353 break;
3354 case 0x80000000:
3355 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3356 break;
3357 case 0x80000001:
3358 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3359 break;
3360 case 0x80000002:
3361 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3362 break;
3363 case 0x80000003:
3364 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3365 break;
3366 case 0x80000004:
3367 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3368 break;
3369 case 0x80000005:
3370 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3371 break;
3372 case 0x80000006:
3373 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3374 break;
3375 case 0x80000007:
3376 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3377 break;
3378 case 0x80000008:
3379 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3380 break;
3381 default:
3382 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3383 break;
3385 # undef SET_ABCD
3389 /*---------------------------------------------------------------*/
3390 /*--- Misc integer helpers, including rotates and crypto. ---*/
3391 /*---------------------------------------------------------------*/
3393 ULong amd64g_calculate_RCR ( ULong arg,
3394 ULong rot_amt,
3395 ULong rflags_in,
3396 Long szIN )
3398 Bool wantRflags = toBool(szIN < 0);
3399 ULong sz = wantRflags ? (-szIN) : szIN;
3400 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3401 ULong cf=0, of=0, tempcf;
3403 switch (sz) {
3404 case 8:
3405 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3406 of = ((arg >> 63) ^ cf) & 1;
3407 while (tempCOUNT > 0) {
3408 tempcf = arg & 1;
3409 arg = (arg >> 1) | (cf << 63);
3410 cf = tempcf;
3411 tempCOUNT--;
3413 break;
3414 case 4:
3415 while (tempCOUNT >= 33) tempCOUNT -= 33;
3416 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3417 of = ((arg >> 31) ^ cf) & 1;
3418 while (tempCOUNT > 0) {
3419 tempcf = arg & 1;
3420 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3421 cf = tempcf;
3422 tempCOUNT--;
3424 break;
3425 case 2:
3426 while (tempCOUNT >= 17) tempCOUNT -= 17;
3427 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3428 of = ((arg >> 15) ^ cf) & 1;
3429 while (tempCOUNT > 0) {
3430 tempcf = arg & 1;
3431 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3432 cf = tempcf;
3433 tempCOUNT--;
3435 break;
3436 case 1:
3437 while (tempCOUNT >= 9) tempCOUNT -= 9;
3438 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3439 of = ((arg >> 7) ^ cf) & 1;
3440 while (tempCOUNT > 0) {
3441 tempcf = arg & 1;
3442 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
3443 cf = tempcf;
3444 tempCOUNT--;
3446 break;
3447 default:
3448 vpanic("calculate_RCR(amd64g): invalid size");
3451 cf &= 1;
3452 of &= 1;
3453 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3454 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3456 /* caller can ask to have back either the resulting flags or
3457 resulting value, but not both */
3458 return wantRflags ? rflags_in : arg;
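/* Worked example: an 8-bit RCR of 0x01 by one place with CF clear on
   entry gives result 0x00 with CF = 1, and OF = (old MSB ^ old CF)
   = 0, matching the hardware definition of RCR. */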
3461 ULong amd64g_calculate_RCL ( ULong arg,
3462 ULong rot_amt,
3463 ULong rflags_in,
3464 Long szIN )
3466 Bool wantRflags = toBool(szIN < 0);
3467 ULong sz = wantRflags ? (-szIN) : szIN;
3468 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3469 ULong cf=0, of=0, tempcf;
3471 switch (sz) {
3472 case 8:
3473 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3474 while (tempCOUNT > 0) {
3475 tempcf = (arg >> 63) & 1;
3476 arg = (arg << 1) | (cf & 1);
3477 cf = tempcf;
3478 tempCOUNT--;
3480 of = ((arg >> 63) ^ cf) & 1;
3481 break;
3482 case 4:
3483 while (tempCOUNT >= 33) tempCOUNT -= 33;
3484 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3485 while (tempCOUNT > 0) {
3486 tempcf = (arg >> 31) & 1;
3487 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3488 cf = tempcf;
3489 tempCOUNT--;
3491 of = ((arg >> 31) ^ cf) & 1;
3492 break;
3493 case 2:
3494 while (tempCOUNT >= 17) tempCOUNT -= 17;
3495 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3496 while (tempCOUNT > 0) {
3497 tempcf = (arg >> 15) & 1;
3498 arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
3499 cf = tempcf;
3500 tempCOUNT--;
3502 of = ((arg >> 15) ^ cf) & 1;
3503 break;
3504 case 1:
3505 while (tempCOUNT >= 9) tempCOUNT -= 9;
3506 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3507 while (tempCOUNT > 0) {
3508 tempcf = (arg >> 7) & 1;
3509 arg = 0xFFULL & ((arg << 1) | (cf & 1));
3510 cf = tempcf;
3511 tempCOUNT--;
3513 of = ((arg >> 7) ^ cf) & 1;
3514 break;
3515 default:
3516 vpanic("calculate_RCL(amd64g): invalid size");
3519 cf &= 1;
3520 of &= 1;
3521 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3522 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3524 return wantRflags ? rflags_in : arg;
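/* Worked example: an 8-bit RCL of 0x80 by one place with CF clear on
   entry gives result 0x00 with CF = 1, and OF = (result MSB ^ new CF)
   = 1. */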
3527 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3528 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3529 */
3530 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3532 ULong hi, lo, tmp, A[16];
3534 A[0] = 0; A[1] = a;
3535 A[2] = A[1] << 1; A[3] = A[2] ^ a;
3536 A[4] = A[2] << 1; A[5] = A[4] ^ a;
3537 A[6] = A[3] << 1; A[7] = A[6] ^ a;
3538 A[8] = A[4] << 1; A[9] = A[8] ^ a;
3539 A[10] = A[5] << 1; A[11] = A[10] ^ a;
3540 A[12] = A[6] << 1; A[13] = A[12] ^ a;
3541 A[14] = A[7] << 1; A[15] = A[14] ^ a;
3543 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3544 hi = lo >> 56;
3545 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3546 hi = (hi << 8) | (lo >> 56);
3547 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3548 hi = (hi << 8) | (lo >> 56);
3549 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3550 hi = (hi << 8) | (lo >> 56);
3551 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3552 hi = (hi << 8) | (lo >> 56);
3553 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3554 hi = (hi << 8) | (lo >> 56);
3555 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3556 hi = (hi << 8) | (lo >> 56);
3557 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3559 ULong m0 = -1;
3560 m0 /= 255;
3561 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3562 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3563 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3564 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3565 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3566 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3567 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3569 return which ? hi : lo;
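/* Carry-less multiply sanity check: with a = b = 0x3 (the polynomial
   x + 1), (x + 1)^2 = x^2 + 1 over GF(2), so the low half ("which"
   == 0) is 0x5 and the high half is 0. */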
3573 /* CALLED FROM GENERATED CODE */
3574 /* DIRTY HELPER (non-referentially-transparent) */
3575 /* Horrible hack. On non-amd64 platforms, return 1. */
3576 ULong amd64g_dirtyhelper_RDTSC ( void )
3578 # if defined(__x86_64__)
3579 UInt eax, edx;
3580 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3581 return (((ULong)edx) << 32) | ((ULong)eax);
3582 # else
3583 return 1ULL;
3584 # endif
3587 /* CALLED FROM GENERATED CODE */
3588 /* DIRTY HELPER (non-referentially-transparent) */
3589 /* Horrible hack. On non-amd64 platforms, return 1. */
3590 /* This uses a different calling convention from _RDTSC just above
3591 only because of the difficulty of returning 96 bits from a C
3592 function -- RDTSC returns 64 bits and so is simple by comparison,
3593 on amd64. */
3594 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3596 # if defined(__x86_64__)
3597 UInt eax, ecx, edx;
3598 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3599 st->guest_RAX = (ULong)eax;
3600 st->guest_RCX = (ULong)ecx;
3601 st->guest_RDX = (ULong)edx;
3602 # else
3603 /* Do nothing. */
3604 # endif
3607 /* CALLED FROM GENERATED CODE */
3608 /* DIRTY HELPER (non-referentially-transparent) */
3609 /* Horrible hack. On non-amd64 platforms, return 0. */
3610 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3612 # if defined(__x86_64__)
3613 ULong r = 0;
3614 portno &= 0xFFFF;
3615 switch (sz) {
3616 case 4:
3617 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3618 : "=a" (r) : "Nd" (portno));
3619 break;
3620 case 2:
3621 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3622 : "=a" (r) : "Nd" (portno));
3623 break;
3624 case 1:
3625 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3626 : "=a" (r) : "Nd" (portno));
3627 break;
3628 default:
3629 break; /* note: no 64-bit version of insn exists */
3631 return r;
3632 # else
3633 return 0;
3634 # endif
3638 /* CALLED FROM GENERATED CODE */
3639 /* DIRTY HELPER (non-referentially-transparent) */
3640 /* Horrible hack. On non-amd64 platforms, do nothing. */
3641 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3643 # if defined(__x86_64__)
3644 portno &= 0xFFFF;
3645 switch (sz) {
3646 case 4:
3647 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3648 : : "a" (data), "Nd" (portno));
3649 break;
3650 case 2:
3651 __asm__ __volatile__("outw %w0, %w1"
3652 : : "a" (data), "Nd" (portno));
3653 break;
3654 case 1:
3655 __asm__ __volatile__("outb %b0, %w1"
3656 : : "a" (data), "Nd" (portno));
3657 break;
3658 default:
3659 break; /* note: no 64-bit version of insn exists */
3661 # else
3662 /* do nothing */
3663 # endif
3666 /* CALLED FROM GENERATED CODE */
3667 /* DIRTY HELPER (non-referentially-transparent) */
3668 /* Horrible hack. On non-amd64 platforms, do nothing. */
3669 /* op = 0: call the native SGDT instruction.
3670 op = 1: call the native SIDT instruction.
3671 */
3672 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3673 # if defined(__x86_64__)
3674 switch (op) {
3675 case 0:
3676 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3677 break;
3678 case 1:
3679 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3680 break;
3681 default:
3682 vpanic("amd64g_dirtyhelper_SxDT");
3684 # else
3685 /* Fake up a defined result: zero the 10 bytes a real SGDT/SIDT would write. */
3686 UChar* p = (UChar*)address;
3687 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3688 p[6] = p[7] = p[8] = p[9] = 0;
3689 # endif
3692 /*---------------------------------------------------------------*/
3693 /*--- Helpers for MMX/SSE/SSE2. ---*/
3694 /*---------------------------------------------------------------*/
3696 static inline UChar abdU8 ( UChar xx, UChar yy ) {
3697 return toUChar(xx>yy ? xx-yy : yy-xx);
3700 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3701 return (((ULong)w1) << 32) | ((ULong)w0);
3704 static inline UShort sel16x4_3 ( ULong w64 ) {
3705 UInt hi32 = toUInt(w64 >> 32);
3706 return toUShort(hi32 >> 16);
3708 static inline UShort sel16x4_2 ( ULong w64 ) {
3709 UInt hi32 = toUInt(w64 >> 32);
3710 return toUShort(hi32);
3712 static inline UShort sel16x4_1 ( ULong w64 ) {
3713 UInt lo32 = toUInt(w64);
3714 return toUShort(lo32 >> 16);
3716 static inline UShort sel16x4_0 ( ULong w64 ) {
3717 UInt lo32 = toUInt(w64);
3718 return toUShort(lo32);
3721 static inline UChar sel8x8_7 ( ULong w64 ) {
3722 UInt hi32 = toUInt(w64 >> 32);
3723 return toUChar(hi32 >> 24);
3725 static inline UChar sel8x8_6 ( ULong w64 ) {
3726 UInt hi32 = toUInt(w64 >> 32);
3727 return toUChar(hi32 >> 16);
3729 static inline UChar sel8x8_5 ( ULong w64 ) {
3730 UInt hi32 = toUInt(w64 >> 32);
3731 return toUChar(hi32 >> 8);
3733 static inline UChar sel8x8_4 ( ULong w64 ) {
3734 UInt hi32 = toUInt(w64 >> 32);
3735 return toUChar(hi32 >> 0);
3737 static inline UChar sel8x8_3 ( ULong w64 ) {
3738 UInt lo32 = toUInt(w64);
3739 return toUChar(lo32 >> 24);
3741 static inline UChar sel8x8_2 ( ULong w64 ) {
3742 UInt lo32 = toUInt(w64);
3743 return toUChar(lo32 >> 16);
3745 static inline UChar sel8x8_1 ( ULong w64 ) {
3746 UInt lo32 = toUInt(w64);
3747 return toUChar(lo32 >> 8);
3749 static inline UChar sel8x8_0 ( ULong w64 ) {
3750 UInt lo32 = toUInt(w64);
3751 return toUChar(lo32 >> 0);
3754 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3755 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3757 return
3758 mk32x2(
3759 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3760 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3761 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3762 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3766 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3767 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3769 UInt t = 0;
3770 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3771 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3772 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3773 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3774 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3775 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3776 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3777 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3778 t &= 0xFFFF;
3779 return (ULong)t;
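/* Illustrative self-checks, not called from generated code: the two
   helpers above with operands chosen so the results can be verified by
   hand.  For PMADDWD, lanes of 2 against lanes of 3 give 2*3 + 2*3 = 12
   in each 32-bit half; for PSADBW, bytes of 5 against bytes of 2 give
   eight absolute differences of 3, i.e. 24. */
#if 0
static void example_pmaddwd_psadbw_checks ( void )
{
   vassert(amd64g_calculate_mmx_pmaddwd(0x0002000200020002ULL,
                                        0x0003000300030003ULL)
           == 0x0000000C0000000CULL);
   vassert(amd64g_calculate_mmx_psadbw(0x0505050505050505ULL,
                                       0x0202020202020202ULL)
           == 24ULL);
}
#endif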
3782 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3783 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3785 UShort t, min;
3786 UInt idx;
3787 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
3788 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3789 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3790 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3791 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3792 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3793 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3794 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3795 return ((ULong)(idx << 16)) | ((ULong)min);
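/* Illustrative self-check, not called from generated code: PHMINPOSUW
   scans the eight unsigned 16-bit lanes (sLo holds lanes 0..3, sHi
   lanes 4..7) and returns the minimum in bits 15:0 with its lane index
   in bits 18:16.  Here the smallest value, 1, sits in lane 7. */
#if 0
static void example_phminposuw_check ( void )
{
   vassert(amd64g_calculate_sse_phminposuw(0x0008000700060005ULL,
                                           0x0001000200030004ULL)
           == 0x00070001ULL);
}
#endif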
3798 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3799 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3801 UInt i;
3802 ULong crc = (b & 0xFFULL) ^ crcIn;
3803 for (i = 0; i < 8; i++)
3804 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3805 return crc;
3808 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3809 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3811 UInt i;
3812 ULong crc = (w & 0xFFFFULL) ^ crcIn;
3813 for (i = 0; i < 16; i++)
3814 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3815 return crc;
3818 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3819 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3821 UInt i;
3822 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3823 for (i = 0; i < 32; i++)
3824 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3825 return crc;
3828 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3829 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3831 ULong crc = amd64g_calc_crc32l(crcIn, q);
3832 return amd64g_calc_crc32l(crc, q >> 32);
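/* Illustrative sketch only, not part of the helper set: computing a
   conventional CRC-32C over a byte buffer with amd64g_calc_crc32b.  As
   with the real CRC32 instruction, the 0xFFFFFFFF pre- and
   post-inversion is the caller's responsibility.  The value quoted in
   the comment is the standard CRC-32C check result for the ASCII
   string "123456789". */
#if 0
static ULong example_crc32c_buffer ( const UChar* buf, UInt len )
{
   ULong crc = 0xFFFFFFFFULL;
   UInt  i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc ^ 0xFFFFFFFFULL;   /* "123456789" -> 0xE3069283 */
}
#endif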
3836 /* .. helper for next fn .. */
3837 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3839 UInt t = 0;
3840 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3841 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3842 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3843 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3844 return (ULong)t;
3847 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3848 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3849 ULong dHi, ULong dLo,
3850 ULong imm_and_return_control_bit )
3852 UInt imm8 = imm_and_return_control_bit & 7;
3853 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
3854 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3855 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3856 /* For src we only need 32 bits, so get them into the
3857 lower half of a 64 bit word. */
3858 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3859 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3860 11 bytes. If calculating the low part of the result, need bytes
3861 dstOffsL * 4 + (0 .. 6); if calculating the high part,
3862 dstOffsL * 4 + (4 .. 10). */
3863 ULong dst;
3864 /* dstOffsL = 0, Lo -> 0 .. 6
3865 dstOffsL = 1, Lo -> 4 .. 10
3866 dstOffsL = 0, Hi -> 4 .. 10
3867 dstOffsL = 1, Hi -> 8 .. 14 */
3869 if (calcHi && dstOffsL) {
3870 /* 8 .. 14 */
3871 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3873 else if (!calcHi && !dstOffsL) {
3874 /* 0 .. 6 */
3875 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3877 else {
3878 /* 4 .. 10 */
3879 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3881 ULong r0 = sad_8x4( dst >> 0, src );
3882 ULong r1 = sad_8x4( dst >> 8, src );
3883 ULong r2 = sad_8x4( dst >> 16, src );
3884 ULong r3 = sad_8x4( dst >> 24, src );
3885 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3886 return res;
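/* Illustrative self-check, not called from generated code: with
   imm8 = 0 the source is the low dword of sLo and the destination
   window is bytes 0..6 of the low destination half.  An all-zero
   destination against an all-0x01 source dword makes each of the four
   SADs equal to 4. */
#if 0
static void example_mpsadbw_check ( void )
{
   vassert(amd64g_calc_mpsadbw(0, 0x01010101ULL, 0, 0, 0)
           == 0x0004000400040004ULL);
}
#endif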
3889 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3890 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3892 ULong dst = 0;
3893 ULong src_bit;
3894 ULong dst_bit = 1;
3895 for (src_bit = 1; src_bit; src_bit <<= 1) {
3896 if (mask & src_bit) {
3897 if (src_masked & src_bit) dst |= dst_bit;
3898 dst_bit <<= 1;
3901 return dst;
3904 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3905 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3907 ULong dst = 0;
3908 ULong dst_bit;
3909 ULong src_bit = 1;
3910 for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3911 if (mask & dst_bit) {
3912 if (src & src_bit) dst |= dst_bit;
3913 src_bit <<= 1;
3916 return dst;
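/* Illustrative self-checks, not called from generated code: PEXT
   gathers the source bits selected by the mask and packs them at the
   bottom, while PDEP scatters low-order source bits back into the mask
   positions.  With mask 0b11001010 (bits 1,3,6,7) the two operations
   are inverses of each other on the masked value. */
#if 0
static void example_pext_pdep_checks ( void )
{
   vassert(amd64g_calculate_pext(0x4AULL /*0b01001010*/,
                                 0xCAULL /*0b11001010*/) == 0x7ULL);
   vassert(amd64g_calculate_pdep(0x7ULL, 0xCAULL) == 0x4AULL);
}
#endif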
3919 /*---------------------------------------------------------------*/
3920 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
3921 /*---------------------------------------------------------------*/
3923 static UInt zmask_from_V128 ( V128* arg )
3925 UInt i, res = 0;
3926 for (i = 0; i < 16; i++) {
3927 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
3929 return res;
3932 static UInt zmask_from_V128_wide ( V128* arg )
3934 UInt i, res = 0;
3935 for (i = 0; i < 8; i++) {
3936 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
3938 return res;
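/* Illustrative self-check, not called from generated code: the zmasks
   above have one bit per lane, set exactly when that lane is zero.  A
   vector whose only non-zero byte is lane 0 therefore gives 0xFFFE in
   the byte case and 0xFE in the 16-bit case. */
#if 0
static void example_zmask_checks ( void )
{
   V128 v;
   UInt i;
   for (i = 0; i < 16; i++) v.w8[i] = 0;
   v.w8[0] = 'a';
   vassert(zmask_from_V128(&v) == 0xFFFE);
   vassert(zmask_from_V128_wide(&v) == 0xFE);
}
#endif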
3941 /* Helps with PCMP{I,E}STR{I,M}.
3943 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
3944 actually it could be a clean helper, but for the fact that we can't
3945 pass by value 2 x V128 to a clean helper, nor have one returned.)
3946 Reads guest state, writes to guest state for the xSTRM cases, no
3947 accesses of memory, is a pure function.
3949 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3950 the callee knows which I/E and I/M variant it is dealing with and
3951 what the specific operation is. 4th byte of opcode is in the range
3952 0x60 to 0x63:
3953 istri 66 0F 3A 63
3954 istrm 66 0F 3A 62
3955 estri 66 0F 3A 61
3956 estrm 66 0F 3A 60
3958 gstOffL and gstOffR are the guest state offsets for the two XMM
3959 register inputs. We never have to deal with the memory case since
3960 that is handled by pre-loading the relevant value into the fake
3961 XMM16 register.
3963 For ESTRx variants, edxIN and eaxIN hold the values of those two
3964 registers.
3966 In all cases, the bottom 16 bits of the result contain the new
3967 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
3968 result hold the new %ecx value. For xSTRM variants, the helper
3969 writes the result directly to the guest XMM0.
3971 Declarable side effects: in all cases, reads guest state at
3972 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
3973 guest_XMM0.
3975 Is expected to be called with opc_and_imm combinations which have
3976 actually been validated, and will assert if otherwise. The front
3977    end should ensure we're only called with verified values. */
3979 ULong amd64g_dirtyhelper_PCMPxSTRx (
3980 VexGuestAMD64State* gst,
3981 HWord opc4_and_imm,
3982 HWord gstOffL, HWord gstOffR,
3983 HWord edxIN, HWord eaxIN
3986 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3987 HWord imm8 = opc4_and_imm & 0xFF;
3988 HWord isISTRx = opc4 & 2;
3989 HWord isxSTRM = (opc4 & 1) ^ 1;
3990 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3991 HWord wide = (imm8 & 1);
3993 // where the args are
3994 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3995 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3997 /* Create the arg validity masks, either from the vectors
3998 themselves or from the supplied edx/eax values. */
3999 // FIXME: this is only right for the 8-bit data cases.
4000 // At least that is asserted above.
4001 UInt zmaskL, zmaskR;
4003 // temp spot for the resulting flags and vector.
4004 V128 resV;
4005 UInt resOSZACP;
4007 // for checking whether case was handled
4008 Bool ok = False;
4010 if (wide) {
4011 if (isISTRx) {
4012 zmaskL = zmask_from_V128_wide(argL);
4013 zmaskR = zmask_from_V128_wide(argR);
4014 } else {
4015 Int tmp;
4016 tmp = edxIN & 0xFFFFFFFF;
4017 if (tmp < -8) tmp = -8;
4018 if (tmp > 8) tmp = 8;
4019 if (tmp < 0) tmp = -tmp;
4020 vassert(tmp >= 0 && tmp <= 8);
4021 zmaskL = (1 << tmp) & 0xFF;
4022 tmp = eaxIN & 0xFFFFFFFF;
4023 if (tmp < -8) tmp = -8;
4024 if (tmp > 8) tmp = 8;
4025 if (tmp < 0) tmp = -tmp;
4026 vassert(tmp >= 0 && tmp <= 8);
4027 zmaskR = (1 << tmp) & 0xFF;
4029 // do the math
4030 ok = compute_PCMPxSTRx_wide (
4031 &resV, &resOSZACP, argL, argR,
4032 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4034 } else {
4035 if (isISTRx) {
4036 zmaskL = zmask_from_V128(argL);
4037 zmaskR = zmask_from_V128(argR);
4038 } else {
4039 Int tmp;
4040 tmp = edxIN & 0xFFFFFFFF;
4041 if (tmp < -16) tmp = -16;
4042 if (tmp > 16) tmp = 16;
4043 if (tmp < 0) tmp = -tmp;
4044 vassert(tmp >= 0 && tmp <= 16);
4045 zmaskL = (1 << tmp) & 0xFFFF;
4046 tmp = eaxIN & 0xFFFFFFFF;
4047 if (tmp < -16) tmp = -16;
4048 if (tmp > 16) tmp = 16;
4049 if (tmp < 0) tmp = -tmp;
4050 vassert(tmp >= 0 && tmp <= 16);
4051 zmaskR = (1 << tmp) & 0xFFFF;
4053 // do the math
4054 ok = compute_PCMPxSTRx (
4055 &resV, &resOSZACP, argL, argR,
4056 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4060 // front end shouldn't pass us any imm8 variants we can't
4061 // handle. Hence:
4062 vassert(ok);
4064 // So, finally we need to get the results back to the caller.
4065 // In all cases, the new OSZACP value is the lowest 16 of
4066 // the return value.
4067 if (isxSTRM) {
4068 gst->guest_YMM0[0] = resV.w32[0];
4069 gst->guest_YMM0[1] = resV.w32[1];
4070 gst->guest_YMM0[2] = resV.w32[2];
4071 gst->guest_YMM0[3] = resV.w32[3];
4072 return resOSZACP & 0x8D5;
4073 } else {
4074 UInt newECX = resV.w32[0] & 0xFFFF;
4075 return (newECX << 16) | (resOSZACP & 0x8D5);
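/* Illustrative sketch only: how a caller of the xSTRI variants might
   unpack the combined return value described in the big comment above
   -- OSZACP flag bits in 15:0, the new %ecx value in 31:16.  The
   example_ name and out-parameters are hypothetical. */
#if 0
static void example_unpack_xstri_result ( ULong ret,
                                          /*OUT*/UInt* newECX,
                                          /*OUT*/UInt* newOSZACP )
{
   *newOSZACP = (UInt)(ret & 0xFFFFULL);
   *newECX    = (UInt)((ret >> 16) & 0xFFFFULL);
}
#endif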
4079 /*---------------------------------------------------------------*/
4080 /*--- AES primitives and helpers ---*/
4081 /*---------------------------------------------------------------*/
4082 /* a 16 x 16 matrix */
4083 static const UChar sbox[256] = { // row nr
4084 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4085 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4086 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4087 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4088 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4089 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4090 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4091 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4092 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4093 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4094 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4095 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4096 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4097 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4098 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4099 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4100 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4101 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4102 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4103 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4104 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4105 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4106 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4107 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4108 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4109 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4110 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4111 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4112 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4113 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4114 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4115 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4117 static void SubBytes (V128* v)
4119 V128 r;
4120 UInt i;
4121 for (i = 0; i < 16; i++)
4122 r.w8[i] = sbox[v->w8[i]];
4123 *v = r;
4126 /* a 16 x 16 matrix */
4127 static const UChar invsbox[256] = { // row nr
4128 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4129 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4130 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4131 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4132 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4133 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4134 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4135 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4136 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4137 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4138 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4139 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4140 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4141 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4142 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4143 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4144 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4145 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4146 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4147 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4148 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4149 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4150 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4151 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4152 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4153 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4154 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4155 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4156 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4157 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4158 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4159 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4161 static void InvSubBytes (V128* v)
4163 V128 r;
4164 UInt i;
4165 for (i = 0; i < 16; i++)
4166 r.w8[i] = invsbox[v->w8[i]];
4167 *v = r;
4170 static const UChar ShiftRows_op[16] =
4171 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4172 static void ShiftRows (V128* v)
4174 V128 r;
4175 UInt i;
4176 for (i = 0; i < 16; i++)
4177 r.w8[i] = v->w8[ShiftRows_op[15-i]];
4178 *v = r;
4181 static const UChar InvShiftRows_op[16] =
4182 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4183 static void InvShiftRows (V128* v)
4185 V128 r;
4186 UInt i;
4187 for (i = 0; i < 16; i++)
4188 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4189 *v = r;
4192 /* Multiplication of the finite fields elements of AES.
4193 See "A Specification for The AES Algorithm Rijndael
4194 (by Joan Daemen & Vincent Rijmen)"
4195 Dr. Brian Gladman, v3.1, 3rd March 2001. */
4196 /* N values so that (hex) xy = 0x03^N.
4197 0x00 cannot be used. We put 0xff for this value.*/
4198 /* a 16 x 16 matrix */
4199 static const UChar Nxy[256] = { // row nr
4200 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4201 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4202 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4203 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4204 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4205 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4206 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4207 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4208 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4209 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4210 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4211 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4212 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4213 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4214 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4215 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4216 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4217 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4218 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4219 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4220 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4221 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4222 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4223 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4224 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4225 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4226 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4227 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4228 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4229 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4230 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4231 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4234 /* E values so that E = 0x03^xy. */
4235 static const UChar Exy[256] = { // row nr
4236 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4237 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4238 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4239 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4240 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4241 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4242 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4243 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4244 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4245 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4246 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4247 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4248 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4249 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4250 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4251 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4252 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4253 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4254 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4255 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4256 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4257 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4258 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4259 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4260 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4261 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4262 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4263 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4264 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4265 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4266 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4267 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4269 static inline UChar ff_mul(UChar u1, UChar u2)
4271 if ((u1 > 0) && (u2 > 0)) {
4272 UInt ui = Nxy[u1] + Nxy[u2];
4273 if (ui >= 255)
4274 ui = ui - 255;
4275 return Exy[ui];
4276 } else {
4277 return 0;
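/* Illustrative self-check, not called from generated code: GF(2^8)
   multiplication through the log/antilog tables above.  {02}*{57} =
   {ae} is the worked example from the AES specification; a zero
   operand short-circuits to zero. */
#if 0
static void example_ff_mul_checks ( void )
{
   vassert(ff_mul(0x02, 0x57) == 0xAE);
   vassert(ff_mul(0x57, 0x00) == 0x00);
}
#endif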
4281 static void MixColumns (V128* v)
4283 V128 r;
4284 Int j;
4285 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4286 for (j = 0; j < 4; j++) {
4287 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4288 ^ P(v,j,2) ^ P(v,j,3);
4289 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4290 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4291 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4292 ^ ff_mul(0x03, P(v,j,3) );
4293 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4294 ^ ff_mul( 0x02, P(v,j,3) );
4296 *v = r;
4297 #undef P
4300 static void InvMixColumns (V128* v)
4302 V128 r;
4303 Int j;
4304 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4305 for (j = 0; j < 4; j++) {
4306 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4307 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4308 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4309 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4310 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4311 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4312 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4313 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4315 *v = r;
4316 #undef P
4320 /* For description, see definition in guest_amd64_defs.h */
4321 void amd64g_dirtyhelper_AES (
4322 VexGuestAMD64State* gst,
4323 HWord opc4, HWord gstOffD,
4324 HWord gstOffL, HWord gstOffR
4327 // where the args are
4328 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4329 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4330 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4331 V128 r;
4333 switch (opc4) {
4334 case 0xDC: /* AESENC */
4335 case 0xDD: /* AESENCLAST */
4336 r = *argR;
4337 ShiftRows (&r);
4338 SubBytes (&r);
4339 if (opc4 == 0xDC)
4340 MixColumns (&r);
4341 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4342 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4343 break;
4345 case 0xDE: /* AESDEC */
4346 case 0xDF: /* AESDECLAST */
4347 r = *argR;
4348 InvShiftRows (&r);
4349 InvSubBytes (&r);
4350 if (opc4 == 0xDE)
4351 InvMixColumns (&r);
4352 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4353 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4354 break;
4356 case 0xDB: /* AESIMC */
4357 *argD = *argL;
4358 InvMixColumns (argD);
4359 break;
4360 default: vassert(0);
4364 static inline UInt RotWord (UInt w32)
4366 return ((w32 >> 8) | (w32 << 24));
4369 static inline UInt SubWord (UInt w32)
4371 UChar *w8;
4372 UChar *r8;
4373 UInt res;
4374 w8 = (UChar*) &w32;
4375 r8 = (UChar*) &res;
4376 r8[0] = sbox[w8[0]];
4377 r8[1] = sbox[w8[1]];
4378 r8[2] = sbox[w8[2]];
4379 r8[3] = sbox[w8[3]];
4380 return res;
4383 /* For description, see definition in guest_amd64_defs.h */
4384 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4385 VexGuestAMD64State* gst,
4386 HWord imm8,
4387 HWord gstOffL, HWord gstOffR
4390 // where the args are
4391 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4392 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4394 // We have to create the result in a temporary in the
4395 // case where the src and dst regs are the same. See #341698.
4396 V128 tmp;
4398 tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4399 tmp.w32[2] = SubWord (argL->w32[3]);
4400 tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4401 tmp.w32[0] = SubWord (argL->w32[1]);
4403 argR->w32[3] = tmp.w32[3];
4404 argR->w32[2] = tmp.w32[2];
4405 argR->w32[1] = tmp.w32[1];
4406 argR->w32[0] = tmp.w32[0];
4411 /*---------------------------------------------------------------*/
4412 /*--- Helpers for dealing with, and describing, ---*/
4413 /*--- guest state as a whole. ---*/
4414 /*---------------------------------------------------------------*/
4416 /* Initialise the entire amd64 guest state. */
4417 /* VISIBLE TO LIBVEX CLIENT */
4418 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4420 vex_state->host_EvC_FAILADDR = 0;
4421 vex_state->host_EvC_COUNTER = 0;
4422 vex_state->pad0 = 0;
4424 vex_state->guest_RAX = 0;
4425 vex_state->guest_RCX = 0;
4426 vex_state->guest_RDX = 0;
4427 vex_state->guest_RBX = 0;
4428 vex_state->guest_RSP = 0;
4429 vex_state->guest_RBP = 0;
4430 vex_state->guest_RSI = 0;
4431 vex_state->guest_RDI = 0;
4432 vex_state->guest_R8 = 0;
4433 vex_state->guest_R9 = 0;
4434 vex_state->guest_R10 = 0;
4435 vex_state->guest_R11 = 0;
4436 vex_state->guest_R12 = 0;
4437 vex_state->guest_R13 = 0;
4438 vex_state->guest_R14 = 0;
4439 vex_state->guest_R15 = 0;
4441 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
4442 vex_state->guest_CC_DEP1 = 0;
4443 vex_state->guest_CC_DEP2 = 0;
4444 vex_state->guest_CC_NDEP = 0;
4446 vex_state->guest_DFLAG = 1; /* forwards */
4447 vex_state->guest_IDFLAG = 0;
4448 vex_state->guest_ACFLAG = 0;
4450 /* HACK: represent the offset associated with a constant %fs.
4451 Typically, on linux, this assumes that %fs is only ever zero (main
4452 thread) or 0x63. */
4453 vex_state->guest_FS_CONST = 0;
4455 vex_state->guest_RIP = 0;
4457 /* Initialise the simulated FPU */
4458 amd64g_dirtyhelper_FINIT( vex_state );
4460 /* Initialise the AVX state. */
4461 # define AVXZERO(_ymm) \
4462 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4463 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4464 } while (0)
4465 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4466 AVXZERO(vex_state->guest_YMM0);
4467 AVXZERO(vex_state->guest_YMM1);
4468 AVXZERO(vex_state->guest_YMM2);
4469 AVXZERO(vex_state->guest_YMM3);
4470 AVXZERO(vex_state->guest_YMM4);
4471 AVXZERO(vex_state->guest_YMM5);
4472 AVXZERO(vex_state->guest_YMM6);
4473 AVXZERO(vex_state->guest_YMM7);
4474 AVXZERO(vex_state->guest_YMM8);
4475 AVXZERO(vex_state->guest_YMM9);
4476 AVXZERO(vex_state->guest_YMM10);
4477 AVXZERO(vex_state->guest_YMM11);
4478 AVXZERO(vex_state->guest_YMM12);
4479 AVXZERO(vex_state->guest_YMM13);
4480 AVXZERO(vex_state->guest_YMM14);
4481 AVXZERO(vex_state->guest_YMM15);
4482 AVXZERO(vex_state->guest_YMM16);
4484 # undef AVXZERO
4486 vex_state->guest_EMNOTE = EmNote_NONE;
4488 /* These should not ever be either read or written, but we
4489 initialise them anyway. */
4490 vex_state->guest_CMSTART = 0;
4491 vex_state->guest_CMLEN = 0;
4493 vex_state->guest_NRADDR = 0;
4494 vex_state->guest_SC_CLASS = 0;
4495 vex_state->guest_GS_CONST = 0;
4497 vex_state->guest_IP_AT_SYSCALL = 0;
4498 vex_state->pad1 = 0;
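/* Illustrative sketch only: how a LibVEX client might prepare a fresh
   guest state before dispatching translated code.  The RSP/RIP values
   below are arbitrary placeholders, not values the library itself
   prescribes. */
#if 0
static void example_client_setup ( VexGuestAMD64State* st )
{
   LibVEX_GuestAMD64_initialise(st);
   st->guest_RSP = 0x00007FFF00000000ULL;  /* hypothetical stack top   */
   st->guest_RIP = 0x0000000000400000ULL;  /* hypothetical entry point */
}
#endif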
4502 /* Figure out if any part of the guest state contained in minoff
4503 .. maxoff requires precise memory exceptions. If in doubt return
4504 True (but this generates significantly slower code).
4506 By default we enforce precise exns for guest %RSP, %RBP and %RIP
4507 only. These are the minimum needed to extract correct stack
4508 backtraces from amd64 code.
4510    Only %RSP is needed in mode VexRegUpdSpAtMemAccess. */
4512 Bool guest_amd64_state_requires_precise_mem_exns (
4513 Int minoff, Int maxoff, VexRegisterUpdates pxControl
4516 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4517 Int rbp_max = rbp_min + 8 - 1;
4518 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4519 Int rsp_max = rsp_min + 8 - 1;
4520 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4521 Int rip_max = rip_min + 8 - 1;
4523 if (maxoff < rsp_min || minoff > rsp_max) {
4524 /* no overlap with rsp */
4525 if (pxControl == VexRegUpdSpAtMemAccess)
4526 return False; // We only need to check stack pointer.
4527 } else {
4528 return True;
4531 if (maxoff < rbp_min || minoff > rbp_max) {
4532 /* no overlap with rbp */
4533 } else {
4534 return True;
4537 if (maxoff < rip_min || minoff > rip_max) {
4538 /* no overlap with rip */
4539 } else {
4540 return True;
4543 return False;
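/* Illustrative sketch only: querying the predicate above for a write
   that covers guest_RAX.  RAX is not one of RSP/RBP/RIP, so in the
   default update mode no precise exception handling is required.
   (Assumes the VexRegUpdUnwindregsAtMemAccess setting from libvex.h.) */
#if 0
static void example_precise_exns_query ( void )
{
   Int off = offsetof(VexGuestAMD64State, guest_RAX);
   vassert( guest_amd64_state_requires_precise_mem_exns(
               off, off + 7, VexRegUpdUnwindregsAtMemAccess )
            == False );
}
#endif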
4547 #define ALWAYSDEFD(field) \
4548 { offsetof(VexGuestAMD64State, field), \
4549 (sizeof ((VexGuestAMD64State*)0)->field) }
4551 VexGuestLayout
4552 amd64guest_layout
4554 /* Total size of the guest state, in bytes. */
4555 .total_sizeB = sizeof(VexGuestAMD64State),
4557 /* Describe the stack pointer. */
4558 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4559 .sizeof_SP = 8,
4561 /* Describe the frame pointer. */
4562 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4563 .sizeof_FP = 8,
4565 /* Describe the instruction pointer. */
4566 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4567 .sizeof_IP = 8,
4569 /* Describe any sections to be regarded by Memcheck as
4570 'always-defined'. */
4571 .n_alwaysDefd = 16,
4573 /* flags thunk: OP and NDEP are always defd, whereas DEP1
4574 and DEP2 have to be tracked. See detailed comment in
4575 gdefs.h on meaning of thunk fields. */
4576 .alwaysDefd
4577 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
4578 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
4579 /* 2 */ ALWAYSDEFD(guest_DFLAG),
4580 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
4581 /* 4 */ ALWAYSDEFD(guest_RIP),
4582 /* 5 */ ALWAYSDEFD(guest_FS_CONST),
4583 /* 6 */ ALWAYSDEFD(guest_FTOP),
4584 /* 7 */ ALWAYSDEFD(guest_FPTAG),
4585 /* 8 */ ALWAYSDEFD(guest_FPROUND),
4586 /* 9 */ ALWAYSDEFD(guest_FC3210),
4587 // /* */ ALWAYSDEFD(guest_CS),
4588 // /* */ ALWAYSDEFD(guest_DS),
4589 // /* */ ALWAYSDEFD(guest_ES),
4590 // /* */ ALWAYSDEFD(guest_FS),
4591 // /* */ ALWAYSDEFD(guest_GS),
4592 // /* */ ALWAYSDEFD(guest_SS),
4593 // /* */ ALWAYSDEFD(guest_LDT),
4594 // /* */ ALWAYSDEFD(guest_GDT),
4595 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4596 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4597 /* 12 */ ALWAYSDEFD(guest_CMSTART),
4598 /* 13 */ ALWAYSDEFD(guest_CMLEN),
4599 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4600 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4605 /*---------------------------------------------------------------*/
4606 /*--- end guest_amd64_helpers.c ---*/
4607 /*---------------------------------------------------------------*/