/*---------------------------------------------------------------*/
/*--- begin                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_amd64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_amd64_defs.h"
#include "guest_generic_x87.h"
/* This file contains helper functions for amd64 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and
   this file will be compiled to host machine code, so that
   all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest_amd64_toIR.c.

   The convention used is that all functions called from generated
   code are named amd64g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_RFLAGS 0
/*---------------------------------------------------------------*/
/*--- %rflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/
/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
   after 64-bit multiplies. */

static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   const Long halfMask = 0xFFFFFFFFLL;
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = (Long)((ULong)u * (ULong)v);
}

static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong halfMask = 0xFFFFFFFFULL;
   ULong u0, v0, w0;
   ULong u1, v1, w1, w2, t;
   u0   = u & halfMask;
   u1   = u >> 32;
   v0   = v & halfMask;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & halfMask;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   *rLo = u * v;
}
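
/* Illustrative sketch only (not part of the original helpers): on
   compilers that provide the unsigned __int128 extension (gcc/clang on
   64-bit hosts), the schoolbook split above can be cross-checked against
   a one-line reference.  Kept out of the build on purpose. */
#if 0
static void mullU64_ref ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   /* Assumes unsigned __int128 support; never used by VEX itself. */
   unsigned __int128 w = (unsigned __int128)u * (unsigned __int128)v;
   *rHi = (ULong)(w >> 64);
   *rLo = (ULong)w;
}
#endif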
static const UChar parity_table[256] = {
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
    0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
};
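
/* The table above encodes the x86 parity flag: entry i is
   AMD64G_CC_MASK_P exactly when the low 8 bits of i contain an even
   number of 1 bits.  A small self-check sketch that could regenerate it
   (illustrative only, deliberately not compiled): */
#if 0
static void check_parity_table ( void )
{
   UInt i;
   for (i = 0; i < 256; i++) {
      UInt bits = 0, x = i;
      while (x) { bits += x & 1; x >>= 1; }
      vassert(parity_table[i] == ((bits & 1) ? 0 : AMD64G_CC_MASK_P));
   }
}
#endif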
/* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
{
   if (n >= 0)
      return (ULong)x << n;
   else
      return x >> (-n);
}

/* identity on ULong */
static inline ULong idULong ( ULong x )
{
   return x;
}


#define PREAMBLE(__data_bits)                                   \
   /* const */ ULong DATA_MASK                                  \
      = __data_bits==8                                          \
           ? 0xFFULL                                            \
           : (__data_bits==16                                   \
                ? 0xFFFFULL                                     \
                : (__data_bits==32                              \
                     ? 0xFFFFFFFFULL                            \
                     : 0xFFFFFFFFFFFFFFFFULL));                 \
   /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1);     \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   /* Four bogus assignments, which hopefully gcc can     */    \
   /* optimise away, and which stop it complaining about  */    \
   /* unused variables.                                   */    \
   SIGN_MASK = SIGN_MASK;                                       \
   DATA_MASK = DATA_MASK;                                       \
   CC_DEP2 = CC_DEP2;                                           \
   CC_NDEP = CC_NDEP;
/*-------------------------------------------------------------*/

#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
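
/* For reference when reading the ACTIONS_* macros: the helpers build a
   value laid out like the guest %rflags, i.e. CF at bit 0, PF at bit 2,
   AF at bit 4, ZF at bit 6, SF at bit 7 and OF at bit 11.  Hence the
   constants above: "<< 6" places ZF, "& 0x80" keeps SF, and the
   lshift(.., 8 - DATA_BITS) / lshift(.., 12 - DATA_BITS) idioms move the
   sign bit of a DATA_BITS-wide value to bit 7 (SF) or bit 11 (OF).  For
   example, with DATA_BITS == 32, lshift(res, 8 - 32) is res >> 24, which
   lands bit 31 of res at bit 7. */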
/*-------------------------------------------------------------*/

#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR ^ -1) & (argL ^ res),             \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
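
/* A note on the ADC/SBB (and ADX) thunks: per the thunk convention in
   guest_amd64_defs.h, the front end stores argR ^ oldC in CC_DEP2 rather
   than argR itself, which is why the macros recover the real right operand
   with "argR = CC_DEP2 ^ oldC".  The carry-out also needs the oldC split:
   when oldC is 1, res == argL can only happen after a wrap-around, so "<="
   is the correct test.  Worked 8-bit example: argL = 0xFF, argR = 0x00,
   oldC = 1 gives res = 0x00 and hence cf = 1. */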
/*-------------------------------------------------------------*/

#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = ((res & DATA_MASK)                                    \
           == ((ULong)SIGN_MASK - 1)) << 11;                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;      \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = CC_DEP2 & 1;                                          \
     pf = parity_table[(UChar)CC_DEP1];                         \
     af = 0; /* undefined */                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     /* of is defined if shift count == 1 */                    \
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & CC_DEP1)                        \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                        11-(DATA_BITS-1))       \
                     ^ lshift(CC_DEP1, 11)));                   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong fl                                                   \
        = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C))    \
          | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))     \
          | (AMD64G_CC_MASK_O & (lshift(CC_DEP1,                \
                                        11-(DATA_BITS-1))       \
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));   \
     return fl;                                                 \
   }                                                            \
}

/*-------------------------------------------------------------*/
#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((DATA_UTYPE)CC_DEP1)                      \
                     * ((DATA_UTYPE)CC_DEP2) );                 \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong lo, hi;                                              \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo );       \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong cf, pf, af, zf, sf, of;                              \
     Long lo, hi;                                               \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo );         \
     cf = (hi != (lo >>/*s*/ (64-1)));                          \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - 64) & 0x80;                            \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = 0;                                                    \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 != 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE)                    \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = 0;                                                    \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE)                      \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     cf = ((DATA_UTYPE)CC_DEP2 == 0);                           \
     pf = 0;                                                    \
     af = 0;                                                    \
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;                      \
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;                \
     of = 0;                                                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}

/*-------------------------------------------------------------*/

#define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME)              \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong ocf; /* o or c */                                    \
     ULong argL, argR, oldOC, res;                              \
     oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1;       \
     argL  = CC_DEP1;                                           \
     argR  = CC_DEP2 ^ oldOC;                                   \
     res   = (argL + argR) + oldOC;                             \
     if (oldOC)                                                 \
        ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;              \
     else                                                       \
        ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL;               \
     return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME)              \
            | (ocf << AMD64G_CC_SHIFT_##FLAGNAME);              \
   }                                                            \
}

/*-------------------------------------------------------------*/
#if PROFILE_RFLAGS

static Bool initted = False;

/* C flag, fast route */
static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


static void showCounts ( void )
{
   Int op, co;
   HChar ch;
   vex_printf("\nTotal calls: calc_all=%u  calc_cond=%u  calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c );

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {

      ch = ' ';
      if (op > 0 && (op-1) % 4 == 0)
         ch = 'B';
      if (op > 0 && (op-1) % 4 == 1)
         ch = 'W';
      if (op > 0 && (op-1) % 4 == 2)
         ch = 'L';
      if (op > 0 && (op-1) % 4 == 3)
         ch = 'Q';

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);
      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n >= 1000) {
            vex_printf(" %3dK", n / 1000);
         } else {
            vex_printf(" %3d ", n );
         }
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

static void initCounts ( void )
{
   Int op, co;
   initted = True;
   for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
      tabc_fast[op] = tabc_slow[op] = 0;
      for (co = 0; co < 16; co++)
         tab_cond[op][co] = 0;
   }
}

#endif /* PROFILE_RFLAGS */
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code. */
static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                        ULong cc_dep1_formal,
                                        ULong cc_dep2_formal,
                                        ULong cc_ndep_formal )
{
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return cc_dep1_formal
                & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

      case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

      case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
      case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

      case AMD64G_CC_OP_SUBB:   ACTIONS_SUB( 8,  UChar  );
      case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
      case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

      case AMD64G_CC_OP_SBBB:   ACTIONS_SBB( 8,  UChar  );
      case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
      case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

      case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8,  UChar  );
      case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
      case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

      case AMD64G_CC_OP_INCB:   ACTIONS_INC( 8,  UChar  );
      case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
      case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

      case AMD64G_CC_OP_DECB:   ACTIONS_DEC( 8,  UChar  );
      case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
      case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

      case AMD64G_CC_OP_SHLB:   ACTIONS_SHL( 8,  UChar  );
      case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
      case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

      case AMD64G_CC_OP_SHRB:   ACTIONS_SHR( 8,  UChar  );
      case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
      case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

      case AMD64G_CC_OP_ROLB:   ACTIONS_ROL( 8,  UChar  );
      case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
      case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

      case AMD64G_CC_OP_RORB:   ACTIONS_ROR( 8,  UChar  );
      case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
      case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

      case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                  UShort, toUShort );
      case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                  UInt,   toUInt );
      case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                  ULong,  idULong );

      case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

      case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                  Short,  toUShort );
      case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                  Int,    toUInt );
      case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                  Long,   idULong );

      case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

      case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt   );
      case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong  );

      case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt   );
      case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong  );

      case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt  );
      case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );

      case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt   );
      case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong  );

      case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt,  C );
      case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );

      case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt,  O );
      case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                    "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
   }
}
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif
   return
      amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case AMD64G_CC_OP_COPY:
         return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
      case AMD64G_CC_OP_LOGICQ:
      case AMD64G_CC_OP_LOGICL:
      case AMD64G_CC_OP_LOGICW:
      case AMD64G_CC_OP_LOGICB:
         return 0;
         //      case AMD64G_CC_OP_SUBL:
         //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
         //                   ? AMD64G_CC_MASK_C : 0;
         //      case AMD64G_CC_OP_SUBW:
         //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
         //                   ? AMD64G_CC_MASK_C : 0;
         //      case AMD64G_CC_OP_SUBB:
         //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
         //                   ? AMD64G_CC_MASK_C : 0;
         //      case AMD64G_CC_OP_INCL:
         //      case AMD64G_CC_OP_DECL:
         //         return cc_ndep & AMD64G_CC_MASK_C;
      default:
         break;
   }

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong of,sf,zf,cf,pf;
   ULong inv = cond & 1;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO: /* OF == 1 */
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case AMD64CondNZ:
      case AMD64CondZ: /* ZF == 1 */
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case AMD64CondNB:
      case AMD64CondB: /* CF == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case AMD64CondNBE:
      case AMD64CondBE: /* (CF or ZF) == 1 */
         cf = rflags >> AMD64G_CC_SHIFT_C;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case AMD64CondNS:
      case AMD64CondS: /* SF == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case AMD64CondNP:
      case AMD64CondP: /* PF == 1 */
         pf = rflags >> AMD64G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case AMD64CondNL:
      case AMD64CondL: /* (SF xor OF) == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case AMD64CondNLE:
      case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = rflags >> AMD64G_CC_SHIFT_S;
         of = rflags >> AMD64G_CC_SHIFT_O;
         zf = rflags >> AMD64G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }
}
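
/* For reference: the AMD64Condcode values come in complementary pairs,
   with the negated form at the odd encoding -- O=0/NO=1, B=2/NB=3,
   Z=4/NZ=5, BE=6/NBE=7, S=8/NS=9, P=10/NP=11, L=12/NL=13, LE=14/NLE=15.
   That is why "inv = cond & 1" above suffices to handle both members of
   each pair with a single xor at the end. */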
/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   Long dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == -1);
   if (dflag == -1)
      rflags |= (1<<10);
   if (vex_state->guest_IDFLAG == 1)
      rflags |= (1<<21);
   if (vex_state->guest_ACFLAG == 1)
      rflags |= (1<<18);

   return rflags;
}
/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflags ( ULong rflags,
                               /*MOD*/VexGuestAMD64State* vex_state )
{
   /* D flag */
   if (rflags & AMD64G_CC_MASK_D) {
      vex_state->guest_DFLAG = -1;
      rflags &= ~AMD64G_CC_MASK_D;
   }
   else
      vex_state->guest_DFLAG = 1;

   /* ID flag */
   if (rflags & AMD64G_CC_MASK_ID) {
      vex_state->guest_IDFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_ID;
   }
   else
      vex_state->guest_IDFLAG = 0;

   /* AC flag */
   if (rflags & AMD64G_CC_MASK_AC) {
      vex_state->guest_ACFLAG = 1;
      rflags &= ~AMD64G_CC_MASK_AC;
   }
   else
      vex_state->guest_ACFLAG = 0;

   UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
                  AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = rflags & cc_mask;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
/* VISIBLE TO LIBVEX CLIENT */
void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
                                /*MOD*/VexGuestAMD64State* vex_state )
{
   ULong oszacp = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
   if (new_carry_flag & 1) {
      oszacp |= AMD64G_CC_MASK_C;
   } else {
      oszacp &= ~AMD64G_CC_MASK_C;
   }
   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = oszacp;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}
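
/* A minimal client-side usage sketch for the LibVEX_* entry points above
   (illustrative only; assumes a VexGuestAMD64State that the client has
   already initialised, e.g. via LibVEX_GuestAMD64_initialise).  Kept out
   of the build. */
#if 0
static void example_flag_roundtrip ( VexGuestAMD64State* st )
{
   ULong flags = LibVEX_GuestAMD64_get_rflags(st);   /* read back %rflags */
   LibVEX_GuestAMD64_put_rflag_c(1, st);             /* force CF = 1      */
   flags = LibVEX_GuestAMD64_get_rflags(st);
   vassert(flags & AMD64G_CC_MASK_C);
   LibVEX_GuestAMD64_put_rflags(flags & ~AMD64G_CC_MASK_C, st); /* clear CF */
}
#endif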
/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls the above run-time    ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

static inline Bool isU64 ( IRExpr* e, ULong n )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == n;
}

/* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
   and zero in any other case. */
static Int isU64_1_shl_N ( IRExpr* e )
{
   if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
      return 0;
   ULong w64 = e->Iex.Const.con->Ico.U64;
   if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
      return 0;
   if ((w64 & (w64 - 1)) != 0)
      return 0;
   /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
      and we only need to find out which one it is. */
   for (Int n = 1; n <= 31; n++) {
      if (w64 == (1ULL << n))
         return n;
   }
   /* Consequently we should never get here. */
   /*UNREACHED*/
   vassert(0);
   return 0;
}
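
/* Worked example: a U64 const IRExpr holding 0x100 (== 1 << 8) makes
   isU64_1_shl_N return 8; values such as 1, 0x180 (not a power of two)
   or 1ULL << 32 (outside the accepted 2^1 .. 2^31 range) all return 0. */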
IRExpr* guest_amd64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];
      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- ADDL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
         /* This is very commonly generated by Javascript JITs, for
            the idiom "do a 32-bit add and jump to out-of-line code if
            an overflow occurs". */
         /* long add, then O (overflow)
            --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
            --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
            --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        binop(Iop_And64,
                              unop(Iop_Not64,
                                   binop(Iop_Xor64, cc_dep1, cc_dep2)),
                              binop(Iop_Xor64, cc_dep1,
                                    binop(Iop_Add64, cc_dep1, cc_dep2))),
                        mkU8(31)),
                  mkU64(1));
      }
      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
         /* long long sub/cmp, then O (overflow)
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
            --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
         */
         vassert(isIRAtom(cc_dep1));
         vassert(isIRAtom(cc_dep2));
         return binop(Iop_Shr64,
                      binop(Iop_And64,
                            binop(Iop_Xor64, cc_dep1, cc_dep2),
                            binop(Iop_Xor64, cc_dep1,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2))),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
         /* No action.  Never yet found a test case. */
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
         /* long long sub/cmp, then S (negative)
            --> (dst-src)[63]
            --> (dst-src) >>u 63 */
         return binop(Iop_Shr64,
                      binop(Iop_Sub64, cc_dep1, cc_dep2),
                      mkU8(63));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
         /* long long sub/cmp, then NS (not negative)
            --> (dst-src)[63] ^ 1
            --> ((dst-src) >>u 63) ^ 1 */
         return binop(Iop_Xor64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(63)),
                      mkU64(1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
         /* long long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
         /* long long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
      }
1231 if (isU64(cc_op
, AMD64G_CC_OP_SUBL
) && isU64(cond
, AMD64CondO
)) {
1232 /* This is very commonly generated by Javascript JITs, for
1233 the idiom "do a 32-bit subtract and jump to out-of-line
1234 code if an overflow occurs". */
1235 /* long sub/cmp, then O (overflow)
1236 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1237 --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1239 vassert(isIRAtom(cc_dep1
));
1240 vassert(isIRAtom(cc_dep2
));
1245 binop(Iop_Xor64
, cc_dep1
, cc_dep2
),
1248 binop(Iop_Sub64
, cc_dep1
, cc_dep2
))),
1252 if (isU64(cc_op
, AMD64G_CC_OP_SUBL
) && isU64(cond
, AMD64CondNO
)) {
1253 /* No action. Never yet found a test case. */
1258 /* It appears that LLVM 5.0 and later have a new way to find out
1259 whether the top N bits of a word W are all zero, by computing
1261 W <u 0---(N-1)---0 1 0---0
1263 In particular, the result will be defined if the top N bits of W
1264 are defined, even if the trailing bits -- those corresponding to
1265 the 0---0 section -- are undefined. Rather than make Memcheck
1266 more complex, we detect this case where we can and shift out the
1267 irrelevant and potentially undefined bits. */
1269 if (isU64(cc_op
, AMD64G_CC_OP_SUBL
)
1270 && (isU64(cond
, AMD64CondB
) || isU64(cond
, AMD64CondNB
))
1271 && (n
= isU64_1_shl_N(cc_dep2
)) > 0) {
1272 /* long sub/cmp, then B (unsigned less than),
1273 where dep2 is a power of 2:
1274 -> CmpLT32(dep1, 1 << N)
1275 -> CmpEQ32(dep1 >>u N, 0)
1277 long sub/cmp, then NB (unsigned greater than or equal),
1278 where dep2 is a power of 2:
1279 -> CmpGE32(dep1, 1 << N)
1280 -> CmpNE32(dep1 >>u N, 0)
1281 This avoids CmpLT32U/CmpGE32U being applied to potentially
1282 uninitialised bits in the area being shifted out. */
1283 vassert(n
>= 1 && n
<= 31);
1284 Bool isNB
= isU64(cond
, AMD64CondNB
);
1285 return unop(Iop_1Uto64
,
1286 binop(isNB
? Iop_CmpNE32
: Iop_CmpEQ32
,
1287 binop(Iop_Shr32
, unop(Iop_64to32
, cc_dep1
),
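
      /* The rewrite above relies on the identity, for a 32-bit value w and
         1 <= k <= 31:   (w <u (1u << k))  ==  ((w >> k) == 0)
         i.e. w is below 2^k exactly when its top 32-k bits are all zero,
         which is what lets the shift discard the (possibly undefined) low
         bits without changing the outcome.  A quick illustrative check,
         deliberately never compiled in: */
#     if 0
      for (UInt w = 0; w < 0x10000; w++)
         for (UInt k = 1; k <= 31; k++)
            vassert( (w < (1u << k)) == ((w >> k) == 0) );
#     endif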
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative)
            --> (dst-src)[31]
            --> ((dst -64 src) >>u 31) & 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, cc_dep2),
                            mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
         /* long sub/cmp, then NS (not negative)
            --> (dst-src)[31] ^ 1
            --> (((dst -64 src) >>u 31) & 1) ^ 1
            Pointless to narrow the args to 32 bit before the subtract. */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,
                                  binop(Iop_Sub64, cc_dep1, cc_dep2),
                                  mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal)
            --> test dst >=s src
            --> test src <=s dst */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }
      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
         /* word sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_Shl64, cc_dep1, mkU8(48)),
                           binop(Iop_Shl64, cc_dep2, mkU8(48))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
          && isU64(cc_dep2, 0)) {
         /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[15]
            This is yet another scheme by which clang figures out if the
            top bit of a word is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 16-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(15)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
          && isU64(cc_dep2, 0)) {
         /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[15]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(15)),
                            mkU64(1)),
                      mkU64(1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));
      }
      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
         /* byte sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
         /* byte sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep2, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep1, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }
      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then S --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64, cc_dep1, mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
         /* word and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (UInt)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (UInt)!result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }
      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- SHRQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
         /* SHRQ, then Z --> test dep1 == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
         /* SHRQ, then NZ --> test dep1 != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      /*---------------- SHRL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
         /* SHRL, then Z --> test dep1 == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
         /* SHRL, then NZ --> test dep1 != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
         /* SHRL/SARL, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      // The following looks correct to me, but never seems to happen because
      // the front end converts jns to js by switching the fallthrough vs
      // taken addresses.  See jcc_01().  But then why do other conditions
      // considered by this function show up in both variants (xx and Nxx) ?
      //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
      //   /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
      //   return binop(Iop_Xor64,
      //                binop(Iop_And64,
      //                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
      //                      mkU64(1)),
      //                mkU64(1));
      //}
      /*---------------- COPY ----------------*/
      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
         jbe" for example. */

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test (C
            or Z == 1). */
         /* COPY, then NBE --> extract C and Z from dep1, and test (C
            or Z == 0). */
         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(
                        Iop_Or64,
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                     ),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         /* COPY, then NB --> extract C from dep1, and test (C == 0). */
         ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         /* COPY, then NP --> extract P from dep1, and test (P == 0). */
         ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

   }
   /* --------- specialising "amd64g_calculate_rflags_c" --------- */

   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
      /* specialise calls to above "calculate_rflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           cc_dep1,
                           cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
         /* C after add denotes sum <u either arg */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           cc_dep1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
         /* C after add denotes sum <u either arg */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
                           unop(Iop_64to32, cc_dep1)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU64(0);
      }
      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }

#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU64
#  undef mkU32
#  undef mkU8

   return NULL;
}
/*---------------------------------------------------------------*/
/*--- Supporting functions for x87 FPU activities.            ---*/
/*---------------------------------------------------------------*/

static inline Bool host_is_little_endian ( void )
{
   UInt x = 0x76543210;
   UChar* p = (UChar*)(&x);
   return toBool(*p == 0x10);
}

/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
{
   Bool   mantissaIsZero;
   Int    bexp;
   UChar  sign;
   UChar* f64;

   vassert(host_is_little_endian());

   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */

   f64  = (UChar*)(&dbl);
   sign = toUChar( (f64[7] >> 7) & 1 );

   /* First off, if the tag indicates the register was empty,
      return 1,0,sign,1 */
   if (tag == 0) {
      /* vex_printf("Empty\n"); */
      return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
                               | AMD64G_FC_MASK_C0;
   }

   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
   bexp &= 0x7FF;

   mantissaIsZero
      = toBool(
           (f64[6] & 0x0F) == 0
           && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
        );

   /* If both exponent and mantissa are zero, the value is zero.
      Return 1,0,sign,0. */
   if (bexp == 0 && mantissaIsZero) {
      /* vex_printf("Zero\n"); */
      return AMD64G_FC_MASK_C3 | 0
                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
   }

   /* If exponent is zero but mantissa isn't, it's a denormal.
      Return 1,1,sign,0. */
   if (bexp == 0 && !mantissaIsZero) {
      /* vex_printf("Denormal\n"); */
      return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
                               | (sign << AMD64G_FC_SHIFT_C1) | 0;
   }

   /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
      Return 0,1,sign,1. */
   if (bexp == 0x7FF && mantissaIsZero) {
      /* vex_printf("Inf\n"); */
      return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
               | AMD64G_FC_MASK_C0;
   }

   /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
      Return 0,0,sign,1. */
   if (bexp == 0x7FF && !mantissaIsZero) {
      /* vex_printf("NaN\n"); */
      return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
   }

   /* Uh, ok, we give up.  It must be a normal finite number.
      Return 0,1,sign,0.
   */
   /* vex_printf("normal\n"); */
   return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
}
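
/* Summary of the FXAM condition-code encodings produced above
   (C1 always carries the sign bit):

      class      C3 C2 C0
      Empty       1  0  1
      Zero        1  0  0
      Denormal    1  1  0
      Infinity    0  1  1
      NaN         0  0  1
      Normal      0  1  0
*/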
/* This is used to implement both 'frstor' and 'fldenv'.  The latter
   appears to differ from the former only in that the 8 FP registers
   themselves are not transferred into the guest state. */
static
VexEmNote do_put_x87 ( Bool moveRegs,
                       /*IN*/Fpu_State* x87_state,
                       /*OUT*/VexGuestAMD64State* vex_state )
{
   Int        stno, preg;
   UInt       tag;
   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   UInt       ftop    = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
   UInt       tagw    = x87_state->env[FP_ENV_TAG];
   UInt       fpucw   = x87_state->env[FP_ENV_CTRL];
   UInt       c3210   = x87_state->env[FP_ENV_STAT] & 0x4700;
   VexEmNote  ew;
   UInt       fpround;
   ULong      pair;

   /* Copy registers and tags */
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      tag = (tagw >> (2*preg)) & 3;
      if (tag == 3) {
         /* register is empty */
         /* hmm, if it's empty, does it still get written?  Probably
            safer to say it does.  If we don't, memcheck could get out
            of sync, in that it thinks all FP registers are defined by
            this helper, but in reality some have not been updated. */
         if (moveRegs)
            vexRegs[preg] = 0; /* IEEE754 64-bit zero */
         vexTags[preg] = 0;
      } else {
         /* register is non-empty */
         if (moveRegs)
            convert_f80le_to_f64le( &x87_state->reg[10*stno],
                                    (UChar*)&vexRegs[preg] );
         vexTags[preg] = 1;
      }
   }

   /* stack pointer */
   vex_state->guest_FTOP = ftop;

   /* status word */
   vex_state->guest_FC3210 = c3210;

   /* handle the control word, setting FPROUND and detecting any
      emulation warnings. */
   pair    = amd64g_check_fldcw ( (ULong)fpucw );
   fpround = (UInt)pair & 0xFFFFFFFFULL;
   ew      = (VexEmNote)(pair >> 32);

   vex_state->guest_FPROUND = fpround & 3;

   /* emulation warnings --> caller */
   return ew;
}

/* Create an x87 FPU state from the guest state, as close as
   we can approximate it. */
static
void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
                  /*OUT*/Fpu_State* x87_state )
{
   Int        i, stno, preg;
   UInt       tagw;
   ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   UInt       ftop    = vex_state->guest_FTOP;
   UInt       c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 14; i++)
      x87_state->env[i] = 0;

   x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
      = x87_state->env[13] = 0xFFFF;
   x87_state->env[FP_ENV_STAT]
      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   x87_state->env[FP_ENV_CTRL]
      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));

   /* Dump the register stack in ST order. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87_state->reg[10*stno] );
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87_state->reg[10*stno] );
      }
   }
   x87_state->env[FP_ENV_TAG] = toUShort(tagw);
}
2125 /*---------------------------------------------------------------*/
2126 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2127 /*---------------------------------------------------------------*/
2129 /* CALLED FROM GENERATED CODE */
2130 /* DIRTY HELPER (reads guest state, writes guest mem) */
2131 /* XSAVE component 0 is the x87 FPU state. */
2132 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2133 ( VexGuestAMD64State
* gst
, HWord addr
)
2135 /* Derived from values obtained from
2136 vendor_id : AuthenticAMD
2139 model name : AMD Athlon(tm) 64 Processor 3200+
2144 /* Somewhat roundabout, but at least it's simple. */
2146 UShort
* addrS
= (UShort
*)addr
;
2147 UChar
* addrC
= (UChar
*)addr
;
2151 UShort
*srcS
, *dstS
;
2153 do_get_x87( gst
, &tmp
);
2155 /* Now build the proper fxsave x87 image from the fsave x87 image
2158 addrS
[0] = tmp
.env
[FP_ENV_CTRL
]; /* FCW: fpu control word */
2159 addrS
[1] = tmp
.env
[FP_ENV_STAT
]; /* FCW: fpu status word */
2161 /* set addrS[2] in an endian-independent way */
2163 fp_tags
= tmp
.env
[FP_ENV_TAG
];
2164 for (r
= 0; r
< 8; r
++) {
2165 if ( ((fp_tags
>> (2*r
)) & 3) != 3 )
2166 summary_tags
|= (1 << r
);
2168 addrC
[4] = toUChar(summary_tags
); /* FTW: tag summary byte */
2169 addrC
[5] = 0; /* pad */
2171 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2172 does not write this field. (?!) */
2173 addrS
[3] = 0; /* BOGUS */
2175 /* RIP (Last x87 instruction pointer). From experimentation, the
2176 real CPU does not write this field. (?!) */
2177 addrS
[4] = 0; /* BOGUS */
2178 addrS
[5] = 0; /* BOGUS */
2179 addrS
[6] = 0; /* BOGUS */
2180 addrS
[7] = 0; /* BOGUS */
2182 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2183 does not write this field. (?!) */
2184 addrS
[8] = 0; /* BOGUS */
2185 addrS
[9] = 0; /* BOGUS */
2186 addrS
[10] = 0; /* BOGUS */
2187 addrS
[11] = 0; /* BOGUS */
2189 /* addrS[13,12] are MXCSR -- not written */
2190 /* addrS[15,14] are MXCSR_MASK -- not written */
2192 /* Copy in the FP registers, in ST order. */
2193 for (stno
= 0; stno
< 8; stno
++) {
2194 srcS
= (UShort
*)(&tmp
.reg
[10*stno
]);
2195 dstS
= (UShort
*)(&addrS
[16 + 8*stno
]);
2208 /* CALLED FROM GENERATED CODE */
2209 /* DIRTY HELPER (reads guest state, writes guest mem) */
2210 /* XSAVE component 1 is the SSE state. */
2211 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2212 ( VexGuestAMD64State
* gst
, HWord addr
)
2214 UShort
* addrS
= (UShort
*)addr
;
2217 /* The only non-register parts of the SSE state are MXCSR and
2219 mxcsr
= amd64g_create_mxcsr( gst
->guest_SSEROUND
);
2221 addrS
[12] = toUShort(mxcsr
); /* MXCSR */
2222 addrS
[13] = toUShort(mxcsr
>> 16);
2224 addrS
[14] = 0xFFFF; /* MXCSR mask (lo16) */
2225 addrS
[15] = 0x0000; /* MXCSR mask (hi16) */
2229 /* VISIBLE TO LIBVEX CLIENT */
2230 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2231 the result at the given address which represents a buffer of at
2234 This function is not called from generated code. FXSAVE is dealt
2235 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2236 functions above plus some in-line IR. This function is merely a
2237 convenience function for VEX's users.
2239 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State
* gst
,
2240 /*OUT*/HWord fp_state
)
2242 /* Do the x87 part */
2243 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst
, fp_state
);
2245 /* And now the SSE part, except for the registers themselves. */
2246 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst
, fp_state
);
2248 /* That's the first 160 bytes of the image done. */
2249 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2250 big-endian, these need to be byte-swapped. */
2251 U128
*xmm
= (U128
*)(fp_state
+ 160);
2252 vassert(host_is_little_endian());
2254 # define COPY_U128(_dst,_src) \
2255 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2256 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2259 COPY_U128( xmm
[0], gst
->guest_YMM0
);
2260 COPY_U128( xmm
[1], gst
->guest_YMM1
);
2261 COPY_U128( xmm
[2], gst
->guest_YMM2
);
2262 COPY_U128( xmm
[3], gst
->guest_YMM3
);
2263 COPY_U128( xmm
[4], gst
->guest_YMM4
);
2264 COPY_U128( xmm
[5], gst
->guest_YMM5
);
2265 COPY_U128( xmm
[6], gst
->guest_YMM6
);
2266 COPY_U128( xmm
[7], gst
->guest_YMM7
);
2267 COPY_U128( xmm
[8], gst
->guest_YMM8
);
2268 COPY_U128( xmm
[9], gst
->guest_YMM9
);
2269 COPY_U128( xmm
[10], gst
->guest_YMM10
);
2270 COPY_U128( xmm
[11], gst
->guest_YMM11
);
2271 COPY_U128( xmm
[12], gst
->guest_YMM12
);
2272 COPY_U128( xmm
[13], gst
->guest_YMM13
);
2273 COPY_U128( xmm
[14], gst
->guest_YMM14
);
2274 COPY_U128( xmm
[15], gst
->guest_YMM15
);
2279 /*---------------------------------------------------------------*/
2280 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2281 /*---------------------------------------------------------------*/
2283 /* CALLED FROM GENERATED CODE */
2284 /* DIRTY HELPER (writes guest state, reads guest mem) */
2285 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2286 ( VexGuestAMD64State
* gst
, HWord addr
)
2289 UShort
* addrS
= (UShort
*)addr
;
2290 UChar
* addrC
= (UChar
*)addr
;
2294 /* Copy the x87 registers out of the image, into a temporary
2295 Fpu_State struct. */
2296 for (i
= 0; i
< 14; i
++) tmp
.env
[i
] = 0;
2297 for (i
= 0; i
< 80; i
++) tmp
.reg
[i
] = 0;
2298 /* fill in tmp.reg[0..7] */
2299 for (stno
= 0; stno
< 8; stno
++) {
2300 UShort
* dstS
= (UShort
*)(&tmp
.reg
[10*stno
]);
2301 UShort
* srcS
= (UShort
*)(&addrS
[16 + 8*stno
]);
2308 /* fill in tmp.env[0..13] */
2309 tmp
.env
[FP_ENV_CTRL
] = addrS
[0]; /* FCW: fpu control word */
2310 tmp
.env
[FP_ENV_STAT
] = addrS
[1]; /* FCW: fpu status word */
2313 for (r
= 0; r
< 8; r
++) {
2314 if (addrC
[4] & (1<<r
))
2315 fp_tags
|= (0 << (2*r
)); /* EMPTY */
2317 fp_tags
|= (3 << (2*r
)); /* VALID -- not really precise enough. */
2319 tmp
.env
[FP_ENV_TAG
] = fp_tags
;
2321 /* Now write 'tmp' into the guest state. */
2322 VexEmNote warnX87
= do_put_x87( True
/*moveRegs*/, &tmp
, gst
);
2328 /* CALLED FROM GENERATED CODE */
2329 /* DIRTY HELPER (writes guest state, reads guest mem) */
2330 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2331 ( VexGuestAMD64State
* gst
, HWord addr
)
2333 UShort
* addrS
= (UShort
*)addr
;
2334 UInt w32
= (((UInt
)addrS
[12]) & 0xFFFF)
2335 | ((((UInt
)addrS
[13]) & 0xFFFF) << 16);
2336 ULong w64
= amd64g_check_ldmxcsr( (ULong
)w32
);
2338 VexEmNote warnXMM
= (VexEmNote
)(w64
>> 32);
2340 gst
->guest_SSEROUND
= w64
& 0xFFFFFFFFULL
;
2345 /* VISIBLE TO LIBVEX CLIENT */
2346 /* Do FXRSTOR from the supplied address and store read values to the given
2347 VexGuestAMD64State structure.
2349 This function is not called from generated code. FXRSTOR is dealt
2350 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2351 functions above plus some in-line IR. This function is merely a
2352 convenience function for VEX's users.
2354 VexEmNote
LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state
,
2355 /*MOD*/VexGuestAMD64State
* gst
)
2357 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2358 to be byte-swapped. */
2359 U128
*xmm
= (U128
*)(fp_state
+ 160);
2361 vassert(host_is_little_endian());
2363 # define COPY_U128(_dst,_src) \
2364 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2365 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2368 COPY_U128( gst
->guest_YMM0
, xmm
[0] );
2369 COPY_U128( gst
->guest_YMM1
, xmm
[1] );
2370 COPY_U128( gst
->guest_YMM2
, xmm
[2] );
2371 COPY_U128( gst
->guest_YMM3
, xmm
[3] );
2372 COPY_U128( gst
->guest_YMM4
, xmm
[4] );
2373 COPY_U128( gst
->guest_YMM5
, xmm
[5] );
2374 COPY_U128( gst
->guest_YMM6
, xmm
[6] );
2375 COPY_U128( gst
->guest_YMM7
, xmm
[7] );
2376 COPY_U128( gst
->guest_YMM8
, xmm
[8] );
2377 COPY_U128( gst
->guest_YMM9
, xmm
[9] );
2378 COPY_U128( gst
->guest_YMM10
, xmm
[10] );
2379 COPY_U128( gst
->guest_YMM11
, xmm
[11] );
2380 COPY_U128( gst
->guest_YMM12
, xmm
[12] );
2381 COPY_U128( gst
->guest_YMM13
, xmm
[13] );
2382 COPY_U128( gst
->guest_YMM14
, xmm
[14] );
2383 COPY_U128( gst
->guest_YMM15
, xmm
[15] );
2388 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst
, fp_state
);
2390 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst
, fp_state
);
2392 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2393 if (warnX87
!= EmNote_NONE
)

/*---------------------------------------------------------------*/
/*--- Supporting functions for FSAVE/FRSTOR                   ---*/
/*---------------------------------------------------------------*/

/* DIRTY HELPER (writes guest state) */
/* Initialise the x87 FPU state as per 'finit'. */
void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
{
   Int i;
   gst->guest_FTOP = 0;
   for (i = 0; i < 8; i++) {
      gst->guest_FPTAG[i] = 0; /* empty */
      gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
   }
   gst->guest_FPROUND = (ULong)Irrm_NEAREST;
   gst->guest_FC3210  = 0;
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest memory) */
ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
{
   ULong f64;
   convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
   return f64;
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest memory) */
void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
{
   convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
}
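
/* The two helpers above implement the 80-bit memory forms of FLD/FSTP
   by converting through the guest's 64-bit register representation,
   so precision beyond an IEEE754 double is rounded away on load.  A
   minimal round-trip sketch (illustrative only, not called from
   anywhere; the function name is made up for the example): */
#if 0
static void example_f80_roundtrip ( void )
{
   UChar ext[10];
   ULong d = 0x400921FB54442D18ULL;  /* bit pattern of (double)pi */
   amd64g_dirtyhelper_storeF80le( (Addr)&ext[0], d );
   /* storing then reloading a value that fits in a double is exact */
   vassert(amd64g_dirtyhelper_loadF80le( (Addr)&ext[0] ) == d);
}
#endif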

/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* mxcsr[15:0] contains a SSE native format MXCSR value.
   Extract from it the required SSEROUND value and any resulting
   emulation warning, and return (warn << 32) | sseround value.
*/
ULong amd64g_check_ldmxcsr ( ULong mxcsr )
{
   /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
   /* NOTE, encoded exactly as per enum IRRoundingMode. */
   ULong rmode = (mxcsr >> 13) & 3;

   /* Detect any required emulation warnings. */
   VexEmNote ew = EmNote_NONE;

   if ((mxcsr & 0x1F80) != 0x1F80) {
      /* unmasked exceptions! */
      ew = EmWarn_X86_sseExns;
   }
   else
   if (mxcsr & (1<<15)) {
      /* FZ is set */
      ew = EmWarn_X86_fz;
   }
   else
   if (mxcsr & (1<<6)) {
      /* DAZ is set */
      ew = EmWarn_X86_daz;
   }

   return (((ULong)ew) << 32) | ((ULong)rmode);
}
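
/* The value returned above packs an emulation-warning code in the top
   32 bits and the new rounding mode in the bottom 32 bits.  A minimal
   host-side sketch of taking the pair apart (the real consumers are IR
   fragments built by the front end, plus the XRSTOR component-1 helper
   above; the function name here is made up for the example): */
#if 0
static void example_split_ldmxcsr_pair ( ULong mxcsr )
{
   ULong     pair     = amd64g_check_ldmxcsr( mxcsr );
   UInt      sseround = (UInt)(pair & 0xFFFFFFFFULL); /* IRRoundingMode */
   VexEmNote ew       = (VexEmNote)(pair >> 32);
   vex_printf("sseround=%u warn=%u\n", sseround, (UInt)ew);
}
#endif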

/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* Given sseround as an IRRoundingMode value, create a suitable SSE
   native format MXCSR value. */
ULong amd64g_create_mxcsr ( ULong sseround )
{
   sseround &= 3;
   return 0x1F80 | (sseround << 13);
}


/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* fpucw[15:0] contains a x87 native format FPU control word.
   Extract from it the required FPROUND value and any resulting
   emulation warning, and return (warn << 32) | fpround value.
*/
ULong amd64g_check_fldcw ( ULong fpucw )
{
   /* Decide on a rounding mode.  fpucw[11:10] holds it. */
   /* NOTE, encoded exactly as per enum IRRoundingMode. */
   ULong rmode = (fpucw >> 10) & 3;

   /* Detect any required emulation warnings. */
   VexEmNote ew = EmNote_NONE;

   if ((fpucw & 0x3F) != 0x3F) {
      /* unmasked exceptions! */
      ew = EmWarn_X86_x87exns;
   }
   else
   if (((fpucw >> 8) & 3) != 3) {
      /* unsupported precision */
      ew = EmWarn_X86_x87precision;
   }

   return (((ULong)ew) << 32) | ((ULong)rmode);
}


/* CALLED FROM GENERATED CODE */
/* CLEAN HELPER */
/* Given fpround as an IRRoundingMode value, create a suitable x87
   native format FPU control word. */
ULong amd64g_create_fpucw ( ULong fpround )
{
   fpround &= 3;
   return 0x037F | (fpround << 10);
}
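
/* Round-trip property relating the two control-word helpers above: a
   control word built by amd64g_create_fpucw should be accepted back by
   amd64g_check_fldcw with the same rounding mode and no emulation
   warning.  Illustrative check only, never called: */
#if 0
static void example_fpucw_roundtrip ( void )
{
   ULong rm;
   for (rm = 0; rm < 4; rm++) {
      ULong pair = amd64g_check_fldcw( amd64g_create_fpucw( rm ) );
      vassert((pair & 0xFFFFFFFFULL) == rm);            /* same mode back */
      vassert((VexEmNote)(pair >> 32) == EmNote_NONE);  /* no warning */
   }
}
#endif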

/* This is used to implement 'fldenv'.
   Reads 28 bytes at x87_state[0 .. 27]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
                                      /*IN*/HWord x87_state)
{
   return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
/* Create an x87 FPU env from the guest state, as close as we can
   approximate it.  Writes 28 bytes at x87_state[0..27]. */
void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
                                 /*OUT*/HWord x87_state )
{
   Int        i, stno, preg;
   UInt       tagw;
   UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State* x87     = (Fpu_State*)x87_state;
   UInt       ftop    = vex_state->guest_FTOP;
   ULong      c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 14; i++)
      x87->env[i] = 0;

   x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
   x87->env[FP_ENV_STAT]
      = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
   x87->env[FP_ENV_CTRL]
      = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));

   /* Compute the x87 tag word. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
      }
   }
   x87->env[FP_ENV_TAG] = toUShort(tagw);

   /* We don't dump the x87 registers, tho. */
}


/* This is used to implement 'fnsave'.
   Writes 108 bytes at x87_state[0 .. 107]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
                                 /*OUT*/HWord x87_state )
{
   do_get_x87( vex_state, (Fpu_State*)x87_state );
}

/* This is used to implement 'fnsaves'.
   Writes 94 bytes at x87_state[0 .. 93]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
                                  /*OUT*/HWord x87_state )
{
   Int           i, stno, preg;
   UInt          tagw;
   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   UInt          ftop    = vex_state->guest_FTOP;
   UInt          c3210   = vex_state->guest_FC3210;

   for (i = 0; i < 7; i++)
      x87->env[i] = 0;

   x87->env[FPS_ENV_STAT]
      = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
   x87->env[FPS_ENV_CTRL]
      = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));

   /* Dump the register stack in ST order. */
   tagw = 0;
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      if (vexTags[preg] == 0) {
         /* register is empty */
         tagw |= (3 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      } else {
         /* register is full. */
         tagw |= (0 << (2*preg));
         convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
                                 &x87->reg[10*stno] );
      }
   }
   x87->env[FPS_ENV_TAG] = toUShort(tagw);
}

/* This is used to implement 'frstor'.
   Reads 108 bytes at x87_state[0 .. 107]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
                                      /*IN*/HWord x87_state)
{
   return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
}


/* This is used to implement 'frstors'.
   Reads 94 bytes at x87_state[0 .. 93]. */
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER */
VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
                                       /*IN*/HWord x87_state)
{
   Int           stno, preg;
   UInt          tag;
   ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
   Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
   UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
   UInt          tagw    = x87->env[FPS_ENV_TAG];
   UInt          fpucw   = x87->env[FPS_ENV_CTRL];
   UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
   VexEmNote     ew;
   UInt          fpround;
   ULong         pair;

   /* Copy registers and tags */
   for (stno = 0; stno < 8; stno++) {
      preg = (stno + ftop) & 7;
      tag = (tagw >> (2*preg)) & 3;
      if (tag == 3) {
         /* register is empty */
         /* hmm, if it's empty, does it still get written?  Probably
            safer to say it does.  If we don't, memcheck could get out
            of sync, in that it thinks all FP registers are defined by
            this helper, but in reality some have not been updated. */
         vexRegs[preg] = 0; /* IEEE754 64-bit zero */
         vexTags[preg] = 0;
      } else {
         /* register is non-empty */
         convert_f80le_to_f64le( &x87->reg[10*stno],
                                 (UChar*)&vexRegs[preg] );
         vexTags[preg] = 1;
      }
   }

   /* stack pointer */
   vex_state->guest_FTOP = ftop;

   /* status word */
   vex_state->guest_FC3210 = c3210;

   /* handle the control word, setting FPROUND and detecting any
      emulation warnings. */
   pair    = amd64g_check_fldcw ( (ULong)fpucw );
   fpround = (UInt)pair & 0xFFFFFFFFULL;
   ew      = (VexEmNote)(pair >> 32);

   vex_state->guest_FPROUND = fpround & 3;

   /* emulation warnings --> caller */
   return ew;
}
2695 /*---------------------------------------------------------------*/
2696 /*--- CPUID helpers. ---*/
2697 /*---------------------------------------------------------------*/
2699 /* Claim to be the following CPU, which is probably representative of
2700 the lowliest (earliest) amd64 offerings. It can do neither sse3
2703 vendor_id : AuthenticAMD
2706 model name : AMD Opteron (tm) Processor 848
2709 cache size : 1024 KB
2714 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2715 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2716 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2718 TLB size : 1088 4K pages
2720 cache_alignment : 64
2721 address sizes : 40 bits physical, 48 bits virtual
2722 power management: ts fid vid ttp
2724 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2725 we don't support them. See #291568. 3dnow is 80000001.EDX.31
   and 3dnowext is 80000001.EDX.30.
*/
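
/* For reference, a guest program typically probes these bits by
   executing CPUID with EAX=0x80000001 and testing individual EDX bits
   (bit 31 = 3dnow, bit 30 = 3dnowext, both deliberately left clear
   here).  Guest-side sketch only, not part of VEX itself: */
#if 0
static UInt example_guest_probe_3dnow ( void )
{
   UInt eax = 0x80000001, ebx, ecx, edx;
   __asm__ __volatile__("cpuid"
                        : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));
   return (edx >> 31) & 1;   /* 1 iff 3dnow is advertised */
}
#endif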
2728 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State
* st
)
2730 # define SET_ABCD(_a,_b,_c,_d) \
2731 do { st->guest_RAX = (ULong)(_a); \
2732 st->guest_RBX = (ULong)(_b); \
2733 st->guest_RCX = (ULong)(_c); \
2734 st->guest_RDX = (ULong)(_d); \
2737 switch (0xFFFFFFFF & st
->guest_RAX
) {
2739 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2742 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2745 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2748 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
2749 the original it-is-supported value that the h/w provides.
2751 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2755 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2758 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2761 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2764 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2767 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2770 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2773 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2776 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2783 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2786 vendor_id : GenuineIntel
2789 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2792 cache size : 4096 KB
2801 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2802 mtrr pge mca cmov pat pse36 clflush dts acpi
2803 mmx fxsr sse sse2 ss ht tm syscall nx lm
2804 constant_tsc pni monitor ds_cpl vmx est tm2
2808 cache_alignment : 64
2809 address sizes : 36 bits physical, 48 bits virtual
2812 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State
* st
)
2814 # define SET_ABCD(_a,_b,_c,_d) \
2815 do { st->guest_RAX = (ULong)(_a); \
2816 st->guest_RBX = (ULong)(_b); \
2817 st->guest_RCX = (ULong)(_c); \
2818 st->guest_RDX = (ULong)(_d); \
2821 switch (0xFFFFFFFF & st
->guest_RAX
) {
2823 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2826 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2829 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2832 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2835 switch (0xFFFFFFFF & st
->guest_RCX
) {
2836 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2837 0x0000003f, 0x00000001); break;
2838 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2839 0x0000003f, 0x00000001); break;
2840 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2841 0x00000fff, 0x00000001); break;
2842 default: SET_ABCD(0x00000000, 0x00000000,
2843 0x00000000, 0x00000000); break;
2848 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2851 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2854 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2857 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2860 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2863 unhandled_eax_value
:
2864 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2867 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2870 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2873 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2876 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2879 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2882 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2885 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2888 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2891 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2894 goto unhandled_eax_value
;
2900 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2903 vendor_id : GenuineIntel
2906 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2909 cache size : 4096 KB
2920 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2921 mtrr pge mca cmov pat pse36 clflush dts acpi
2922 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2923 lm constant_tsc arch_perfmon pebs bts rep_good
2924 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2925 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2926 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2927 arat tpr_shadow vnmi flexpriority ept vpid
2930 cache_alignment : 64
2931 address sizes : 36 bits physical, 48 bits virtual
2934 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State
* st
)
2936 # define SET_ABCD(_a,_b,_c,_d) \
2937 do { st->guest_RAX = (ULong)(_a); \
2938 st->guest_RBX = (ULong)(_b); \
2939 st->guest_RCX = (ULong)(_c); \
2940 st->guest_RDX = (ULong)(_d); \
2943 UInt old_eax
= (UInt
)st
->guest_RAX
;
2944 UInt old_ecx
= (UInt
)st
->guest_RCX
;
2948 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2951 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2954 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2957 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2961 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2962 0x0000003f, 0x00000000); break;
2963 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2964 0x0000007f, 0x00000000); break;
2965 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2966 0x000001ff, 0x00000000); break;
2967 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2968 0x00000fff, 0x00000002); break;
2969 default: SET_ABCD(0x00000000, 0x00000000,
2970 0x00000000, 0x00000000); break;
2974 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2977 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2980 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2983 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2986 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2989 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2994 SET_ABCD(0x00000001, 0x00000002,
2995 0x00000100, 0x00000000); break;
2997 SET_ABCD(0x00000004, 0x00000004,
2998 0x00000201, 0x00000000); break;
3000 SET_ABCD(0x00000000, 0x00000000,
3001 old_ecx
, 0x00000000); break;
3005 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3009 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3010 0x00000100, 0x00000000); break;
3011 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3012 0x00000201, 0x00000000); break;
3013 default: SET_ABCD(0x00000000, 0x00000000,
3014 old_ecx
, 0x00000000); break;
3018 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3021 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3024 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3027 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3030 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3033 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3036 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3039 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3042 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3045 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3052 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3053 capable. Plus (kludge!) it "supports" HTM.
3055 Also with the following change: claim that XSaveOpt is not
3056 available, by cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
3057 on the real CPU. Consequently, programs that correctly observe
3058 these CPUID values should only try to use 3 of the 8 XSave-family
3059 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3060 having to implement the compacted or optimised save/restore
3063 vendor_id : GenuineIntel
3066 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3069 cache size : 6144 KB
3080 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3081 mtrr pge mca cmov pat pse36 clflush dts acpi
3082 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3083 lm constant_tsc arch_perfmon pebs bts rep_good
3084 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3085 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3086 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3087 lahf_lm ida arat epb xsaveopt pln pts dts
3088 tpr_shadow vnmi flexpriority ept vpid
3092 cache_alignment : 64
3093 address sizes : 36 bits physical, 48 bits virtual
3096 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State
* st
)
3098 # define SET_ABCD(_a,_b,_c,_d) \
3099 do { st->guest_RAX = (ULong)(_a); \
3100 st->guest_RBX = (ULong)(_b); \
3101 st->guest_RCX = (ULong)(_c); \
3102 st->guest_RDX = (ULong)(_d); \
3105 UInt old_eax
= (UInt
)st
->guest_RAX
;
3106 UInt old_ecx
= (UInt
)st
->guest_RCX
;
3110 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3113 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3116 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3119 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3123 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3124 0x0000003f, 0x00000000); break;
3125 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3126 0x0000003f, 0x00000000); break;
3127 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3128 0x000001ff, 0x00000000); break;
3129 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3130 0x00001fff, 0x00000006); break;
3131 default: SET_ABCD(0x00000000, 0x00000000,
3132 0x00000000, 0x00000000); break;
3136 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3139 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3142 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3145 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3148 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3151 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3156 SET_ABCD(0x00000001, 0x00000001,
3157 0x00000100, 0x00000000); break;
3159 SET_ABCD(0x00000004, 0x00000004,
3160 0x00000201, 0x00000000); break;
3162 SET_ABCD(0x00000000, 0x00000000,
3163 old_ecx
, 0x00000000); break;
3167 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3171 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3172 0x00000340, 0x00000000); break;
3173 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3174 0x00000000, 0x00000000); break;
3175 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3176 0x00000000, 0x00000000); break;
3177 default: SET_ABCD(0x00000000, 0x00000000,
3178 0x00000000, 0x00000000); break;
3182 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3185 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3188 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3191 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3194 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3197 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3200 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3203 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3206 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3209 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3212 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3215 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3222 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3224 With the following change: claim that XSaveOpt is not available, by
3225 cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
3226 CPU. Consequently, programs that correctly observe these CPUID
3227 values should only try to use 3 of the 8 XSave-family instructions:
3228 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3229 implement the compacted or optimised save/restore variants.
3231 vendor_id : GenuineIntel
3234 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3238 cache size : 8192 KB
3249 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3250 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3251 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3252 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3253 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3254 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3255 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3256 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3257 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3258 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3262 cache_alignment : 64
3263 address sizes : 39 bits physical, 48 bits virtual
3266 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State
* st
)
3268 # define SET_ABCD(_a,_b,_c,_d) \
3269 do { st->guest_RAX = (ULong)(_a); \
3270 st->guest_RBX = (ULong)(_b); \
3271 st->guest_RCX = (ULong)(_c); \
3272 st->guest_RDX = (ULong)(_d); \
3275 UInt old_eax
= (UInt
)st
->guest_RAX
;
3276 UInt old_ecx
= (UInt
)st
->guest_RCX
;
3280 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3283 /* Don't advertise RDRAND support, bit 30 in ECX. */
3284 SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3287 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3290 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3294 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3295 0x0000003f, 0x00000000); break;
3296 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3297 0x0000003f, 0x00000000); break;
3298 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3299 0x000001ff, 0x00000000); break;
3300 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3301 0x00001fff, 0x00000006); break;
3302 default: SET_ABCD(0x00000000, 0x00000000,
3303 0x00000000, 0x00000000); break;
3307 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3310 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3314 case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3315 0x00000000, 0x00000000); break;
3316 default: SET_ABCD(0x00000000, 0x00000000,
3317 0x00000000, 0x00000000); break;
3321 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3324 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3327 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3331 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3332 0x00000100, 0x00000002); break;
3333 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3334 0x00000201, 0x00000002); break;
3335 default: SET_ABCD(0x00000000, 0x00000000,
3336 old_ecx
, 0x00000002); break;
3340 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3344 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3345 0x00000340, 0x00000000); break;
3346 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3347 0x00000000, 0x00000000); break;
3348 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3349 0x00000000, 0x00000000); break;
3350 default: SET_ABCD(0x00000000, 0x00000000,
3351 0x00000000, 0x00000000); break;
3355 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3358 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3361 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3364 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3367 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3370 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3373 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3376 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3379 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3382 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);

/*---------------------------------------------------------------*/
/*--- Misc integer helpers, including rotates and crypto.     ---*/
/*---------------------------------------------------------------*/

ULong amd64g_calculate_RCR ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   ULong cf=0, of=0, tempcf;

   switch (sz) {
      case 8:
         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         of        = ((arg >> 63) ^ cf) & 1;
         while (tempCOUNT > 0) {
            tempcf = arg & 1;
            arg    = (arg >> 1) | (cf << 63);
            cf     = tempcf;
            tempCOUNT--;
         }
         break;
      case 4:
         while (tempCOUNT >= 33) tempCOUNT -= 33;
         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         of        = ((arg >> 31) ^ cf) & 1;
         while (tempCOUNT > 0) {
            tempcf = arg & 1;
            arg    = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
            cf     = tempcf;
            tempCOUNT--;
         }
         break;
      case 2:
         while (tempCOUNT >= 17) tempCOUNT -= 17;
         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         of        = ((arg >> 15) ^ cf) & 1;
         while (tempCOUNT > 0) {
            tempcf = arg & 1;
            arg    = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
            cf     = tempcf;
            tempCOUNT--;
         }
         break;
      case 1:
         while (tempCOUNT >= 9) tempCOUNT -= 9;
         cf        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         of        = ((arg >> 7) ^ cf) & 1;
         while (tempCOUNT > 0) {
            tempcf = arg & 1;
            arg    = ((arg >> 1) & 0x7FULL) | (cf << 7);
            cf     = tempcf;
            tempCOUNT--;
         }
         break;
      default:
         vpanic("calculate_RCR(amd64g): invalid size");
   }

   cf &= 1;
   of &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);

   /* caller can ask to have back either the resulting flags or
      resulting value, but not both */
   return wantRflags ? rflags_in : arg;
}
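
/* The szIN convention used by both rotate-through-carry helpers: pass
   the operand size (1, 2, 4 or 8) to get the rotated value back, or
   its negation to get the updated %rflags instead.  A sketch of
   fetching both results for a 32-bit RCR (illustrative only; the front
   end builds the equivalent as separate IR calls): */
#if 0
static void example_rcr32 ( ULong val, ULong amt, ULong rflags )
{
   ULong rotated  = amd64g_calculate_RCR( val, amt, rflags,  4 );
   ULong newflags = amd64g_calculate_RCR( val, amt, rflags, -4 );
   vex_printf("result=%llx rflags=%llx\n", rotated, newflags);
}
#endif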

ULong amd64g_calculate_RCL ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong tempCOUNT  = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   ULong cf=0, of=0, tempcf;

   switch (sz) {
      case 8:
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 63) & 1;
            arg    = (arg << 1) | (cf & 1);
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 63) ^ cf) & 1;
         break;
      case 4:
         while (tempCOUNT >= 33) tempCOUNT -= 33;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 31) & 1;
            arg    = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 31) ^ cf) & 1;
         break;
      case 2:
         while (tempCOUNT >= 17) tempCOUNT -= 17;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 15) & 1;
            arg    = 0xFFFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 15) ^ cf) & 1;
         break;
      case 1:
         while (tempCOUNT >= 9) tempCOUNT -= 9;
         cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
         while (tempCOUNT > 0) {
            tempcf = (arg >> 7) & 1;
            arg    = 0xFFULL & ((arg << 1) | (cf & 1));
            cf     = tempcf;
            tempCOUNT--;
         }
         of = ((arg >> 7) ^ cf) & 1;
         break;
      default:
         vpanic("calculate_RCL(amd64g): invalid size");
   }

   cf &= 1;
   of &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);

   return wantRflags ? rflags_in : arg;
}
3527 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3528 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3530 ULong
amd64g_calculate_pclmul(ULong a
, ULong b
, ULong which
)
3532 ULong hi
, lo
, tmp
, A
[16];
3535 A
[2] = A
[1] << 1; A
[3] = A
[2] ^ a
;
3536 A
[4] = A
[2] << 1; A
[5] = A
[4] ^ a
;
3537 A
[6] = A
[3] << 1; A
[7] = A
[6] ^ a
;
3538 A
[8] = A
[4] << 1; A
[9] = A
[8] ^ a
;
3539 A
[10] = A
[5] << 1; A
[11] = A
[10] ^ a
;
3540 A
[12] = A
[6] << 1; A
[13] = A
[12] ^ a
;
3541 A
[14] = A
[7] << 1; A
[15] = A
[14] ^ a
;
3543 lo
= (A
[b
>> 60] << 4) ^ A
[(b
>> 56) & 15];
3545 lo
= (lo
<< 8) ^ (A
[(b
>> 52) & 15] << 4) ^ A
[(b
>> 48) & 15];
3546 hi
= (hi
<< 8) | (lo
>> 56);
3547 lo
= (lo
<< 8) ^ (A
[(b
>> 44) & 15] << 4) ^ A
[(b
>> 40) & 15];
3548 hi
= (hi
<< 8) | (lo
>> 56);
3549 lo
= (lo
<< 8) ^ (A
[(b
>> 36) & 15] << 4) ^ A
[(b
>> 32) & 15];
3550 hi
= (hi
<< 8) | (lo
>> 56);
3551 lo
= (lo
<< 8) ^ (A
[(b
>> 28) & 15] << 4) ^ A
[(b
>> 24) & 15];
3552 hi
= (hi
<< 8) | (lo
>> 56);
3553 lo
= (lo
<< 8) ^ (A
[(b
>> 20) & 15] << 4) ^ A
[(b
>> 16) & 15];
3554 hi
= (hi
<< 8) | (lo
>> 56);
3555 lo
= (lo
<< 8) ^ (A
[(b
>> 12) & 15] << 4) ^ A
[(b
>> 8) & 15];
3556 hi
= (hi
<< 8) | (lo
>> 56);
3557 lo
= (lo
<< 8) ^ (A
[(b
>> 4) & 15] << 4) ^ A
[b
& 15];
3561 tmp
= -((a
>> 63) & 1); tmp
&= ((b
& (m0
* 0xfe)) >> 1); hi
= hi
^ tmp
;
3562 tmp
= -((a
>> 62) & 1); tmp
&= ((b
& (m0
* 0xfc)) >> 2); hi
= hi
^ tmp
;
3563 tmp
= -((a
>> 61) & 1); tmp
&= ((b
& (m0
* 0xf8)) >> 3); hi
= hi
^ tmp
;
3564 tmp
= -((a
>> 60) & 1); tmp
&= ((b
& (m0
* 0xf0)) >> 4); hi
= hi
^ tmp
;
3565 tmp
= -((a
>> 59) & 1); tmp
&= ((b
& (m0
* 0xe0)) >> 5); hi
= hi
^ tmp
;
3566 tmp
= -((a
>> 58) & 1); tmp
&= ((b
& (m0
* 0xc0)) >> 6); hi
= hi
^ tmp
;
3567 tmp
= -((a
>> 57) & 1); tmp
&= ((b
& (m0
* 0x80)) >> 7); hi
= hi
^ tmp
;
   return which ? hi : lo;
}
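
/* For reference, the same 64 x 64 -> 128 carry-less product can be
   computed with a plain shift-and-xor loop.  It is much slower than
   the table-driven code above but useful as an independent check;
   sketch only, not part of the helper set: */
#if 0
static void example_clmul_reference ( ULong a, ULong b,
                                      ULong* hi, ULong* lo )
{
   ULong rHi = 0, rLo = 0;
   UInt  i;
   for (i = 0; i < 64; i++) {
      if ((b >> i) & 1) {
         rLo ^= a << i;
         if (i > 0) rHi ^= a >> (64 - i);
      }
   }
   *hi = rHi;
   *lo = rLo;
}
#endif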
3573 /* CALLED FROM GENERATED CODE */
3574 /* DIRTY HELPER (non-referentially-transparent) */
3575 /* Horrible hack. On non-amd64 platforms, return 1. */
3576 ULong
amd64g_dirtyhelper_RDTSC ( void )
3578 # if defined(__x86_64__)
3580 __asm__
__volatile__("rdtsc" : "=a" (eax
), "=d" (edx
));
3581 return (((ULong
)edx
) << 32) | ((ULong
)eax
);
3587 /* CALLED FROM GENERATED CODE */
3588 /* DIRTY HELPER (non-referentially-transparent) */
3589 /* Horrible hack. On non-amd64 platforms, return 1. */
3590 /* This uses a different calling convention from _RDTSC just above
3591 only because of the difficulty of returning 96 bits from a C
3592 function -- RDTSC returns 64 bits and so is simple by comparison,
3594 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State
* st
)
3596 # if defined(__x86_64__)
3598 __asm__
__volatile__("rdtscp" : "=a" (eax
), "=d" (edx
), "=c" (ecx
));
3599 st
->guest_RAX
= (ULong
)eax
;
3600 st
->guest_RCX
= (ULong
)ecx
;
3601 st
->guest_RDX
= (ULong
)edx
;
3607 /* CALLED FROM GENERATED CODE */
3608 /* DIRTY HELPER (non-referentially-transparent) */
3609 /* Horrible hack. On non-amd64 platforms, return 0. */
3610 ULong
amd64g_dirtyhelper_IN ( ULong portno
, ULong sz
/*1,2 or 4*/ )
3612 # if defined(__x86_64__)
3617 __asm__
__volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3618 : "=a" (r
) : "Nd" (portno
));
3621 __asm__
__volatile__("movq $0,%%rax; inw %w1,%w0"
3622 : "=a" (r
) : "Nd" (portno
));
3625 __asm__
__volatile__("movq $0,%%rax; inb %w1,%b0"
3626 : "=a" (r
) : "Nd" (portno
));
3629 break; /* note: no 64-bit version of insn exists */
3638 /* CALLED FROM GENERATED CODE */
3639 /* DIRTY HELPER (non-referentially-transparent) */
3640 /* Horrible hack. On non-amd64 platforms, do nothing. */
3641 void amd64g_dirtyhelper_OUT ( ULong portno
, ULong data
, ULong sz
/*1,2 or 4*/ )
3643 # if defined(__x86_64__)
3647 __asm__
__volatile__("movq %0,%%rax; outl %%eax, %w1"
3648 : : "a" (data
), "Nd" (portno
));
3651 __asm__
__volatile__("outw %w0, %w1"
3652 : : "a" (data
), "Nd" (portno
));
3655 __asm__
__volatile__("outb %b0, %w1"
3656 : : "a" (data
), "Nd" (portno
));
3659 break; /* note: no 64-bit version of insn exists */
3666 /* CALLED FROM GENERATED CODE */
3667 /* DIRTY HELPER (non-referentially-transparent) */
3668 /* Horrible hack. On non-amd64 platforms, do nothing. */
3669 /* op = 0: call the native SGDT instruction.
3670 op = 1: call the native SIDT instruction.
3672 void amd64g_dirtyhelper_SxDT ( void *address
, ULong op
) {
3673 # if defined(__x86_64__)
3676 __asm__
__volatile__("sgdt (%0)" : : "r" (address
) : "memory");
3679 __asm__
__volatile__("sidt (%0)" : : "r" (address
) : "memory");
3682 vpanic("amd64g_dirtyhelper_SxDT");
3686 UChar
* p
= (UChar
*)address
;
3687 p
[0] = p
[1] = p
[2] = p
[3] = p
[4] = p
[5] = 0;
3688 p
[6] = p
[7] = p
[8] = p
[9] = 0;
3692 /*---------------------------------------------------------------*/
3693 /*--- Helpers for MMX/SSE/SSE2. ---*/
3694 /*---------------------------------------------------------------*/
3696 static inline UChar
abdU8 ( UChar xx
, UChar yy
) {
3697 return toUChar(xx
>yy
? xx
-yy
: yy
-xx
);
3700 static inline ULong
mk32x2 ( UInt w1
, UInt w0
) {
3701 return (((ULong
)w1
) << 32) | ((ULong
)w0
);
3704 static inline UShort
sel16x4_3 ( ULong w64
) {
3705 UInt hi32
= toUInt(w64
>> 32);
3706 return toUShort(hi32
>> 16);
3708 static inline UShort
sel16x4_2 ( ULong w64
) {
3709 UInt hi32
= toUInt(w64
>> 32);
3710 return toUShort(hi32
);
3712 static inline UShort
sel16x4_1 ( ULong w64
) {
3713 UInt lo32
= toUInt(w64
);
3714 return toUShort(lo32
>> 16);
3716 static inline UShort
sel16x4_0 ( ULong w64
) {
3717 UInt lo32
= toUInt(w64
);
3718 return toUShort(lo32
);
3721 static inline UChar
sel8x8_7 ( ULong w64
) {
3722 UInt hi32
= toUInt(w64
>> 32);
3723 return toUChar(hi32
>> 24);
3725 static inline UChar
sel8x8_6 ( ULong w64
) {
3726 UInt hi32
= toUInt(w64
>> 32);
3727 return toUChar(hi32
>> 16);
3729 static inline UChar
sel8x8_5 ( ULong w64
) {
3730 UInt hi32
= toUInt(w64
>> 32);
3731 return toUChar(hi32
>> 8);
3733 static inline UChar
sel8x8_4 ( ULong w64
) {
3734 UInt hi32
= toUInt(w64
>> 32);
3735 return toUChar(hi32
>> 0);
3737 static inline UChar
sel8x8_3 ( ULong w64
) {
3738 UInt lo32
= toUInt(w64
);
3739 return toUChar(lo32
>> 24);
3741 static inline UChar
sel8x8_2 ( ULong w64
) {
3742 UInt lo32
= toUInt(w64
);
3743 return toUChar(lo32
>> 16);
3745 static inline UChar
sel8x8_1 ( ULong w64
) {
3746 UInt lo32
= toUInt(w64
);
3747 return toUChar(lo32
>> 8);
3749 static inline UChar
sel8x8_0 ( ULong w64
) {
3750 UInt lo32
= toUInt(w64
);
3751 return toUChar(lo32
>> 0);
3754 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3755 ULong
amd64g_calculate_mmx_pmaddwd ( ULong xx
, ULong yy
)
3759 (((Int
)(Short
)sel16x4_3(xx
)) * ((Int
)(Short
)sel16x4_3(yy
)))
3760 + (((Int
)(Short
)sel16x4_2(xx
)) * ((Int
)(Short
)sel16x4_2(yy
))),
3761 (((Int
)(Short
)sel16x4_1(xx
)) * ((Int
)(Short
)sel16x4_1(yy
)))
3762 + (((Int
)(Short
)sel16x4_0(xx
)) * ((Int
)(Short
)sel16x4_0(yy
)))
3766 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3767 ULong
amd64g_calculate_mmx_psadbw ( ULong xx
, ULong yy
)
3770 t
+= (UInt
)abdU8( sel8x8_7(xx
), sel8x8_7(yy
) );
3771 t
+= (UInt
)abdU8( sel8x8_6(xx
), sel8x8_6(yy
) );
3772 t
+= (UInt
)abdU8( sel8x8_5(xx
), sel8x8_5(yy
) );
3773 t
+= (UInt
)abdU8( sel8x8_4(xx
), sel8x8_4(yy
) );
3774 t
+= (UInt
)abdU8( sel8x8_3(xx
), sel8x8_3(yy
) );
3775 t
+= (UInt
)abdU8( sel8x8_2(xx
), sel8x8_2(yy
) );
3776 t
+= (UInt
)abdU8( sel8x8_1(xx
), sel8x8_1(yy
) );
3777 t
+= (UInt
)abdU8( sel8x8_0(xx
), sel8x8_0(yy
) );
3782 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3783 ULong
amd64g_calculate_sse_phminposuw ( ULong sLo
, ULong sHi
)
3787 t
= sel16x4_0(sLo
); if (True
) { min
= t
; idx
= 0; }
3788 t
= sel16x4_1(sLo
); if (t
< min
) { min
= t
; idx
= 1; }
3789 t
= sel16x4_2(sLo
); if (t
< min
) { min
= t
; idx
= 2; }
3790 t
= sel16x4_3(sLo
); if (t
< min
) { min
= t
; idx
= 3; }
3791 t
= sel16x4_0(sHi
); if (t
< min
) { min
= t
; idx
= 4; }
3792 t
= sel16x4_1(sHi
); if (t
< min
) { min
= t
; idx
= 5; }
3793 t
= sel16x4_2(sHi
); if (t
< min
) { min
= t
; idx
= 6; }
3794 t
= sel16x4_3(sHi
); if (t
< min
) { min
= t
; idx
= 7; }
3795 return ((ULong
)(idx
<< 16)) | ((ULong
)min
);
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
{
   UInt  i;
   ULong crc = (b & 0xFFULL) ^ crcIn;
   for (i = 0; i < 8; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
{
   UInt  i;
   ULong crc = (w & 0xFFFFULL) ^ crcIn;
   for (i = 0; i < 16; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
{
   UInt  i;
   ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
   for (i = 0; i < 32; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
   return crc;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
{
   ULong crc = amd64g_calc_crc32l(crcIn, q);
   return amd64g_calc_crc32l(crc, q >> 32);
}
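
/* 0x82f63b78 is the bit-reflected form of the CRC-32C (Castagnoli)
   polynomial 0x1EDC6F41, which is what the SSE4.2 CRC32 instruction
   uses.  A buffer can be folded through the byte helper one byte at a
   time; the wider helpers above just consume 2/4/8 bytes per step.
   Sketch only, not called from anywhere: */
#if 0
static ULong example_crc32c_buffer ( const UChar* buf, ULong len )
{
   ULong crc = 0;
   ULong i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b( crc, buf[i] );
   return crc;
}
#endif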
3836 /* .. helper for next fn .. */
3837 static inline ULong
sad_8x4 ( ULong xx
, ULong yy
)
3840 t
+= (UInt
)abdU8( sel8x8_3(xx
), sel8x8_3(yy
) );
3841 t
+= (UInt
)abdU8( sel8x8_2(xx
), sel8x8_2(yy
) );
3842 t
+= (UInt
)abdU8( sel8x8_1(xx
), sel8x8_1(yy
) );
3843 t
+= (UInt
)abdU8( sel8x8_0(xx
), sel8x8_0(yy
) );
3847 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3848 ULong
amd64g_calc_mpsadbw ( ULong sHi
, ULong sLo
,
3849 ULong dHi
, ULong dLo
,
3850 ULong imm_and_return_control_bit
)
3852 UInt imm8
= imm_and_return_control_bit
& 7;
3853 Bool calcHi
= (imm_and_return_control_bit
>> 7) & 1;
3854 UInt srcOffsL
= imm8
& 3; /* src offs in 32-bit (L) chunks */
3855 UInt dstOffsL
= (imm8
>> 2) & 1; /* dst offs in ditto chunks */
3856 /* For src we only need 32 bits, so get them into the
3857 lower half of a 64 bit word. */
3858 ULong src
= ((srcOffsL
& 2) ? sHi
: sLo
) >> (32 * (srcOffsL
& 1));
3859 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3860 11 bytes. If calculating the low part of the result, need bytes
3861 dstOffsL * 4 + (0 .. 6); if calculating the high part,
3862 dstOffsL * 4 + (4 .. 10). */
3864 /* dstOffL = 0, Lo -> 0 .. 6
3865 dstOffL = 1, Lo -> 4 .. 10
3866 dstOffL = 0, Hi -> 4 .. 10
3867 dstOffL = 1, Hi -> 8 .. 14
3869 if (calcHi
&& dstOffsL
) {
3871 dst
= dHi
& 0x00FFFFFFFFFFFFFFULL
;
3873 else if (!calcHi
&& !dstOffsL
) {
3875 dst
= dLo
& 0x00FFFFFFFFFFFFFFULL
;
3879 dst
= (dLo
>> 32) | ((dHi
& 0x00FFFFFFULL
) << 32);
3881 ULong r0
= sad_8x4( dst
>> 0, src
);
3882 ULong r1
= sad_8x4( dst
>> 8, src
);
3883 ULong r2
= sad_8x4( dst
>> 16, src
);
3884 ULong r3
= sad_8x4( dst
>> 24, src
);
3885 ULong res
= (r3
<< 48) | (r2
<< 32) | (r1
<< 16) | r0
;
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
{
   ULong dst = 0;
   ULong src_bit;
   ULong dst_bit = 1;
   for (src_bit = 1; src_bit; src_bit <<= 1) {
      if (mask & src_bit) {
         if (src_masked & src_bit) dst |= dst_bit;
         dst_bit <<= 1;
      }
   }
   return dst;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_pdep ( ULong src, ULong mask )
{
   ULong dst = 0;
   ULong dst_bit;
   ULong src_bit = 1;
   for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
      if (mask & dst_bit) {
         if (src & src_bit) dst |= dst_bit;
         src_bit <<= 1;
      }
   }
   return dst;
}
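
/* Worked example of the two helpers above with mask 0xF0F0, which
   selects bit positions 4..7 and 12..15: PEXT gathers the selected
   source bits down into the low bits, PDEP scatters low source bits
   back out to the mask positions.  Illustrative only: */
#if 0
static void example_pext_pdep ( void )
{
   /* 0x5070 has 0x7 in bits 4..7 and 0x5 in bits 12..15 */
   vassert(amd64g_calculate_pext( 0x5070ULL, 0xF0F0ULL ) == 0x57ULL);
   vassert(amd64g_calculate_pdep( 0x57ULL,   0xF0F0ULL ) == 0x5070ULL);
}
#endif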
3919 /*---------------------------------------------------------------*/
3920 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
3921 /*---------------------------------------------------------------*/
3923 static UInt
zmask_from_V128 ( V128
* arg
)
3926 for (i
= 0; i
< 16; i
++) {
3927 res
|= ((arg
->w8
[i
] == 0) ? 1 : 0) << i
;
3932 static UInt
zmask_from_V128_wide ( V128
* arg
)
3935 for (i
= 0; i
< 8; i
++) {
3936 res
|= ((arg
->w16
[i
] == 0) ? 1 : 0) << i
;
3941 /* Helps with PCMP{I,E}STR{I,M}.
3943 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
3944 actually it could be a clean helper, but for the fact that we can't
3945 pass by value 2 x V128 to a clean helper, nor have one returned.)
3946 Reads guest state, writes to guest state for the xSTRM cases, no
3947 accesses of memory, is a pure function.
3949 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3950 the callee knows which I/E and I/M variant it is dealing with and
3951 what the specific operation is. 4th byte of opcode is in the range
3958 gstOffL and gstOffR are the guest state offsets for the two XMM
3959 register inputs. We never have to deal with the memory case since
3960 that is handled by pre-loading the relevant value into the fake
3963 For ESTRx variants, edxIN and eaxIN hold the values of those two
3966 In all cases, the bottom 16 bits of the result contain the new
3967 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
3968 result hold the new %ecx value. For xSTRM variants, the helper
3969 writes the result directly to the guest XMM0.
3971 Declarable side effects: in all cases, reads guest state at
3972 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
   Is expected to be called with opc_and_imm combinations which have
   actually been validated, and will assert if otherwise.  The front
   end should ensure we're only called with verified values.
*/
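
/* How the packed return value is consumed, for reference: a hedged
   sketch of the unpacking a caller performs (the real consumer is IR
   built by the front end; the function and parameter names here are
   made up for the example): */
#if 0
static void example_unpack_pcmpxstrx_result ( ULong res, Bool isxSTRI )
{
   UInt new_OSZACP = (UInt)(res & 0xFFFF);  /* new %rflags OSZACP bits */
   if (isxSTRI) {
      UInt new_ECX = (UInt)((res >> 16) & 0xFFFF);
      vex_printf("ECX=%u flags=0x%x\n", new_ECX, new_OSZACP);
   }
   /* for the xSTRM variants the result vector has already been written
      to guest XMM0 by the helper itself */
}
#endif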
3979 ULong
amd64g_dirtyhelper_PCMPxSTRx (
3980 VexGuestAMD64State
* gst
,
3982 HWord gstOffL
, HWord gstOffR
,
3983 HWord edxIN
, HWord eaxIN
3986 HWord opc4
= (opc4_and_imm
>> 8) & 0xFF;
3987 HWord imm8
= opc4_and_imm
& 0xFF;
3988 HWord isISTRx
= opc4
& 2;
3989 HWord isxSTRM
= (opc4
& 1) ^ 1;
3990 vassert((opc4
& 0xFC) == 0x60); /* 0x60 .. 0x63 */
3991 HWord wide
= (imm8
& 1);
3993 // where the args are
3994 V128
* argL
= (V128
*)( ((UChar
*)gst
) + gstOffL
);
3995 V128
* argR
= (V128
*)( ((UChar
*)gst
) + gstOffR
);
3997 /* Create the arg validity masks, either from the vectors
3998 themselves or from the supplied edx/eax values. */
3999 // FIXME: this is only right for the 8-bit data cases.
4000 // At least that is asserted above.
4001 UInt zmaskL
, zmaskR
;
4003 // temp spot for the resulting flags and vector.
4007 // for checking whether case was handled
4012 zmaskL
= zmask_from_V128_wide(argL
);
4013 zmaskR
= zmask_from_V128_wide(argR
);
4016 tmp
= edxIN
& 0xFFFFFFFF;
4017 if (tmp
< -8) tmp
= -8;
4018 if (tmp
> 8) tmp
= 8;
4019 if (tmp
< 0) tmp
= -tmp
;
4020 vassert(tmp
>= 0 && tmp
<= 8);
4021 zmaskL
= (1 << tmp
) & 0xFF;
4022 tmp
= eaxIN
& 0xFFFFFFFF;
4023 if (tmp
< -8) tmp
= -8;
4024 if (tmp
> 8) tmp
= 8;
4025 if (tmp
< 0) tmp
= -tmp
;
4026 vassert(tmp
>= 0 && tmp
<= 8);
4027 zmaskR
= (1 << tmp
) & 0xFF;
4030 ok
= compute_PCMPxSTRx_wide (
4031 &resV
, &resOSZACP
, argL
, argR
,
4032 zmaskL
, zmaskR
, imm8
, (Bool
)isxSTRM
4036 zmaskL
= zmask_from_V128(argL
);
4037 zmaskR
= zmask_from_V128(argR
);
4040 tmp
= edxIN
& 0xFFFFFFFF;
4041 if (tmp
< -16) tmp
= -16;
4042 if (tmp
> 16) tmp
= 16;
4043 if (tmp
< 0) tmp
= -tmp
;
4044 vassert(tmp
>= 0 && tmp
<= 16);
4045 zmaskL
= (1 << tmp
) & 0xFFFF;
4046 tmp
= eaxIN
& 0xFFFFFFFF;
4047 if (tmp
< -16) tmp
= -16;
4048 if (tmp
> 16) tmp
= 16;
4049 if (tmp
< 0) tmp
= -tmp
;
4050 vassert(tmp
>= 0 && tmp
<= 16);
4051 zmaskR
= (1 << tmp
) & 0xFFFF;
4054 ok
= compute_PCMPxSTRx (
4055 &resV
, &resOSZACP
, argL
, argR
,
4056 zmaskL
, zmaskR
, imm8
, (Bool
)isxSTRM
4060 // front end shouldn't pass us any imm8 variants we can't
4064 // So, finally we need to get the results back to the caller.
4065 // In all cases, the new OSZACP value is the lowest 16 of
4066 // the return value.
4068 gst
->guest_YMM0
[0] = resV
.w32
[0];
4069 gst
->guest_YMM0
[1] = resV
.w32
[1];
4070 gst
->guest_YMM0
[2] = resV
.w32
[2];
4071 gst
->guest_YMM0
[3] = resV
.w32
[3];
4072 return resOSZACP
& 0x8D5;
4074 UInt newECX
= resV
.w32
[0] & 0xFFFF;
4075 return (newECX
<< 16) | (resOSZACP
& 0x8D5);
4079 /*---------------------------------------------------------------*/
4080 /*--- AES primitives and helpers ---*/
4081 /*---------------------------------------------------------------*/
4082 /* a 16 x 16 matrix */
4083 static const UChar sbox
[256] = { // row nr
4084 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4085 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4086 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4087 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4088 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4089 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4090 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4091 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4092 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4093 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4094 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4095 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4096 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4097 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4098 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4099 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4100 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4101 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4102 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4103 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4104 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4105 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4106 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4107 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4108 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4109 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4110 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4111 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4112 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4113 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4114 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4115 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4117 static void SubBytes (V128
* v
)
4121 for (i
= 0; i
< 16; i
++)
4122 r
.w8
[i
] = sbox
[v
->w8
[i
]];
4126 /* a 16 x 16 matrix */
4127 static const UChar invsbox
[256] = { // row nr
4128 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4129 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4130 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4131 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4132 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4133 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4134 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4135 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4136 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4137 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4138 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4139 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4140 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4141 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4142 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4143 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4144 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4145 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4146 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4147 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4148 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4149 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4150 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4151 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4152 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4153 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4154 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4155 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4156 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4157 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4158 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4159 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
static void InvSubBytes (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = invsbox[v->w8[i]];
   *v = r;
}

static const UChar ShiftRows_op[16] =
   {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
static void ShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[ShiftRows_op[15-i]];
   *v = r;
}

static const UChar InvShiftRows_op[16] =
   {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
static void InvShiftRows (V128* v)
{
   V128 r;
   UInt i;
   for (i = 0; i < 16; i++)
      r.w8[i] = v->w8[InvShiftRows_op[15-i]];
   *v = r;
}

/* Multiplication of the finite field elements of AES.
   See "A Specification for The AES Algorithm Rijndael
   (by Joan Daemen & Vincent Rijmen)",
   Dr. Brian Gladman, v3.1, 3rd March 2001. */
/* N values so that (hex) xy = 0x03^N.
   0x00 cannot be used; we put 0xff for this value. */
/* a 16 x 16 matrix */
static const UChar Nxy[256] = { // row nr
   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
};

/* E values so that E = 0x03^xy. */
static const UChar Exy[256] = { // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};

static inline UChar ff_mul (UChar u1, UChar u2)
{
   if ((u1 > 0) && (u2 > 0)) {
      /* Multiply via the log/antilog tables: add the logs and wrap
         the sum modulo 255 before looking up the antilog. */
      UInt ui = Nxy[u1] + Nxy[u2];
      if (ui >= 255)
         ui = ui - 255;
      return Exy[ui];
   } else {
      return 0;
   }
}

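/* Worked example of the table-driven multiply (for illustration
   only; nothing below depends on it): to compute 0x02 * 0x03 in
   GF(2^8), add the logs Nxy[0x02] = 0x19 and Nxy[0x03] = 0x01 to
   get 0x1a, then read the antilog Exy[0x1a] = 0x06, so
   ff_mul(0x02, 0x03) == 0x06.  If the log sum reaches 255 it wraps:
   for ff_mul(0x03, 0xf6), 0x01 + 0xfe == 255 wraps to 0 and
   Exy[0x00] == 0x01, i.e. 0xf6 is the multiplicative inverse of
   0x03. */
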
static void MixColumns (V128* v)
{
   V128 r;
   UInt j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
                  ^ P(v,j,2) ^ P(v,j,3);
      P(&r,j,1) = P(v,j,0) ^ ff_mul(0x02, P(v,j,1))
                  ^ ff_mul(0x03, P(v,j,2)) ^ P(v,j,3);
      P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul(0x02, P(v,j,2))
                  ^ ff_mul(0x03, P(v,j,3));
      P(&r,j,3) = ff_mul(0x03, P(v,j,0)) ^ P(v,j,1) ^ P(v,j,2)
                  ^ ff_mul(0x02, P(v,j,3));
   }
   *v = r;
#undef P
}

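/* For reference: each loop iteration above multiplies one state
   column (P(v,j,0) .. P(v,j,3)) by the standard AES MixColumns
   circulant matrix over GF(2^8):
      | 02 03 01 01 |
      | 01 02 03 01 |
      | 01 01 02 03 |
      | 03 01 01 02 |
   which is exactly the pattern of ff_mul and XOR terms used. */
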
static void InvMixColumns (V128* v)
{
   V128 r;
   UInt j;
#define P(x,row,col) (x)->w8[((row)*4+(col))]
   for (j = 0; j < 4; j++) {
      P(&r,j,0) = ff_mul(0x0e, P(v,j,0)) ^ ff_mul(0x0b, P(v,j,1))
                  ^ ff_mul(0x0d, P(v,j,2)) ^ ff_mul(0x09, P(v,j,3));
      P(&r,j,1) = ff_mul(0x09, P(v,j,0)) ^ ff_mul(0x0e, P(v,j,1))
                  ^ ff_mul(0x0b, P(v,j,2)) ^ ff_mul(0x0d, P(v,j,3));
      P(&r,j,2) = ff_mul(0x0d, P(v,j,0)) ^ ff_mul(0x09, P(v,j,1))
                  ^ ff_mul(0x0e, P(v,j,2)) ^ ff_mul(0x0b, P(v,j,3));
      P(&r,j,3) = ff_mul(0x0b, P(v,j,0)) ^ ff_mul(0x0d, P(v,j,1))
                  ^ ff_mul(0x09, P(v,j,2)) ^ ff_mul(0x0e, P(v,j,3));
   }
   *v = r;
#undef P
}

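/* Likewise, InvMixColumns multiplies each column by the inverse
   circulant matrix
      | 0e 0b 0d 09 |
      | 09 0e 0b 0d |
      | 0d 09 0e 0b |
      | 0b 0d 09 0e |
   undoing the MixColumns transform above. */
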
/* For description, see definition in guest_amd64_defs.h */
void amd64g_dirtyhelper_AES (
          VexGuestAMD64State* gst,
          HWord opc4, HWord gstOffD,
          HWord gstOffL, HWord gstOffR )
{
   // where the args are
   V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
   V128  r;

   switch (opc4) {
      case 0xDC: /* AESENC */
      case 0xDD: /* AESENCLAST */
         r = *argR;
         ShiftRows (&r);
         SubBytes  (&r);
         if (opc4 == 0xDC)
            MixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDE: /* AESDEC */
      case 0xDF: /* AESDECLAST */
         r = *argR;
         InvShiftRows (&r);
         InvSubBytes  (&r);
         if (opc4 == 0xDE)
            InvMixColumns (&r);
         argD->w64[0] = r.w64[0] ^ argL->w64[0];
         argD->w64[1] = r.w64[1] ^ argL->w64[1];
         break;

      case 0xDB: /* AESIMC */
         *argD = *argL;
         InvMixColumns (argD);
         break;

      default: vassert(0);
   }
}

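/* Note on the cases above: architecturally, AESENC (0xDC) computes
      dst = MixColumns(SubBytes(ShiftRows(state))) ^ roundkey
   and AESENCLAST (0xDD) omits the MixColumns step; AESDEC/AESDECLAST
   mirror this with the inverse transforms.  Here the state arrives
   via argR and the round key via argL.  Applying ShiftRows before
   SubBytes, as done here, is equivalent, because SubBytes is a
   bytewise substitution and ShiftRows only permutes byte
   positions. */
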
static inline UInt RotWord (UInt w32)
{
   return ((w32 >> 8) | (w32 << 24));
}

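/* RotWord rotates the 32-bit word right by 8 bits, e.g.
   RotWord(0x12345678) == 0x78123456.  Viewing the word as four
   little-endian bytes (a0,a1,a2,a3), this is the key-schedule byte
   rotation (a0,a1,a2,a3) -> (a1,a2,a3,a0). */
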
static inline UInt SubWord (UInt w32)
{
   UChar* w8;
   UChar* r8;
   UInt   res;
   w8 = (UChar*) &w32;
   r8 = (UChar*) &res;
   r8[0] = sbox[w8[0]];
   r8[1] = sbox[w8[1]];
   r8[2] = sbox[w8[2]];
   r8[3] = sbox[w8[3]];
   return res;
}

/* For description, see definition in guest_amd64_defs.h */
extern void amd64g_dirtyhelper_AESKEYGENASSIST (
          VexGuestAMD64State* gst,
          HWord imm8,
          HWord gstOffL, HWord gstOffR )
{
   // where the args are
   V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
   V128* argR = (V128*)( ((UChar*)gst) + gstOffR );

   // We have to create the result in a temporary in the
   // case where the src and dst regs are the same.  See #341698.
   V128 tmp;

   tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
   tmp.w32[2] = SubWord (argL->w32[3]);
   tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
   tmp.w32[0] = SubWord (argL->w32[1]);

   argR->w32[3] = tmp.w32[3];
   argR->w32[2] = tmp.w32[2];
   argR->w32[1] = tmp.w32[1];
   argR->w32[0] = tmp.w32[0];
}

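/* Summary of the computation above, writing X1 for source dword 1
   and X3 for source dword 3 (this matches the documented behaviour
   of AESKEYGENASSIST, with imm8 playing the role of RCON):
      result dword 0 = SubWord(X1)
      result dword 1 = RotWord(SubWord(X1)) ^ imm8
      result dword 2 = SubWord(X3)
      result dword 3 = RotWord(SubWord(X3)) ^ imm8 */
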
/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Initialise the entire amd64 guest state. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
{
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;
   vex_state->pad0 = 0;

   vex_state->guest_RAX = 0;
   vex_state->guest_RCX = 0;
   vex_state->guest_RDX = 0;
   vex_state->guest_RBX = 0;
   vex_state->guest_RSP = 0;
   vex_state->guest_RBP = 0;
   vex_state->guest_RSI = 0;
   vex_state->guest_RDI = 0;
   vex_state->guest_R8  = 0;
   vex_state->guest_R9  = 0;
   vex_state->guest_R10 = 0;
   vex_state->guest_R11 = 0;
   vex_state->guest_R12 = 0;
   vex_state->guest_R13 = 0;
   vex_state->guest_R14 = 0;
   vex_state->guest_R15 = 0;

   vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;

   vex_state->guest_DFLAG   = 1; /* forwards */
   vex_state->guest_IDFLAG  = 0;
   vex_state->guest_ACFLAG  = 0;

   /* HACK: represent the offset associated with a constant %fs.
      Typically, on linux, this assumes that %fs is only ever zero
      (main thread). */
   vex_state->guest_FS_CONST = 0;

   vex_state->guest_RIP = 0;

   /* Initialise the simulated FPU */
   amd64g_dirtyhelper_FINIT( vex_state );

   /* Initialise the AVX state. */
#  define AVXZERO(_ymm) \
      do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
           _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
      } while (0)
   vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
   AVXZERO(vex_state->guest_YMM0);
   AVXZERO(vex_state->guest_YMM1);
   AVXZERO(vex_state->guest_YMM2);
   AVXZERO(vex_state->guest_YMM3);
   AVXZERO(vex_state->guest_YMM4);
   AVXZERO(vex_state->guest_YMM5);
   AVXZERO(vex_state->guest_YMM6);
   AVXZERO(vex_state->guest_YMM7);
   AVXZERO(vex_state->guest_YMM8);
   AVXZERO(vex_state->guest_YMM9);
   AVXZERO(vex_state->guest_YMM10);
   AVXZERO(vex_state->guest_YMM11);
   AVXZERO(vex_state->guest_YMM12);
   AVXZERO(vex_state->guest_YMM13);
   AVXZERO(vex_state->guest_YMM14);
   AVXZERO(vex_state->guest_YMM15);
   AVXZERO(vex_state->guest_YMM16);
#  undef AVXZERO

   vex_state->guest_EMNOTE = EmNote_NONE;

   /* These should not ever be either read or written, but we
      initialise them anyway. */
   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   vex_state->guest_NRADDR   = 0;
   vex_state->guest_SC_CLASS = 0;
   vex_state->guest_GS_CONST = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->pad1 = 0;
}

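/* Minimal usage sketch for a LibVEX client (illustrative only; the
   variable name is hypothetical):

      VexGuestAMD64State gst;
      LibVEX_GuestAMD64_initialise(&gst);
      // all integer registers are now zero, DFLAG is 1 (forwards),
      // the simulated FPU has been reset via
      // amd64g_dirtyhelper_FINIT, and SSE rounding is
      // round-to-nearest.
*/
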
/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest %RSP, %RBP and %RIP
   only.  These are the minimum needed to extract correct stack
   backtraces from amd64 code.

   Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
*/
Bool guest_amd64_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
   Int rbp_max = rbp_min + 8 - 1;
   Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
   Int rsp_max = rsp_min + 8 - 1;
   Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
   Int rip_max = rip_min + 8 - 1;

   if (maxoff < rsp_min || minoff > rsp_max) {
      /* no overlap with rsp */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False; // We only need to check stack pointer.
   } else {
      return True;
   }

   if (maxoff < rbp_min || minoff > rbp_max) {
      /* no overlap with rbp */
   } else {
      return True;
   }

   if (maxoff < rip_min || minoff > rip_max) {
      /* no overlap with rip */
   } else {
      return True;
   }

   return False;
}

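/* Example of the default policy (illustrative): an offset range
   covering only guest_RAX overlaps none of %RSP, %RBP or %RIP, so
   the function returns False for it, whereas any range touching
   guest_RSP returns True.  In VexRegUpdSpAtMemAccess mode, %RSP is
   the only register that forces a True result. */
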
#define ALWAYSDEFD(field)                             \
    { offsetof(VexGuestAMD64State, field),            \
      (sizeof ((VexGuestAMD64State*)0)->field) }

VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_CONST),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /*   */ ALWAYSDEFD(guest_CS),
                 // /*   */ ALWAYSDEFD(guest_DS),
                 // /*   */ ALWAYSDEFD(guest_ES),
                 // /*   */ ALWAYSDEFD(guest_FS),
                 // /*   */ ALWAYSDEFD(guest_GS),
                 // /*   */ ALWAYSDEFD(guest_SS),
                 // /*   */ ALWAYSDEFD(guest_LDT),
                 // /*   */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_CMSTART),
                 /* 13 */ ALWAYSDEFD(guest_CMLEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };

/*---------------------------------------------------------------*/
/*--- end                               guest_amd64_helpers.c ---*/
/*---------------------------------------------------------------*/