2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
28 The GNU General Public License is contained in the file COPYING.
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
36 #include "libvex_basictypes.h"
37 #include "libvex_emnote.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
42 #include "main_util.h"
43 #include "main_globals.h"
44 #include "guest_generic_bb_to_IR.h"
45 #include "guest_amd64_defs.h"
46 #include "guest_generic_x87.h"
49 /* This file contains helper functions for amd64 guest code.
50 Calls to these functions are generated by the back end.
51 These calls are of course in the host machine code and
52 this file will be compiled to host machine code, so that
53 all makes sense.
55 Only change the signatures of these helper functions very
56 carefully. If you change the signature here, you'll have to change
57 the parameters passed to it in the IR calls constructed by
58 guest-amd64/toIR.c.
60 The convention used is that all functions called from generated
61 code are named amd64g_<something>, and any function whose name lacks
62 that prefix is not called from generated code. Note that some
63 LibVEX_* functions can however be called by VEX's client, but that
64 is not the same as calling them from VEX-generated code.
68 /* Set to 1 to get detailed profiling info about use of the flag
69 machinery. */
70 #define PROFILE_RFLAGS 0
73 /*---------------------------------------------------------------*/
74 /*--- %rflags run-time helpers. ---*/
75 /*---------------------------------------------------------------*/
77 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78 after imulq/mulq. */
80 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
82 const Long halfMask = 0xFFFFFFFFLL;
83 ULong u0, v0, w0;
84 Long u1, v1, w1, w2, t;
85 u0 = u & halfMask;
86 u1 = u >> 32;
87 v0 = v & halfMask;
88 v1 = v >> 32;
89 w0 = u0 * v0;
90 t = u1 * v0 + (w0 >> 32);
91 w1 = t & halfMask;
92 w2 = t >> 32;
93 w1 = u0 * v1 + w1;
94 *rHi = u1 * v1 + w2 + (w1 >> 32);
95 *rLo = (Long)((ULong)u * (ULong)v);
98 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
100 const ULong halfMask = 0xFFFFFFFFULL;
101 ULong u0, v0, w0;
102 ULong u1, v1, w1, w2, t;
103 u0 = u & halfMask;
104 u1 = u >> 32;
105 v0 = v & halfMask;
106 v1 = v >> 32;
107 w0 = u0 * v0;
108 t = u1 * v0 + (w0 >> 32);
109 w1 = t & halfMask;
110 w2 = t >> 32;
111 w1 = u0 * v1 + w1;
112 *rHi = u1 * v1 + w2 + (w1 >> 32);
113 *rLo = u * v;
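/* Illustrative sketch only (editorial addition, not part of the original
   source): on hosts whose compiler provides unsigned __int128, the
   splitting scheme used by mullU64 above can be cross-checked against a
   native 128-bit multiply. */
#if 0
static void check_mullU64_example ( ULong u, ULong v )
{
   ULong hi, lo;
   mullU64(u, v, &hi, &lo);
   /* Reference result computed with the compiler's 128-bit arithmetic. */
   unsigned __int128 ref = (unsigned __int128)u * (unsigned __int128)v;
   vassert(lo == (ULong)ref);
   vassert(hi == (ULong)(ref >> 64));
}
#endif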
117 static const UChar parity_table[256] = {
118 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
124 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
126 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
140 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
142 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
148 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
149 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
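/* Illustrative sketch only (editorial addition, not part of the original
   source): the table above encodes the x86 parity flag, which is set when
   the low 8 bits of a result contain an even number of 1 bits.  It could
   be regenerated along the following lines; regen_parity_table_example is
   a hypothetical helper, not used anywhere in VEX. */
#if 0
static void regen_parity_table_example ( UChar* tab /* 256 entries */ )
{
   UInt b;
   for (b = 0; b < 256; b++) {
      UInt x = b, ones = 0;
      while (x) { ones += x & 1; x >>= 1; }
      /* PF is set when the byte has even parity. */
      tab[b] = (ones % 2 == 0) ? AMD64G_CC_MASK_P : 0;
   }
}
#endif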
152 /* generalised left-shifter */
153 static inline Long lshift ( Long x, Int n )
155 if (n >= 0)
156 return (ULong)x << n;
157 else
158 return x >> (-n);
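/* Worked example (editorial note, not in the original source): because
   lshift accepts a negative count, the SF computation
   "lshift(res, 8 - DATA_BITS) & 0x80" used by the ACTIONS_* macros below
   moves the sign bit of any operand width into bit 7:
      DATA_BITS ==  8:  shift by   0, bit  7 is already the sign bit
      DATA_BITS == 32:  shift by -24, i.e. >> 24, bit 31 lands in bit 7
      DATA_BITS == 64:  shift by -56, i.e. >> 56, bit 63 lands in bit 7  */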
161 /* identity on ULong */
162 static inline ULong idULong ( ULong x )
164 return x;
168 #define PREAMBLE(__data_bits) \
169 /* const */ ULong DATA_MASK \
170 = __data_bits==8 \
171 ? 0xFFULL \
172 : (__data_bits==16 \
173 ? 0xFFFFULL \
174 : (__data_bits==32 \
175 ? 0xFFFFFFFFULL \
176 : 0xFFFFFFFFFFFFFFFFULL)); \
177 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
178 /* const */ ULong CC_DEP1 = cc_dep1_formal; \
179 /* const */ ULong CC_DEP2 = cc_dep2_formal; \
180 /* const */ ULong CC_NDEP = cc_ndep_formal; \
181 /* Four bogus assignments, which hopefully gcc can */ \
182 /* optimise away, and which stop it complaining about */ \
183 /* unused variables. */ \
184 SIGN_MASK = SIGN_MASK; \
185 DATA_MASK = DATA_MASK; \
186 CC_DEP2 = CC_DEP2; \
187 CC_NDEP = CC_NDEP;
190 /*-------------------------------------------------------------*/
192 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
194 PREAMBLE(DATA_BITS); \
195 { ULong cf, pf, af, zf, sf, of; \
196 ULong argL, argR, res; \
197 argL = CC_DEP1; \
198 argR = CC_DEP2; \
199 res = argL + argR; \
200 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
201 pf = parity_table[(UChar)res]; \
202 af = (res ^ argL ^ argR) & 0x10; \
203 zf = ((DATA_UTYPE)res == 0) << 6; \
204 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
205 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
206 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
207 return cf | pf | af | zf | sf | of; \
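/* Worked example (editorial note, not in the original source): for an
   8-bit add with CC_DEP1 = 0xFF and CC_DEP2 = 0x01, ACTIONS_ADD(8,UChar)
   computes res = 0x100, giving CF = 1 (the truncated result 0x00 is <u
   0xFF), PF set (0x00 has even parity), AF set, ZF set, SF = 0 and OF = 0;
   the helper therefore returns AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
   | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z. */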
211 /*-------------------------------------------------------------*/
213 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
215 PREAMBLE(DATA_BITS); \
216 { ULong cf, pf, af, zf, sf, of; \
217 ULong argL, argR, res; \
218 argL = CC_DEP1; \
219 argR = CC_DEP2; \
220 res = argL - argR; \
221 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
222 pf = parity_table[(UChar)res]; \
223 af = (res ^ argL ^ argR) & 0x10; \
224 zf = ((DATA_UTYPE)res == 0) << 6; \
225 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
226 of = lshift((argL ^ argR) & (argL ^ res), \
227 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
228 return cf | pf | af | zf | sf | of; \
232 /*-------------------------------------------------------------*/
234 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
236 PREAMBLE(DATA_BITS); \
237 { ULong cf, pf, af, zf, sf, of; \
238 ULong argL, argR, oldC, res; \
239 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
240 argL = CC_DEP1; \
241 argR = CC_DEP2 ^ oldC; \
242 res = (argL + argR) + oldC; \
243 if (oldC) \
244 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
245 else \
246 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
247 pf = parity_table[(UChar)res]; \
248 af = (res ^ argL ^ argR) & 0x10; \
249 zf = ((DATA_UTYPE)res == 0) << 6; \
250 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
251 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
252 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
253 return cf | pf | af | zf | sf | of; \
257 /*-------------------------------------------------------------*/
259 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
261 PREAMBLE(DATA_BITS); \
262 { ULong cf, pf, af, zf, sf, of; \
263 ULong argL, argR, oldC, res; \
264 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
265 argL = CC_DEP1; \
266 argR = CC_DEP2 ^ oldC; \
267 res = (argL - argR) - oldC; \
268 if (oldC) \
269 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
270 else \
271 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
272 pf = parity_table[(UChar)res]; \
273 af = (res ^ argL ^ argR) & 0x10; \
274 zf = ((DATA_UTYPE)res == 0) << 6; \
275 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
276 of = lshift((argL ^ argR) & (argL ^ res), \
277 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
278 return cf | pf | af | zf | sf | of; \
282 /*-------------------------------------------------------------*/
284 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
286 PREAMBLE(DATA_BITS); \
287 { ULong cf, pf, af, zf, sf, of; \
288 cf = 0; \
289 pf = parity_table[(UChar)CC_DEP1]; \
290 af = 0; \
291 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
292 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
293 of = 0; \
294 return cf | pf | af | zf | sf | of; \
298 /*-------------------------------------------------------------*/
300 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
302 PREAMBLE(DATA_BITS); \
303 { ULong cf, pf, af, zf, sf, of; \
304 ULong argL, argR, res; \
305 res = CC_DEP1; \
306 argL = res - 1; \
307 argR = 1; \
308 cf = CC_NDEP & AMD64G_CC_MASK_C; \
309 pf = parity_table[(UChar)res]; \
310 af = (res ^ argL ^ argR) & 0x10; \
311 zf = ((DATA_UTYPE)res == 0) << 6; \
312 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
313 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
314 return cf | pf | af | zf | sf | of; \
318 /*-------------------------------------------------------------*/
320 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
322 PREAMBLE(DATA_BITS); \
323 { ULong cf, pf, af, zf, sf, of; \
324 ULong argL, argR, res; \
325 res = CC_DEP1; \
326 argL = res + 1; \
327 argR = 1; \
328 cf = CC_NDEP & AMD64G_CC_MASK_C; \
329 pf = parity_table[(UChar)res]; \
330 af = (res ^ argL ^ argR) & 0x10; \
331 zf = ((DATA_UTYPE)res == 0) << 6; \
332 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
333 of = ((res & DATA_MASK) \
334 == ((ULong)SIGN_MASK - 1)) << 11; \
335 return cf | pf | af | zf | sf | of; \
339 /*-------------------------------------------------------------*/
341 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
343 PREAMBLE(DATA_BITS); \
344 { ULong cf, pf, af, zf, sf, of; \
345 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
346 pf = parity_table[(UChar)CC_DEP1]; \
347 af = 0; /* undefined */ \
348 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
349 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
350 /* of is defined if shift count == 1 */ \
351 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
352 & AMD64G_CC_MASK_O; \
353 return cf | pf | af | zf | sf | of; \
357 /*-------------------------------------------------------------*/
359 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
361 PREAMBLE(DATA_BITS); \
362 { ULong cf, pf, af, zf, sf, of; \
363 cf = CC_DEP2 & 1; \
364 pf = parity_table[(UChar)CC_DEP1]; \
365 af = 0; /* undefined */ \
366 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
367 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
368 /* of is defined if shift count == 1 */ \
369 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
370 & AMD64G_CC_MASK_O; \
371 return cf | pf | af | zf | sf | of; \
375 /*-------------------------------------------------------------*/
377 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
378 /* DEP1 = result, NDEP = old flags */
379 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
381 PREAMBLE(DATA_BITS); \
382 { ULong fl \
383 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
384 | (AMD64G_CC_MASK_C & CC_DEP1) \
385 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
386 11-(DATA_BITS-1)) \
387 ^ lshift(CC_DEP1, 11))); \
388 return fl; \
392 /*-------------------------------------------------------------*/
394 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
395 /* DEP1 = result, NDEP = old flags */
396 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
398 PREAMBLE(DATA_BITS); \
399 { ULong fl \
400 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
401 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
402 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
403 11-(DATA_BITS-1)) \
404 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
405 return fl; \
409 /*-------------------------------------------------------------*/
411 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
412 DATA_U2TYPE, NARROWto2U) \
414 PREAMBLE(DATA_BITS); \
415 { ULong cf, pf, af, zf, sf, of; \
416 DATA_UTYPE hi; \
417 DATA_UTYPE lo \
418 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
419 * ((DATA_UTYPE)CC_DEP2) ); \
420 DATA_U2TYPE rr \
421 = NARROWto2U( \
422 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
423 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
424 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
425 cf = (hi != 0); \
426 pf = parity_table[(UChar)lo]; \
427 af = 0; /* undefined */ \
428 zf = (lo == 0) << 6; \
429 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
430 of = cf << 11; \
431 return cf | pf | af | zf | sf | of; \
435 /*-------------------------------------------------------------*/
437 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
438 DATA_S2TYPE, NARROWto2S) \
440 PREAMBLE(DATA_BITS); \
441 { ULong cf, pf, af, zf, sf, of; \
442 DATA_STYPE hi; \
443 DATA_STYPE lo \
444 = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1) \
445 * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) ); \
446 DATA_S2TYPE rr \
447 = NARROWto2S( \
448 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
449 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
450 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
451 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
452 pf = parity_table[(UChar)lo]; \
453 af = 0; /* undefined */ \
454 zf = (lo == 0) << 6; \
455 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
456 of = cf << 11; \
457 return cf | pf | af | zf | sf | of; \
461 /*-------------------------------------------------------------*/
463 #define ACTIONS_UMULQ \
465 PREAMBLE(64); \
466 { ULong cf, pf, af, zf, sf, of; \
467 ULong lo, hi; \
468 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
469 cf = (hi != 0); \
470 pf = parity_table[(UChar)lo]; \
471 af = 0; /* undefined */ \
472 zf = (lo == 0) << 6; \
473 sf = lshift(lo, 8 - 64) & 0x80; \
474 of = cf << 11; \
475 return cf | pf | af | zf | sf | of; \
479 /*-------------------------------------------------------------*/
481 #define ACTIONS_SMULQ \
483 PREAMBLE(64); \
484 { ULong cf, pf, af, zf, sf, of; \
485 Long lo, hi; \
486 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
487 cf = (hi != (lo >>/*s*/ (64-1))); \
488 pf = parity_table[(UChar)lo]; \
489 af = 0; /* undefined */ \
490 zf = (lo == 0) << 6; \
491 sf = lshift(lo, 8 - 64) & 0x80; \
492 of = cf << 11; \
493 return cf | pf | af | zf | sf | of; \
497 /*-------------------------------------------------------------*/
499 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
501 PREAMBLE(DATA_BITS); \
502 { ULong cf, pf, af, zf, sf, of; \
503 cf = 0; \
504 pf = 0; \
505 af = 0; \
506 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
507 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
508 of = 0; \
509 return cf | pf | af | zf | sf | of; \
513 /*-------------------------------------------------------------*/
515 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
517 PREAMBLE(DATA_BITS); \
518 { ULong cf, pf, af, zf, sf, of; \
519 cf = ((DATA_UTYPE)CC_DEP2 != 0); \
520 pf = 0; \
521 af = 0; \
522 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
523 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
524 of = 0; \
525 return cf | pf | af | zf | sf | of; \
529 /*-------------------------------------------------------------*/
531 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
533 PREAMBLE(DATA_BITS); \
534 { Long cf, pf, af, zf, sf, of; \
535 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
536 pf = 0; \
537 af = 0; \
538 zf = 0; \
539 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
540 of = 0; \
541 return cf | pf | af | zf | sf | of; \
545 /*-------------------------------------------------------------*/
547 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
549 PREAMBLE(DATA_BITS); \
550 { ULong cf, pf, af, zf, sf, of; \
551 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
552 pf = 0; \
553 af = 0; \
554 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
555 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
556 of = 0; \
557 return cf | pf | af | zf | sf | of; \
561 /*-------------------------------------------------------------*/
563 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME) \
565 PREAMBLE(DATA_BITS); \
566 { ULong ocf; /* o or c */ \
567 ULong argL, argR, oldOC, res; \
568 oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1; \
569 argL = CC_DEP1; \
570 argR = CC_DEP2 ^ oldOC; \
571 res = (argL + argR) + oldOC; \
572 if (oldOC) \
573 ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
574 else \
575 ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
576 return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME) \
577 | (ocf << AMD64G_CC_SHIFT_##FLAGNAME); \
581 /*-------------------------------------------------------------*/
584 #if PROFILE_RFLAGS
586 static Bool initted = False;
588 /* C flag, fast route */
589 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
590 /* C flag, slow route */
591 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
592 /* table for calculate_cond */
593 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
594 /* total entry counts for calc_all, calc_c, calc_cond. */
595 static UInt n_calc_all = 0;
596 static UInt n_calc_c = 0;
597 static UInt n_calc_cond = 0;
599 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
602 static void showCounts ( void )
604 Int op, co;
605 HChar ch;
606 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
607 n_calc_all, n_calc_cond, n_calc_c);
609 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
610 " S NS P NP L NL LE NLE\n");
611 vex_printf(" -----------------------------------------------------"
612 "----------------------------------------\n");
613 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
615 ch = ' ';
616 if (op > 0 && (op-1) % 4 == 0)
617 ch = 'B';
618 if (op > 0 && (op-1) % 4 == 1)
619 ch = 'W';
620 if (op > 0 && (op-1) % 4 == 2)
621 ch = 'L';
622 if (op > 0 && (op-1) % 4 == 3)
623 ch = 'Q';
625 vex_printf("%2d%c: ", op, ch);
626 vex_printf("%6u ", tabc_slow[op]);
627 vex_printf("%6u ", tabc_fast[op]);
628 for (co = 0; co < 16; co++) {
629 Int n = tab_cond[op][co];
630 if (n >= 1000) {
631 vex_printf(" %3dK", n / 1000);
632 } else
633 if (n >= 0) {
634 vex_printf(" %3d ", n );
635 } else {
636 vex_printf(" ");
639 vex_printf("\n");
641 vex_printf("\n");
644 static void initCounts ( void )
646 Int op, co;
647 initted = True;
648 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
649 tabc_fast[op] = tabc_slow[op] = 0;
650 for (co = 0; co < 16; co++)
651 tab_cond[op][co] = 0;
655 #endif /* PROFILE_RFLAGS */
658 /* Calculate all 6 flags from the supplied thunk parameters.
659 Worker function; it is not called directly from generated code, but
660 the CLEAN HELPER wrappers below are. */
661 static
662 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
663 ULong cc_dep1_formal,
664 ULong cc_dep2_formal,
665 ULong cc_ndep_formal )
667 switch (cc_op) {
668 case AMD64G_CC_OP_COPY:
669 return cc_dep1_formal
670 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
671 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
673 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
674 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
675 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
676 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
678 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
679 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
680 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
681 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
683 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
684 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
685 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
686 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
688 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
689 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
690 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
691 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
693 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
694 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
695 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
696 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
698 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
699 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
700 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
701 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
703 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
704 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
705 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
706 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
708 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
709 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
710 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
711 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
713 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
714 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
715 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
716 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
718 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
719 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
720 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
721 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
723 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
724 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
725 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
726 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
728 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
729 UShort, toUShort );
730 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
731 UInt, toUInt );
732 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
733 ULong, idULong );
735 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
737 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
738 Short, toUShort );
739 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
740 Int, toUInt );
741 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
742 Long, idULong );
744 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
746 case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt );
747 case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );
749 case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt );
750 case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );
752 case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt );
753 case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );
755 case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt );
756 case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );
758 case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt, C );
759 case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
761 case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt, O );
762 case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
764 default:
765 /* shouldn't really make these calls from generated code */
766 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
767 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
768 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
769 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
774 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
775 /* Calculate all 6 flags from the supplied thunk parameters. */
776 ULong amd64g_calculate_rflags_all ( ULong cc_op,
777 ULong cc_dep1,
778 ULong cc_dep2,
779 ULong cc_ndep )
781 # if PROFILE_RFLAGS
782 if (!initted) initCounts();
783 n_calc_all++;
784 if (SHOW_COUNTS_NOW) showCounts();
785 # endif
786 return
787 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
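/* Worked example (editorial note, not in the original source): after an
   instruction such as "cmpq $7, %rax" with %rax == 3, the thunk describes
   the subtraction rather than the flags themselves, i.e. OP = SUBQ,
   DEP1 = 3, DEP2 = 7, NDEP = 0.  A call
      amd64g_calculate_rflags_all(AMD64G_CC_OP_SUBQ, 3, 7, 0)
   then returns a value in which CF (borrow), AF, PF and SF are set and ZF
   and OF are clear. */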
791 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
792 /* Calculate just the carry flag from the supplied thunk parameters. */
793 ULong amd64g_calculate_rflags_c ( ULong cc_op,
794 ULong cc_dep1,
795 ULong cc_dep2,
796 ULong cc_ndep )
798 # if PROFILE_RFLAGS
799 if (!initted) initCounts();
800 n_calc_c++;
801 tabc_fast[cc_op]++;
802 if (SHOW_COUNTS_NOW) showCounts();
803 # endif
805 /* Fast-case some common ones. */
806 switch (cc_op) {
807 case AMD64G_CC_OP_COPY:
808 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
809 case AMD64G_CC_OP_LOGICQ:
810 case AMD64G_CC_OP_LOGICL:
811 case AMD64G_CC_OP_LOGICW:
812 case AMD64G_CC_OP_LOGICB:
813 return 0;
814 // case AMD64G_CC_OP_SUBL:
815 // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
816 // ? AMD64G_CC_MASK_C : 0;
817 // case AMD64G_CC_OP_SUBW:
818 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
819 // ? AMD64G_CC_MASK_C : 0;
820 // case AMD64G_CC_OP_SUBB:
821 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
822 // ? AMD64G_CC_MASK_C : 0;
823 // case AMD64G_CC_OP_INCL:
824 // case AMD64G_CC_OP_DECL:
825 // return cc_ndep & AMD64G_CC_MASK_C;
826 default:
827 break;
830 # if PROFILE_RFLAGS
831 tabc_fast[cc_op]--;
832 tabc_slow[cc_op]++;
833 # endif
835 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
836 & AMD64G_CC_MASK_C;
840 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
841 /* returns 1 or 0 */
842 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
843 ULong cc_op,
844 ULong cc_dep1,
845 ULong cc_dep2,
846 ULong cc_ndep )
848 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
849 cc_dep2, cc_ndep);
850 ULong of,sf,zf,cf,pf;
851 ULong inv = cond & 1;
853 # if PROFILE_RFLAGS
854 if (!initted) initCounts();
855 tab_cond[cc_op][cond]++;
856 n_calc_cond++;
857 if (SHOW_COUNTS_NOW) showCounts();
858 # endif
860 switch (cond) {
861 case AMD64CondNO:
862 case AMD64CondO: /* OF == 1 */
863 of = rflags >> AMD64G_CC_SHIFT_O;
864 return 1 & (inv ^ of);
866 case AMD64CondNZ:
867 case AMD64CondZ: /* ZF == 1 */
868 zf = rflags >> AMD64G_CC_SHIFT_Z;
869 return 1 & (inv ^ zf);
871 case AMD64CondNB:
872 case AMD64CondB: /* CF == 1 */
873 cf = rflags >> AMD64G_CC_SHIFT_C;
874 return 1 & (inv ^ cf);
875 break;
877 case AMD64CondNBE:
878 case AMD64CondBE: /* (CF or ZF) == 1 */
879 cf = rflags >> AMD64G_CC_SHIFT_C;
880 zf = rflags >> AMD64G_CC_SHIFT_Z;
881 return 1 & (inv ^ (cf | zf));
882 break;
884 case AMD64CondNS:
885 case AMD64CondS: /* SF == 1 */
886 sf = rflags >> AMD64G_CC_SHIFT_S;
887 return 1 & (inv ^ sf);
889 case AMD64CondNP:
890 case AMD64CondP: /* PF == 1 */
891 pf = rflags >> AMD64G_CC_SHIFT_P;
892 return 1 & (inv ^ pf);
894 case AMD64CondNL:
895 case AMD64CondL: /* (SF xor OF) == 1 */
896 sf = rflags >> AMD64G_CC_SHIFT_S;
897 of = rflags >> AMD64G_CC_SHIFT_O;
898 return 1 & (inv ^ (sf ^ of));
899 break;
901 case AMD64CondNLE:
902 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
903 sf = rflags >> AMD64G_CC_SHIFT_S;
904 of = rflags >> AMD64G_CC_SHIFT_O;
905 zf = rflags >> AMD64G_CC_SHIFT_Z;
906 return 1 & (inv ^ ((sf ^ of) | zf));
907 break;
909 default:
910 /* shouldn't really make these calls from generated code */
911 vex_printf("amd64g_calculate_condition"
912 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
913 cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
914 vpanic("amd64g_calculate_condition");
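/* Editorial note, not in the original source: the AMD64Cond encoding places
   each negated condition next to its positive form, so bit 0 of 'cond'
   selects negation.  That is why the cases above fall through in pairs
   (e.g. AMD64CondNZ / AMD64CondZ) and a single "1 & (inv ^ flag)" serves
   both members of each pair. */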
919 /* VISIBLE TO LIBVEX CLIENT */
920 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
922 ULong rflags = amd64g_calculate_rflags_all_WRK(
923 vex_state->guest_CC_OP,
924 vex_state->guest_CC_DEP1,
925 vex_state->guest_CC_DEP2,
926 vex_state->guest_CC_NDEP
928 Long dflag = vex_state->guest_DFLAG;
929 vassert(dflag == 1 || dflag == -1);
930 if (dflag == -1)
931 rflags |= (1<<10);
932 if (vex_state->guest_IDFLAG == 1)
933 rflags |= (1<<21);
934 if (vex_state->guest_ACFLAG == 1)
935 rflags |= (1<<18);
937 return rflags;
940 /* VISIBLE TO LIBVEX CLIENT */
941 void
942 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
943 /*MOD*/VexGuestAMD64State* vex_state )
945 /* D flag */
946 if (rflags & AMD64G_CC_MASK_D) {
947 vex_state->guest_DFLAG = -1;
948 rflags &= ~AMD64G_CC_MASK_D;
950 else
951 vex_state->guest_DFLAG = 1;
953 /* ID flag */
954 if (rflags & AMD64G_CC_MASK_ID) {
955 vex_state->guest_IDFLAG = 1;
956 rflags &= ~AMD64G_CC_MASK_ID;
958 else
959 vex_state->guest_IDFLAG = 0;
961 /* AC flag */
962 if (rflags & AMD64G_CC_MASK_AC) {
963 vex_state->guest_ACFLAG = 1;
964 rflags &= ~AMD64G_CC_MASK_AC;
966 else
967 vex_state->guest_ACFLAG = 0;
969 UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
970 AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
971 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
972 vex_state->guest_CC_DEP1 = rflags & cc_mask;
973 vex_state->guest_CC_DEP2 = 0;
974 vex_state->guest_CC_NDEP = 0;
977 /* VISIBLE TO LIBVEX CLIENT */
978 void
979 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
980 /*MOD*/VexGuestAMD64State* vex_state )
982 ULong oszacp = amd64g_calculate_rflags_all_WRK(
983 vex_state->guest_CC_OP,
984 vex_state->guest_CC_DEP1,
985 vex_state->guest_CC_DEP2,
986 vex_state->guest_CC_NDEP
988 if (new_carry_flag & 1) {
989 oszacp |= AMD64G_CC_MASK_C;
990 } else {
991 oszacp &= ~AMD64G_CC_MASK_C;
993 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
994 vex_state->guest_CC_DEP1 = oszacp;
995 vex_state->guest_CC_DEP2 = 0;
996 vex_state->guest_CC_NDEP = 0;
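/* Illustrative sketch only (editorial addition, not part of the original
   source): forcing the carry flag in a fully initialised guest state 'st'
   (a hypothetical variable) and reading it back through the COPY thunk
   that the function above installs. */
#if 0
static void rflag_c_roundtrip_example ( VexGuestAMD64State* st )
{
   LibVEX_GuestAMD64_put_rflag_c(1, st);
   vassert( (LibVEX_GuestAMD64_get_rflags(st) & AMD64G_CC_MASK_C) != 0 );
   LibVEX_GuestAMD64_put_rflag_c(0, st);
   vassert( (LibVEX_GuestAMD64_get_rflags(st) & AMD64G_CC_MASK_C) == 0 );
}
#endif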
1000 /*---------------------------------------------------------------*/
1001 /*--- %rflags translation-time function specialisers. ---*/
1002 /*--- These help iropt specialise calls the above run-time ---*/
1003 /*--- %rflags functions. ---*/
1004 /*---------------------------------------------------------------*/
1006 /* Used by the optimiser to try specialisations. Returns an
1007 equivalent expression, or NULL if none. */
1009 static inline Bool isU64 ( IRExpr* e, ULong n )
1011 return e->tag == Iex_Const
1012 && e->Iex.Const.con->tag == Ico_U64
1013 && e->Iex.Const.con->Ico.U64 == n;
1016 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1017 and zero in any other case. */
1018 static Int isU64_1_shl_N ( IRExpr* e )
1020 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1021 return 0;
1022 ULong w64 = e->Iex.Const.con->Ico.U64;
1023 if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1024 return 0;
1025 if ((w64 & (w64 - 1)) != 0)
1026 return 0;
1027 /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1028 and we only need to find out which one it is. */
1029 for (Int n = 1; n <= 31; n++) {
1030 if (w64 == (1ULL << n))
1031 return n;
1033 /* Consequently we should never get here. */
1034 /*UNREACHED*/
1035 vassert(0);
1036 return 0;
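/* For example (editorial note, not in the original source): applied to a
   U64 constant of value 0x1000 this returns 12, while constants that are
   not a power of two in the range 2^1 .. 2^31 (e.g. 0, 1, 3, 1ULL << 32)
   all return 0. */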
1039 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1040 IRExpr** args,
1041 IRStmt** precedingStmts,
1042 Int n_precedingStmts )
1044 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1045 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1046 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1047 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1048 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
1050 Int i, arity = 0;
1051 for (i = 0; args[i]; i++)
1052 arity++;
1053 # if 0
1054 vex_printf("spec request:\n");
1055 vex_printf(" %s ", function_name);
1056 for (i = 0; i < arity; i++) {
1057 vex_printf(" ");
1058 ppIRExpr(args[i]);
1060 vex_printf("\n");
1061 # endif
1063 /* --------- specialising "amd64g_calculate_condition" --------- */
1065 if (vex_streq(function_name, "amd64g_calculate_condition")) {
1066 /* specialise calls to above "calculate condition" function */
1067 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1068 vassert(arity == 5);
1069 cond = args[0];
1070 cc_op = args[1];
1071 cc_dep1 = args[2];
1072 cc_dep2 = args[3];
1074 /*---------------- ADDQ ----------------*/
1076 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1077 /* long long add, then Z --> test (dst+src == 0) */
1078 return unop(Iop_1Uto64,
1079 binop(Iop_CmpEQ64,
1080 binop(Iop_Add64, cc_dep1, cc_dep2),
1081 mkU64(0)));
1084 /*---------------- ADDL ----------------*/
1086 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1087 /* This is very commonly generated by Javascript JITs, for
1088 the idiom "do a 32-bit add and jump to out-of-line code if
1089 an overflow occurs". */
1090 /* long add, then O (overflow)
1091 --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1092 --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1093 --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1095 vassert(isIRAtom(cc_dep1));
1096 vassert(isIRAtom(cc_dep2));
1097 return
1098 binop(Iop_And64,
1099 binop(Iop_Shr64,
1100 binop(Iop_And64,
1101 unop(Iop_Not64,
1102 binop(Iop_Xor64, cc_dep1, cc_dep2)),
1103 binop(Iop_Xor64,
1104 cc_dep1,
1105 binop(Iop_Add64, cc_dep1, cc_dep2))),
1106 mkU8(31)),
1107 mkU64(1));
1111 /*---------------- SUBQ ----------------*/
1113 /* 0, */
1114 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1115 /* long long sub/cmp, then O (overflow)
1116 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1117 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1119 vassert(isIRAtom(cc_dep1));
1120 vassert(isIRAtom(cc_dep2));
1121 return binop(Iop_Shr64,
1122 binop(Iop_And64,
1123 binop(Iop_Xor64, cc_dep1, cc_dep2),
1124 binop(Iop_Xor64,
1125 cc_dep1,
1126 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1127 mkU8(63));
1129 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1130 /* No action. Never yet found a test case. */
1133 /* 2, 3 */
1134 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1135 /* long long sub/cmp, then B (unsigned less than)
1136 --> test dst <u src */
1137 return unop(Iop_1Uto64,
1138 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1140 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1141 /* long long sub/cmp, then NB (unsigned greater than or equal)
1142 --> test src <=u dst */
1143 /* Note, args are opposite way round from the usual */
1144 return unop(Iop_1Uto64,
1145 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1148 /* 4, 5 */
1149 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1150 /* long long sub/cmp, then Z --> test dst==src */
1151 return unop(Iop_1Uto64,
1152 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1154 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1155 /* long long sub/cmp, then NZ --> test dst!=src */
1156 return unop(Iop_1Uto64,
1157 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1160 /* 6, 7 */
1161 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1162 /* long long sub/cmp, then BE (unsigned less than or equal)
1163 --> test dst <=u src */
1164 return unop(Iop_1Uto64,
1165 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1167 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1168 /* long long sub/cmp, then NBE (unsigned greater than)
1169 --> test !(dst <=u src) */
1170 return binop(Iop_Xor64,
1171 unop(Iop_1Uto64,
1172 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1173 mkU64(1));
1176 /* 8, 9 */
1177 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1178 /* long long sub/cmp, then S (negative)
1179 --> (dst-src)[63]
1180 --> (dst-src) >>u 63 */
1181 return binop(Iop_Shr64,
1182 binop(Iop_Sub64, cc_dep1, cc_dep2),
1183 mkU8(63));
1185 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1186 /* long long sub/cmp, then NS (not negative)
1187 --> (dst-src)[63] ^ 1
1188 --> ((dst-src) >>u 63) ^ 1 */
1189 return binop(Iop_Xor64,
1190 binop(Iop_Shr64,
1191 binop(Iop_Sub64, cc_dep1, cc_dep2),
1192 mkU8(63)),
1193 mkU64(1));
1196 /* 12, 13 */
1197 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1198 /* long long sub/cmp, then L (signed less than)
1199 --> test dst <s src */
1200 return unop(Iop_1Uto64,
1201 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1203 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1204 /* long long sub/cmp, then NL (signed greater than or equal)
1205 --> test dst >=s src
1206 --> test src <=s dst */
1207 return unop(Iop_1Uto64,
1208 binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1211 /* 14, 15 */
1212 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1213 /* long long sub/cmp, then LE (signed less than or equal)
1214 --> test dst <=s src */
1215 return unop(Iop_1Uto64,
1216 binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1218 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1219 /* long sub/cmp, then NLE (signed greater than)
1220 --> test !(dst <=s src)
1221 --> test (dst >s src)
1222 --> test (src <s dst) */
1223 return unop(Iop_1Uto64,
1224 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1228 /*---------------- SUBL ----------------*/
1230 /* 0, */
1231 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1232 /* This is very commonly generated by Javascript JITs, for
1233 the idiom "do a 32-bit subtract and jump to out-of-line
1234 code if an overflow occurs". */
1235 /* long sub/cmp, then O (overflow)
1236 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1237 --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1239 vassert(isIRAtom(cc_dep1));
1240 vassert(isIRAtom(cc_dep2));
1241 return
1242 binop(Iop_And64,
1243 binop(Iop_Shr64,
1244 binop(Iop_And64,
1245 binop(Iop_Xor64, cc_dep1, cc_dep2),
1246 binop(Iop_Xor64,
1247 cc_dep1,
1248 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1249 mkU8(31)),
1250 mkU64(1));
1252 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1253 /* No action. Never yet found a test case. */
1256 /* 2, 3 */
1258 /* It appears that LLVM 5.0 and later have a new way to find out
1259 whether the top N bits of a word W are all zero, by computing
1261 W <u 0---(N-1)---0 1 0---0
1263 In particular, the result will be defined if the top N bits of W
1264 are defined, even if the trailing bits -- those corresponding to
1265 the 0---0 section -- are undefined. Rather than make Memcheck
1266 more complex, we detect this case where we can and shift out the
1267 irrelevant and potentially undefined bits. */
1268 Int n = 0;
1269 if (isU64(cc_op, AMD64G_CC_OP_SUBL)
1270 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))
1271 && (n = isU64_1_shl_N(cc_dep2)) > 0) {
1272 /* long sub/cmp, then B (unsigned less than),
1273 where dep2 is a power of 2:
1274 -> CmpLT32(dep1, 1 << N)
1275 -> CmpEQ32(dep1 >>u N, 0)
1277 long sub/cmp, then NB (unsigned greater than or equal),
1278 where dep2 is a power of 2:
1279 -> CmpGE32(dep1, 1 << N)
1280 -> CmpNE32(dep1 >>u N, 0)
1281 This avoids CmpLT32U/CmpGE32U being applied to potentially
1282 uninitialised bits in the area being shifted out. */
1283 vassert(n >= 1 && n <= 31);
1284 Bool isNB = isU64(cond, AMD64CondNB);
1285 return unop(Iop_1Uto64,
1286 binop(isNB ? Iop_CmpNE32 : Iop_CmpEQ32,
1287 binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1288 mkU8(n)),
1289 mkU32(0)));
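/* Worked example (editorial note, not in the original source): for the
   idiom described above, e.g. "cmpl $0x10000, %eax ; jb ..." testing
   whether the top 16 bits of %eax are zero, dep2 is 1 << 16, so the B
   condition is rewritten as CmpEQ32(%eax >>u 16, 0); Memcheck then never
   sees the possibly undefined low 16 bits. */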
1292 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1293 /* long sub/cmp, then B (unsigned less than)
1294 --> test dst <u src */
1295 return unop(Iop_1Uto64,
1296 binop(Iop_CmpLT32U,
1297 unop(Iop_64to32, cc_dep1),
1298 unop(Iop_64to32, cc_dep2)));
1300 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1301 /* long sub/cmp, then NB (unsigned greater than or equal)
1302 --> test src <=u dst */
1303 /* Note, args are opposite way round from the usual */
1304 return unop(Iop_1Uto64,
1305 binop(Iop_CmpLE32U,
1306 unop(Iop_64to32, cc_dep2),
1307 unop(Iop_64to32, cc_dep1)));
1310 /* 4, 5 */
1311 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1312 /* long sub/cmp, then Z --> test dst==src */
1313 return unop(Iop_1Uto64,
1314 binop(Iop_CmpEQ32,
1315 unop(Iop_64to32, cc_dep1),
1316 unop(Iop_64to32, cc_dep2)));
1318 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1319 /* long sub/cmp, then NZ --> test dst!=src */
1320 return unop(Iop_1Uto64,
1321 binop(Iop_CmpNE32,
1322 unop(Iop_64to32, cc_dep1),
1323 unop(Iop_64to32, cc_dep2)));
1326 /* 6, 7 */
1327 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1328 /* long sub/cmp, then BE (unsigned less than or equal)
1329 --> test dst <=u src */
1330 return unop(Iop_1Uto64,
1331 binop(Iop_CmpLE32U,
1332 unop(Iop_64to32, cc_dep1),
1333 unop(Iop_64to32, cc_dep2)));
1335 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1336 /* long sub/cmp, then NBE (unsigned greater than)
1337 --> test src <u dst */
1338 /* Note, args are opposite way round from the usual */
1339 return unop(Iop_1Uto64,
1340 binop(Iop_CmpLT32U,
1341 unop(Iop_64to32, cc_dep2),
1342 unop(Iop_64to32, cc_dep1)));
1345 /* 8, 9 */
1346 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1347 /* long sub/cmp, then S (negative)
1348 --> (dst-src)[31]
1349 --> ((dst -64 src) >>u 31) & 1
1350 Pointless to narrow the args to 32 bit before the subtract. */
1351 return binop(Iop_And64,
1352 binop(Iop_Shr64,
1353 binop(Iop_Sub64, cc_dep1, cc_dep2),
1354 mkU8(31)),
1355 mkU64(1));
1357 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1358 /* long sub/cmp, then NS (not negative)
1359 --> (dst-src)[31] ^ 1
1360 --> (((dst -64 src) >>u 31) & 1) ^ 1
1361 Pointless to narrow the args to 32 bit before the subtract. */
1362 return binop(Iop_Xor64,
1363 binop(Iop_And64,
1364 binop(Iop_Shr64,
1365 binop(Iop_Sub64, cc_dep1, cc_dep2),
1366 mkU8(31)),
1367 mkU64(1)),
1368 mkU64(1));
1371 /* 12, 13 */
1372 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1373 /* long sub/cmp, then L (signed less than)
1374 --> test dst <s src */
1375 return unop(Iop_1Uto64,
1376 binop(Iop_CmpLT32S,
1377 unop(Iop_64to32, cc_dep1),
1378 unop(Iop_64to32, cc_dep2)));
1380 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1381 /* long sub/cmp, then NL (signed greater than or equal)
1382 --> test dst >=s src
1383 --> test src <=s dst */
1384 return unop(Iop_1Uto64,
1385 binop(Iop_CmpLE32S,
1386 unop(Iop_64to32, cc_dep2),
1387 unop(Iop_64to32, cc_dep1)));
1390 /* 14, 15 */
1391 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1392 /* long sub/cmp, then LE (signed less than or equal)
1393 --> test dst <=s src */
1394 return unop(Iop_1Uto64,
1395 binop(Iop_CmpLE32S,
1396 unop(Iop_64to32, cc_dep1),
1397 unop(Iop_64to32, cc_dep2)));
1400 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1401 /* long sub/cmp, then NLE (signed greater than)
1402 --> test !(dst <=s src)
1403 --> test (dst >s src)
1404 --> test (src <s dst) */
1405 return unop(Iop_1Uto64,
1406 binop(Iop_CmpLT32S,
1407 unop(Iop_64to32, cc_dep2),
1408 unop(Iop_64to32, cc_dep1)));
1412 /*---------------- SUBW ----------------*/
1414 /* 4, 5 */
1415 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1416 /* word sub/cmp, then Z --> test dst==src */
1417 return unop(Iop_1Uto64,
1418 binop(Iop_CmpEQ16,
1419 unop(Iop_64to16,cc_dep1),
1420 unop(Iop_64to16,cc_dep2)));
1422 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1423 /* word sub/cmp, then NZ --> test dst!=src */
1424 return unop(Iop_1Uto64,
1425 binop(Iop_CmpNE16,
1426 unop(Iop_64to16,cc_dep1),
1427 unop(Iop_64to16,cc_dep2)));
1430 /* 6, */
1431 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1432 /* word sub/cmp, then BE (unsigned less than or equal)
1433 --> test dst <=u src */
1434 return unop(Iop_1Uto64,
1435 binop(Iop_CmpLE64U,
1436 binop(Iop_Shl64, cc_dep1, mkU8(48)),
1437 binop(Iop_Shl64, cc_dep2, mkU8(48))));
1440 /* 8, 9 */
1441 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1442 && isU64(cc_dep2, 0)) {
1443 /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1444 --> test dst <s 0
1445 --> (ULong)dst[15]
1446 This is yet another scheme by which clang figures out if the
1447 top bit of a word is 1 or 0. See also LOGICB/CondS below. */
1448 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1449 for a 16-bit comparison, since the args to the helper
1450 function are always U64s. */
1451 return binop(Iop_And64,
1452 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1453 mkU64(1));
1455 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1456 && isU64(cc_dep2, 0)) {
1457 /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1458 --> test !(dst <s 0)
1459 --> (ULong) !dst[15]
1461 return binop(Iop_Xor64,
1462 binop(Iop_And64,
1463 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1464 mkU64(1)),
1465 mkU64(1));
1468 /* 14, */
1469 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1470 /* word sub/cmp, then LE (signed less than or equal)
1471 --> test dst <=s src */
1472 return unop(Iop_1Uto64,
1473 binop(Iop_CmpLE64S,
1474 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1475 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1479 /*---------------- SUBB ----------------*/
1481 /* 2, 3 */
1482 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1483 /* byte sub/cmp, then B (unsigned less than)
1484 --> test dst <u src */
1485 return unop(Iop_1Uto64,
1486 binop(Iop_CmpLT64U,
1487 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1488 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1490 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1491 /* byte sub/cmp, then NB (unsigned greater than or equal)
1492 --> test src <=u dst */
1493 /* Note, args are opposite way round from the usual */
1494 return unop(Iop_1Uto64,
1495 binop(Iop_CmpLE64U,
1496 binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1497 binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1500 /* 4, 5 */
1501 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1502 /* byte sub/cmp, then Z --> test dst==src */
1503 return unop(Iop_1Uto64,
1504 binop(Iop_CmpEQ8,
1505 unop(Iop_64to8,cc_dep1),
1506 unop(Iop_64to8,cc_dep2)));
1508 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1509 /* byte sub/cmp, then NZ --> test dst!=src */
1510 return unop(Iop_1Uto64,
1511 binop(Iop_CmpNE8,
1512 unop(Iop_64to8,cc_dep1),
1513 unop(Iop_64to8,cc_dep2)));
1516 /* 6, */
1517 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1518 /* byte sub/cmp, then BE (unsigned less than or equal)
1519 --> test dst <=u src */
1520 return unop(Iop_1Uto64,
1521 binop(Iop_CmpLE64U,
1522 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1523 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1526 /* 8, 9 */
1527 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1528 && isU64(cc_dep2, 0)) {
1529 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1530 --> test dst <s 0
1531 --> (ULong)dst[7]
1532 This is yet another scheme by which gcc figures out if the
1533 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1534 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1535 for an 8-bit comparison, since the args to the helper
1536 function are always U64s. */
1537 return binop(Iop_And64,
1538 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1539 mkU64(1));
1541 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1542 && isU64(cc_dep2, 0)) {
1543 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1544 --> test !(dst <s 0)
1545 --> (ULong) !dst[7]
1547 return binop(Iop_Xor64,
1548 binop(Iop_And64,
1549 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1550 mkU64(1)),
1551 mkU64(1));
1554 /*---------------- LOGICQ ----------------*/
1556 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1557 /* long long and/or/xor, then Z --> test dst==0 */
1558 return unop(Iop_1Uto64,
1559 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1561 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1562 /* long long and/or/xor, then NZ --> test dst!=0 */
1563 return unop(Iop_1Uto64,
1564 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1567 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1568 /* long long and/or/xor, then L
1569 LOGIC sets SF and ZF according to the
1570 result and makes OF be zero. L computes SF ^ OF, but
1571 OF is zero, so this reduces to SF -- which will be 1 iff
1572 the result is < signed 0. Hence ...
1574 return unop(Iop_1Uto64,
1575 binop(Iop_CmpLT64S,
1576 cc_dep1,
1577 mkU64(0)));
1580 /*---------------- LOGICL ----------------*/
1582 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1583 /* long and/or/xor, then Z --> test dst==0 */
1584 return unop(Iop_1Uto64,
1585 binop(Iop_CmpEQ32,
1586 unop(Iop_64to32, cc_dep1),
1587 mkU32(0)));
1589 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1590 /* long and/or/xor, then NZ --> test dst!=0 */
1591 return unop(Iop_1Uto64,
1592 binop(Iop_CmpNE32,
1593 unop(Iop_64to32, cc_dep1),
1594 mkU32(0)));
1597 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1598 /* long and/or/xor, then LE
1599 This is pretty subtle. LOGIC sets SF and ZF according to the
1600 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1601 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1602 the result is <=signed 0. Hence ...
1604 return unop(Iop_1Uto64,
1605 binop(Iop_CmpLE32S,
1606 unop(Iop_64to32, cc_dep1),
1607 mkU32(0)));
1610 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1611 /* long and/or/xor, then S --> (ULong)result[31] */
1612 return binop(Iop_And64,
1613 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1614 mkU64(1));
1616 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1617 /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1618 return binop(Iop_Xor64,
1619 binop(Iop_And64,
1620 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1621 mkU64(1)),
1622 mkU64(1));
1625 /*---------------- LOGICW ----------------*/
1627 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1628 /* word and/or/xor, then Z --> test dst==0 */
1629 return unop(Iop_1Uto64,
1630 binop(Iop_CmpEQ64,
1631 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1632 mkU64(0)));
1634 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1635 /* word and/or/xor, then NZ --> test dst!=0 */
1636 return unop(Iop_1Uto64,
1637 binop(Iop_CmpNE64,
1638 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1639 mkU64(0)));
1642 /*---------------- LOGICB ----------------*/
1644 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1645 /* byte and/or/xor, then Z --> test dst==0 */
1646 return unop(Iop_1Uto64,
1647 binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
1648 mkU64(0)));
1650 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1651 /* byte and/or/xor, then NZ --> test dst!=0 */
1652 return unop(Iop_1Uto64,
1653 binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
1654 mkU64(0)));
1657 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1658 /* this is an idiom gcc sometimes uses to find out if the top
1659 bit of a byte register is set: eg testb %al,%al; js ..
1660 Since it just depends on the top bit of the byte, extract
1661 that bit and explicitly get rid of all the rest. This
1662 helps memcheck avoid false positives in the case where any
1663 of the other bits in the byte are undefined. */
1664 /* byte and/or/xor, then S --> (UInt)result[7] */
1665 return binop(Iop_And64,
1666 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1667 mkU64(1));
1669 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1670 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1671 return binop(Iop_Xor64,
1672 binop(Iop_And64,
1673 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1674 mkU64(1)),
1675 mkU64(1));
1678 /*---------------- INCB ----------------*/
1680 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1681 /* 8-bit inc, then LE --> sign bit of the arg */
1682 return binop(Iop_And64,
1683 binop(Iop_Shr64,
1684 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1685 mkU8(7)),
1686 mkU64(1));
1689 /*---------------- INCW ----------------*/
1691 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1692 /* 16-bit inc, then Z --> test dst == 0 */
1693 return unop(Iop_1Uto64,
1694 binop(Iop_CmpEQ64,
1695 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1696 mkU64(0)));
1699 /*---------------- DECL ----------------*/
1701 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1702 /* dec L, then Z --> test dst == 0 */
1703 return unop(Iop_1Uto64,
1704 binop(Iop_CmpEQ32,
1705 unop(Iop_64to32, cc_dep1),
1706 mkU32(0)));
1709 /*---------------- DECW ----------------*/
1711 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1712 /* 16-bit dec, then NZ --> test dst != 0 */
1713 return unop(Iop_1Uto64,
1714 binop(Iop_CmpNE64,
1715 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1716 mkU64(0)));
1719 /*---------------- SHRQ ----------------*/
1721 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1722 /* SHRQ, then Z --> test dep1 == 0 */
1723 return unop(Iop_1Uto64,
1724 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1726 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1727 /* SHRQ, then NZ --> test dep1 != 0 */
1728 return unop(Iop_1Uto64,
1729 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1732 /*---------------- SHRL ----------------*/
1734 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1735 /* SHRL, then Z --> test dep1 == 0 */
1736 return unop(Iop_1Uto64,
1737 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1738 mkU32(0)));
1740 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1741 /* SHRL, then NZ --> test dep1 != 0 */
1742 return unop(Iop_1Uto64,
1743 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1744 mkU32(0)));
1747 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1748 /* SHRL/SARL, then S --> (ULong)result[31] */
1749 return binop(Iop_And64,
1750 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1751 mkU64(1));
1753 // The following looks correct to me, but never seems to happen because
1754 // the front end converts jns to js by switching the fallthrough vs
1755 // taken addresses. See jcc_01(). But then why do other conditions
1756 // considered by this function show up in both variants (xx and Nxx) ?
1757 //if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1758 // /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1759 // vassert(0);
1760 // return binop(Iop_Xor64,
1761 // binop(Iop_And64,
1762 // binop(Iop_Shr64, cc_dep1, mkU8(31)),
1763 // mkU64(1)),
1764 // mkU64(1));
1767 /*---------------- COPY ----------------*/
1768 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1769 jbe" for example. */
1771 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1772 && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1773 /* COPY, then BE --> extract C and Z from dep1, and test (C
1774 or Z == 1). */
1775 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1776 or Z == 0). */
1777 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1778 return
1779 unop(
1780 Iop_1Uto64,
1781 binop(
1782 Iop_CmpEQ64,
1783 binop(
1784 Iop_And64,
1785 binop(
1786 Iop_Or64,
1787 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1788 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1790 mkU64(1)
1792 mkU64(nnn)
1797 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1798 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
1799 /* COPY, then B --> extract C from dep1, and test (C == 1). */
1800 /* COPY, then NB --> extract C from dep1, and test (C == 0). */
1801 ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
1802 return
1803 unop(
1804 Iop_1Uto64,
1805 binop(
1806 Iop_CmpEQ64,
1807 binop(
1808 Iop_And64,
1809 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1810 mkU64(1)
1812 mkU64(nnn)
1817 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1818 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1819 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1820 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1821 ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1822 return
1823 unop(
1824 Iop_1Uto64,
1825 binop(
1826 Iop_CmpEQ64,
1827 binop(
1828 Iop_And64,
1829 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1830 mkU64(1)
1831 ),
1832 mkU64(nnn)
1833 )
1834 );
1835 }
1837 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1838 && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
1839 /* COPY, then P --> extract P from dep1, and test (P == 1). */
1840 /* COPY, then NP --> extract P from dep1, and test (P == 0). */
1841 ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
1842 return
1843 unop(
1844 Iop_1Uto64,
1845 binop(
1846 Iop_CmpEQ64,
1847 binop(
1848 Iop_And64,
1849 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1850 mkU64(1)
1851 ),
1852 mkU64(nnn)
1853 )
1854 );
1855 }
1857 return NULL;
1860 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1862 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1863 /* specialise calls to above "calculate_rflags_c" function */
1864 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1865 vassert(arity == 4);
1866 cc_op = args[0];
1867 cc_dep1 = args[1];
1868 cc_dep2 = args[2];
1869 cc_ndep = args[3];
1871 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1872 /* C after sub denotes unsigned less than */
1873 return unop(Iop_1Uto64,
1874 binop(Iop_CmpLT64U,
1875 cc_dep1,
1876 cc_dep2));
1878 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1879 /* C after sub denotes unsigned less than */
1880 return unop(Iop_1Uto64,
1881 binop(Iop_CmpLT32U,
1882 unop(Iop_64to32, cc_dep1),
1883 unop(Iop_64to32, cc_dep2)));
1885 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1886 /* C after sub denotes unsigned less than */
1887 return unop(Iop_1Uto64,
1888 binop(Iop_CmpLT64U,
1889 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1890 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1892 if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
1893 /* C after add denotes sum <u either arg */
1894 return unop(Iop_1Uto64,
1895 binop(Iop_CmpLT64U,
1896 binop(Iop_Add64, cc_dep1, cc_dep2),
1897 cc_dep1));
1899 if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
1900 /* C after add denotes sum <u either arg */
1901 return unop(Iop_1Uto64,
1902 binop(Iop_CmpLT32U,
1903 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
1904 unop(Iop_64to32, cc_dep1)));
1906 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1907 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1908 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1909 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1910 /* cflag after logic is zero */
1911 return mkU64(0);
1913 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1914 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1915 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1916 return cc_ndep;
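/* (inc and dec leave the carry flag untouched, so the front end
   stashes the pre-existing C bit in CC_NDEP for these thunks; handing
   it straight back here is exact, not an approximation.) */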
1919 # if 0
1920 if (cc_op->tag == Iex_Const) {
1921 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1923 # endif
1925 return NULL;
1928 # undef unop
1929 # undef binop
1930 # undef mkU64
1931 # undef mkU32
1932 # undef mkU8
1934 return NULL;
1938 /*---------------------------------------------------------------*/
1939 /*--- Supporting functions for x87 FPU activities. ---*/
1940 /*---------------------------------------------------------------*/
1942 static inline Bool host_is_little_endian ( void )
1944 UInt x = 0x76543210;
1945 UChar* p = (UChar*)(&x);
1946 return toBool(*p == 0x10);
1949 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1950 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
1951 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1953 Bool mantissaIsZero;
1954 Int bexp;
1955 UChar sign;
1956 UChar* f64;
1958 vassert(host_is_little_endian());
1960 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1962 f64 = (UChar*)(&dbl);
1963 sign = toUChar( (f64[7] >> 7) & 1 );
1965 /* First off, if the tag indicates the register was empty,
1966 return 1,0,sign,1 */
1967 if (tag == 0) {
1968 /* vex_printf("Empty\n"); */
1969 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1970 | AMD64G_FC_MASK_C0;
1973 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1974 bexp &= 0x7FF;
1976 mantissaIsZero
1977 = toBool(
1978 (f64[6] & 0x0F) == 0
1979 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1982 /* If both exponent and mantissa are zero, the value is zero.
1983 Return 1,0,sign,0. */
1984 if (bexp == 0 && mantissaIsZero) {
1985 /* vex_printf("Zero\n"); */
1986 return AMD64G_FC_MASK_C3 | 0
1987 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1990 /* If exponent is zero but mantissa isn't, it's a denormal.
1991 Return 1,1,sign,0. */
1992 if (bexp == 0 && !mantissaIsZero) {
1993 /* vex_printf("Denormal\n"); */
1994 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1995 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1998 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1999 Return 0,1,sign,1. */
2000 if (bexp == 0x7FF && mantissaIsZero) {
2001 /* vex_printf("Inf\n"); */
2002 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2003 | AMD64G_FC_MASK_C0;
2006 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2007 Return 0,0,sign,1. */
2008 if (bexp == 0x7FF && !mantissaIsZero) {
2009 /* vex_printf("NaN\n"); */
2010 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2013 /* Uh, ok, we give up. It must be a normal finite number.
2014 Return 0,1,sign,0.
2015 */
2016 /* vex_printf("normal\n"); */
2017 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
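/* For reference, the (C3,C2,C0) encodings produced above, with C1
   holding the sign bit in every case: empty 1,0,1; zero 1,0,0;
   denormal 1,1,0; infinity 0,1,1; NaN 0,0,1; normal finite 0,1,0. */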
2021 /* This is used to implement both 'frstor' and 'fldenv'. The latter
2022 appears to differ from the former only in that the 8 FP registers
2023 themselves are not transferred into the guest state. */
2024 static
2025 VexEmNote do_put_x87 ( Bool moveRegs,
2026 /*IN*/Fpu_State* x87_state,
2027 /*OUT*/VexGuestAMD64State* vex_state )
2029 Int stno, preg;
2030 UInt tag;
2031 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2032 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2033 UInt ftop = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2034 UInt tagw = x87_state->env[FP_ENV_TAG];
2035 UInt fpucw = x87_state->env[FP_ENV_CTRL];
2036 UInt c3210 = x87_state->env[FP_ENV_STAT] & 0x4700;
2037 VexEmNote ew;
2038 UInt fpround;
2039 ULong pair;
2041 /* Copy registers and tags */
2042 for (stno = 0; stno < 8; stno++) {
2043 preg = (stno + ftop) & 7;
2044 tag = (tagw >> (2*preg)) & 3;
2045 if (tag == 3) {
2046 /* register is empty */
2047 /* hmm, if it's empty, does it still get written? Probably
2048 safer to say it does. If we don't, memcheck could get out
2049 of sync, in that it thinks all FP registers are defined by
2050 this helper, but in reality some have not been updated. */
2051 if (moveRegs)
2052 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2053 vexTags[preg] = 0;
2054 } else {
2055 /* register is non-empty */
2056 if (moveRegs)
2057 convert_f80le_to_f64le( &x87_state->reg[10*stno],
2058 (UChar*)&vexRegs[preg] );
2059 vexTags[preg] = 1;
2063 /* stack pointer */
2064 vex_state->guest_FTOP = ftop;
2066 /* status word */
2067 vex_state->guest_FC3210 = c3210;
2069 /* handle the control word, setting FPROUND and detecting any
2070 emulation warnings. */
2071 pair = amd64g_check_fldcw ( (ULong)fpucw );
2072 fpround = (UInt)pair & 0xFFFFFFFFULL;
2073 ew = (VexEmNote)(pair >> 32);
2075 vex_state->guest_FPROUND = fpround & 3;
2077 /* emulation warnings --> caller */
2078 return ew;
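/* Note on the tag conversion above: the x87 tag word carries two bits
   per physical register (00 = valid, 01 = zero, 10 = special,
   11 = empty), whereas the VEX guest state keeps a single byte per
   register (0 = empty, 1 = full), so all three non-empty encodings
   collapse to 1. */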
2082 /* Create an x87 FPU state from the guest state, as close as
2083 we can approximate it. */
2084 static
2085 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2086 /*OUT*/Fpu_State* x87_state )
2088 Int i, stno, preg;
2089 UInt tagw;
2090 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2091 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2092 UInt ftop = vex_state->guest_FTOP;
2093 UInt c3210 = vex_state->guest_FC3210;
2095 for (i = 0; i < 14; i++)
2096 x87_state->env[i] = 0;
2098 x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2099 = x87_state->env[13] = 0xFFFF;
2100 x87_state->env[FP_ENV_STAT]
2101 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2102 x87_state->env[FP_ENV_CTRL]
2103 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2105 /* Dump the register stack in ST order. */
2106 tagw = 0;
2107 for (stno = 0; stno < 8; stno++) {
2108 preg = (stno + ftop) & 7;
2109 if (vexTags[preg] == 0) {
2110 /* register is empty */
2111 tagw |= (3 << (2*preg));
2112 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2113 &x87_state->reg[10*stno] );
2114 } else {
2115 /* register is full. */
2116 tagw |= (0 << (2*preg));
2117 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2118 &x87_state->reg[10*stno] );
2121 x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2125 /*---------------------------------------------------------------*/
2126 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2127 /*---------------------------------------------------------------*/
2129 /* CALLED FROM GENERATED CODE */
2130 /* DIRTY HELPER (reads guest state, writes guest mem) */
2131 /* XSAVE component 0 is the x87 FPU state. */
2132 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2133 ( VexGuestAMD64State* gst, HWord addr )
2135 /* Derived from values obtained from
2136 vendor_id : AuthenticAMD
2137 cpu family : 15
2138 model : 12
2139 model name : AMD Athlon(tm) 64 Processor 3200+
2140 stepping : 0
2141 cpu MHz : 2200.000
2142 cache size : 512 KB
2143 */
2144 /* Somewhat roundabout, but at least it's simple. */
2145 Fpu_State tmp;
2146 UShort* addrS = (UShort*)addr;
2147 UChar* addrC = (UChar*)addr;
2148 UShort fp_tags;
2149 UInt summary_tags;
2150 Int r, stno;
2151 UShort *srcS, *dstS;
2153 do_get_x87( gst, &tmp );
2155 /* Now build the proper fxsave x87 image from the fsave x87 image
2156 we just made. */
2158 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2159 addrS[1] = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2161 /* set addrS[2] in an endian-independent way */
2162 summary_tags = 0;
2163 fp_tags = tmp.env[FP_ENV_TAG];
2164 for (r = 0; r < 8; r++) {
2165 if ( ((fp_tags >> (2*r)) & 3) != 3 )
2166 summary_tags |= (1 << r);
2168 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
2169 addrC[5] = 0; /* pad */
2171 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2172 does not write this field. (?!) */
2173 addrS[3] = 0; /* BOGUS */
2175 /* RIP (Last x87 instruction pointer). From experimentation, the
2176 real CPU does not write this field. (?!) */
2177 addrS[4] = 0; /* BOGUS */
2178 addrS[5] = 0; /* BOGUS */
2179 addrS[6] = 0; /* BOGUS */
2180 addrS[7] = 0; /* BOGUS */
2182 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2183 does not write this field. (?!) */
2184 addrS[8] = 0; /* BOGUS */
2185 addrS[9] = 0; /* BOGUS */
2186 addrS[10] = 0; /* BOGUS */
2187 addrS[11] = 0; /* BOGUS */
2189 /* addrS[13,12] are MXCSR -- not written */
2190 /* addrS[15,14] are MXCSR_MASK -- not written */
2192 /* Copy in the FP registers, in ST order. */
2193 for (stno = 0; stno < 8; stno++) {
2194 srcS = (UShort*)(&tmp.reg[10*stno]);
2195 dstS = (UShort*)(&addrS[16 + 8*stno]);
2196 dstS[0] = srcS[0];
2197 dstS[1] = srcS[1];
2198 dstS[2] = srcS[2];
2199 dstS[3] = srcS[3];
2200 dstS[4] = srcS[4];
2201 dstS[5] = 0;
2202 dstS[6] = 0;
2203 dstS[7] = 0;
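/* Byte offsets of the FXSAVE-style image written above: 0 FCW, 2 FSW,
   4 FTW (abridged tag byte), 6 FOP, 8..15 RIP, 16..23 RDP,
   24..27 MXCSR, 28..31 MXCSR_MASK, then eight 16-byte slots starting
   at offset 32 holding ST(0)..ST(7) as 80-bit values zero-padded to
   16 bytes each. */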
2208 /* CALLED FROM GENERATED CODE */
2209 /* DIRTY HELPER (reads guest state, writes guest mem) */
2210 /* XSAVE component 1 is the SSE state. */
2211 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2212 ( VexGuestAMD64State* gst, HWord addr )
2214 UShort* addrS = (UShort*)addr;
2215 UInt mxcsr;
2217 /* The only non-register parts of the SSE state are MXCSR and
2218 MXCSR_MASK. */
2219 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2221 addrS[12] = toUShort(mxcsr); /* MXCSR */
2222 addrS[13] = toUShort(mxcsr >> 16);
2224 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2225 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2229 /* VISIBLE TO LIBVEX CLIENT */
2230 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2231 the result at the given address which represents a buffer of at
2232 least 416 bytes.
2234 This function is not called from generated code. FXSAVE is dealt
2235 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2236 functions above plus some in-line IR. This function is merely a
2237 convenience function for VEX's users.
2238 */
2239 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2240 /*OUT*/HWord fp_state )
2242 /* Do the x87 part */
2243 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2245 /* And now the SSE part, except for the registers themselves. */
2246 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2248 /* That's the first 160 bytes of the image done. */
2249 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2250 big-endian, these need to be byte-swapped. */
2251 U128 *xmm = (U128 *)(fp_state + 160);
2252 vassert(host_is_little_endian());
2254 # define COPY_U128(_dst,_src) \
2255 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2256 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2257 while (0)
2259 COPY_U128( xmm[0], gst->guest_YMM0 );
2260 COPY_U128( xmm[1], gst->guest_YMM1 );
2261 COPY_U128( xmm[2], gst->guest_YMM2 );
2262 COPY_U128( xmm[3], gst->guest_YMM3 );
2263 COPY_U128( xmm[4], gst->guest_YMM4 );
2264 COPY_U128( xmm[5], gst->guest_YMM5 );
2265 COPY_U128( xmm[6], gst->guest_YMM6 );
2266 COPY_U128( xmm[7], gst->guest_YMM7 );
2267 COPY_U128( xmm[8], gst->guest_YMM8 );
2268 COPY_U128( xmm[9], gst->guest_YMM9 );
2269 COPY_U128( xmm[10], gst->guest_YMM10 );
2270 COPY_U128( xmm[11], gst->guest_YMM11 );
2271 COPY_U128( xmm[12], gst->guest_YMM12 );
2272 COPY_U128( xmm[13], gst->guest_YMM13 );
2273 COPY_U128( xmm[14], gst->guest_YMM14 );
2274 COPY_U128( xmm[15], gst->guest_YMM15 );
2275 # undef COPY_U128
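/* A minimal usage sketch for a VEX client; the names "gstate" and
   "fpimg" are illustrative only, and the only stated requirement is a
   buffer of at least 416 bytes:

      VexGuestAMD64State gstate;
      UChar fpimg[512];
      LibVEX_GuestAMD64_fxsave ( &gstate, (HWord)fpimg );
      ...
      VexEmNote ew = LibVEX_GuestAMD64_fxrstor ( (HWord)fpimg, &gstate );
*/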
2279 /*---------------------------------------------------------------*/
2280 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2281 /*---------------------------------------------------------------*/
2283 /* CALLED FROM GENERATED CODE */
2284 /* DIRTY HELPER (writes guest state, reads guest mem) */
2285 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2286 ( VexGuestAMD64State* gst, HWord addr )
2288 Fpu_State tmp;
2289 UShort* addrS = (UShort*)addr;
2290 UChar* addrC = (UChar*)addr;
2291 UShort fp_tags;
2292 Int r, stno, i;
2294 /* Copy the x87 registers out of the image, into a temporary
2295 Fpu_State struct. */
2296 for (i = 0; i < 14; i++) tmp.env[i] = 0;
2297 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2298 /* fill in tmp.reg[0..7] */
2299 for (stno = 0; stno < 8; stno++) {
2300 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2301 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2302 dstS[0] = srcS[0];
2303 dstS[1] = srcS[1];
2304 dstS[2] = srcS[2];
2305 dstS[3] = srcS[3];
2306 dstS[4] = srcS[4];
2308 /* fill in tmp.env[0..13] */
2309 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2310 tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2312 fp_tags = 0;
2313 for (r = 0; r < 8; r++) {
2314 if (addrC[4] & (1<<r))
2315 fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2316 else
2317 fp_tags |= (3 << (2*r)); /* EMPTY */
2319 tmp.env[FP_ENV_TAG] = fp_tags;
2321 /* Now write 'tmp' into the guest state. */
2322 VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2324 return warnX87;
2328 /* CALLED FROM GENERATED CODE */
2329 /* DIRTY HELPER (writes guest state, reads guest mem) */
2330 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2331 ( VexGuestAMD64State* gst, HWord addr )
2333 UShort* addrS = (UShort*)addr;
2334 UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
2335 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2336 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
2338 VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2340 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2341 return warnXMM;
2345 /* VISIBLE TO LIBVEX CLIENT */
2346 /* Do FXRSTOR from the supplied address and store read values to the given
2347 VexGuestAMD64State structure.
2349 This function is not called from generated code. FXRSTOR is dealt
2350 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2351 functions above plus some in-line IR. This function is merely a
2352 convenience function for VEX's users.
2353 */
2354 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2355 /*MOD*/VexGuestAMD64State* gst )
2357 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2358 to be byte-swapped. */
2359 U128 *xmm = (U128 *)(fp_state + 160);
2361 vassert(host_is_little_endian());
2363 # define COPY_U128(_dst,_src) \
2364 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2365 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2366 while (0)
2368 COPY_U128( gst->guest_YMM0, xmm[0] );
2369 COPY_U128( gst->guest_YMM1, xmm[1] );
2370 COPY_U128( gst->guest_YMM2, xmm[2] );
2371 COPY_U128( gst->guest_YMM3, xmm[3] );
2372 COPY_U128( gst->guest_YMM4, xmm[4] );
2373 COPY_U128( gst->guest_YMM5, xmm[5] );
2374 COPY_U128( gst->guest_YMM6, xmm[6] );
2375 COPY_U128( gst->guest_YMM7, xmm[7] );
2376 COPY_U128( gst->guest_YMM8, xmm[8] );
2377 COPY_U128( gst->guest_YMM9, xmm[9] );
2378 COPY_U128( gst->guest_YMM10, xmm[10] );
2379 COPY_U128( gst->guest_YMM11, xmm[11] );
2380 COPY_U128( gst->guest_YMM12, xmm[12] );
2381 COPY_U128( gst->guest_YMM13, xmm[13] );
2382 COPY_U128( gst->guest_YMM14, xmm[14] );
2383 COPY_U128( gst->guest_YMM15, xmm[15] );
2385 # undef COPY_U128
2387 VexEmNote warnXMM
2388 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2389 VexEmNote warnX87
2390 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2392 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2393 if (warnX87 != EmNote_NONE)
2394 return warnX87;
2395 else
2396 return warnXMM;
2400 /*---------------------------------------------------------------*/
2401 /*--- Supporting functions for FSAVE/FRSTOR ---*/
2402 /*---------------------------------------------------------------*/
2404 /* DIRTY HELPER (writes guest state) */
2405 /* Initialise the x87 FPU state as per 'finit'. */
2406 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2408 Int i;
2409 gst->guest_FTOP = 0;
2410 for (i = 0; i < 8; i++) {
2411 gst->guest_FPTAG[i] = 0; /* empty */
2412 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2414 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2415 gst->guest_FC3210 = 0;
2419 /* CALLED FROM GENERATED CODE */
2420 /* DIRTY HELPER (reads guest memory) */
2421 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2423 ULong f64;
2424 convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2425 return f64;
2428 /* CALLED FROM GENERATED CODE */
2429 /* DIRTY HELPER (writes guest memory) */
2430 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2432 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2436 /* CALLED FROM GENERATED CODE */
2437 /* CLEAN HELPER */
2438 /* mxcsr[15:0] contains a SSE native format MXCSR value.
2439 Extract from it the required SSEROUND value and any resulting
2440 emulation warning, and return (warn << 32) | sseround value.
2441 */
2442 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2444 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
2445 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2446 ULong rmode = (mxcsr >> 13) & 3;
2448 /* Detect any required emulation warnings. */
2449 VexEmNote ew = EmNote_NONE;
2451 if ((mxcsr & 0x1F80) != 0x1F80) {
2452 /* unmasked exceptions! */
2453 ew = EmWarn_X86_sseExns;
2455 else
2456 if (mxcsr & (1<<15)) {
2457 /* FZ is set */
2458 ew = EmWarn_X86_fz;
2460 else
2461 if (mxcsr & (1<<6)) {
2462 /* DAZ is set */
2463 ew = EmWarn_X86_daz;
2466 return (((ULong)ew) << 32) | ((ULong)rmode);
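/* Sanity check: the power-on MXCSR value 0x1F80 (all exceptions
   masked, round-to-nearest) gives rmode 0 and no warning, i.e. a
   result of 0; 0x7F80 (the same, but RC = 11b, round-toward-zero)
   gives a result of 3. */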
2470 /* CALLED FROM GENERATED CODE */
2471 /* CLEAN HELPER */
2472 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2473 native format MXCSR value. */
2474 ULong amd64g_create_mxcsr ( ULong sseround )
2476 sseround &= 3;
2477 return 0x1F80 | (sseround << 13);
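/* E.g. sseround 0 (round-nearest) maps back to 0x1F80, the power-on
   default, and sseround 3 (round-toward-zero) maps to 0x7F80. */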
2481 /* CLEAN HELPER */
2482 /* fpucw[15:0] contains a x87 native format FPU control word.
2483 Extract from it the required FPROUND value and any resulting
2484 emulation warning, and return (warn << 32) | fpround value.
2485 */
2486 ULong amd64g_check_fldcw ( ULong fpucw )
2488 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2489 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2490 ULong rmode = (fpucw >> 10) & 3;
2492 /* Detect any required emulation warnings. */
2493 VexEmNote ew = EmNote_NONE;
2495 if ((fpucw & 0x3F) != 0x3F) {
2496 /* unmasked exceptions! */
2497 ew = EmWarn_X86_x87exns;
2499 else
2500 if (((fpucw >> 8) & 3) != 3) {
2501 /* unsupported precision */
2502 ew = EmWarn_X86_x87precision;
2505 return (((ULong)ew) << 32) | ((ULong)rmode);
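/* Sanity check: the 'finit' default control word 0x037F (all
   exceptions masked, 64-bit precision, round-to-nearest) gives rmode 0
   and no warning; 0x0F7F (the same, but RC = 11b) gives rmode 3, again
   with no warning. */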
2509 /* CLEAN HELPER */
2510 /* Given fpround as an IRRoundingMode value, create a suitable x87
2511 native format FPU control word. */
2512 ULong amd64g_create_fpucw ( ULong fpround )
2514 fpround &= 3;
2515 return 0x037F | (fpround << 10);
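/* E.g. fpround 0 gives the 'finit' default 0x037F, and fpround 3
   (round-toward-zero) gives 0x0F7F. */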
2519 /* This is used to implement 'fldenv'.
2520 Reads 28 bytes at x87_state[0 .. 27]. */
2521 /* CALLED FROM GENERATED CODE */
2522 /* DIRTY HELPER */
2523 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2524 /*IN*/HWord x87_state)
2526 return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2530 /* CALLED FROM GENERATED CODE */
2531 /* DIRTY HELPER */
2532 /* Create an x87 FPU env from the guest state, as close as we can
2533 approximate it. Writes 28 bytes at x87_state[0..27]. */
2534 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2535 /*OUT*/HWord x87_state )
2537 Int i, stno, preg;
2538 UInt tagw;
2539 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2540 Fpu_State* x87 = (Fpu_State*)x87_state;
2541 UInt ftop = vex_state->guest_FTOP;
2542 ULong c3210 = vex_state->guest_FC3210;
2544 for (i = 0; i < 14; i++)
2545 x87->env[i] = 0;
2547 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2548 x87->env[FP_ENV_STAT]
2549 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2550 x87->env[FP_ENV_CTRL]
2551 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2553 /* Compute the x87 tag word. */
2554 tagw = 0;
2555 for (stno = 0; stno < 8; stno++) {
2556 preg = (stno + ftop) & 7;
2557 if (vexTags[preg] == 0) {
2558 /* register is empty */
2559 tagw |= (3 << (2*preg));
2560 } else {
2561 /* register is full. */
2562 tagw |= (0 << (2*preg));
2565 x87->env[FP_ENV_TAG] = toUShort(tagw);
2567 /* We don't dump the x87 registers, tho. */
2571 /* This is used to implement 'fnsave'.
2572 Writes 108 bytes at x87_state[0 .. 107]. */
2573 /* CALLED FROM GENERATED CODE */
2574 /* DIRTY HELPER */
2575 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2576 /*OUT*/HWord x87_state)
2578 do_get_x87( vex_state, (Fpu_State*)x87_state );
2582 /* This is used to implement 'fnsaves'.
2583 Writes 94 bytes at x87_state[0 .. 93]. */
2584 /* CALLED FROM GENERATED CODE */
2585 /* DIRTY HELPER */
2586 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2587 /*OUT*/HWord x87_state)
2589 Int i, stno, preg;
2590 UInt tagw;
2591 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2592 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2593 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2594 UInt ftop = vex_state->guest_FTOP;
2595 UInt c3210 = vex_state->guest_FC3210;
2597 for (i = 0; i < 7; i++)
2598 x87->env[i] = 0;
2600 x87->env[FPS_ENV_STAT]
2601 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2602 x87->env[FPS_ENV_CTRL]
2603 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2605 /* Dump the register stack in ST order. */
2606 tagw = 0;
2607 for (stno = 0; stno < 8; stno++) {
2608 preg = (stno + ftop) & 7;
2609 if (vexTags[preg] == 0) {
2610 /* register is empty */
2611 tagw |= (3 << (2*preg));
2612 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2613 &x87->reg[10*stno] );
2614 } else {
2615 /* register is full. */
2616 tagw |= (0 << (2*preg));
2617 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2618 &x87->reg[10*stno] );
2621 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2625 /* This is used to implement 'frstor'.
2626 Reads 108 bytes at x87_state[0 .. 107]. */
2627 /* CALLED FROM GENERATED CODE */
2628 /* DIRTY HELPER */
2629 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2630 /*IN*/HWord x87_state)
2632 return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2636 /* This is used to implement 'frstors'.
2637 Reads 94 bytes at x87_state[0 .. 93]. */
2638 /* CALLED FROM GENERATED CODE */
2639 /* DIRTY HELPER */
2640 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2641 /*IN*/HWord x87_state)
2643 Int stno, preg;
2644 UInt tag;
2645 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2646 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2647 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2648 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2649 UInt tagw = x87->env[FPS_ENV_TAG];
2650 UInt fpucw = x87->env[FPS_ENV_CTRL];
2651 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2652 VexEmNote ew;
2653 UInt fpround;
2654 ULong pair;
2656 /* Copy registers and tags */
2657 for (stno = 0; stno < 8; stno++) {
2658 preg = (stno + ftop) & 7;
2659 tag = (tagw >> (2*preg)) & 3;
2660 if (tag == 3) {
2661 /* register is empty */
2662 /* hmm, if it's empty, does it still get written? Probably
2663 safer to say it does. If we don't, memcheck could get out
2664 of sync, in that it thinks all FP registers are defined by
2665 this helper, but in reality some have not been updated. */
2666 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2667 vexTags[preg] = 0;
2668 } else {
2669 /* register is non-empty */
2670 convert_f80le_to_f64le( &x87->reg[10*stno],
2671 (UChar*)&vexRegs[preg] );
2672 vexTags[preg] = 1;
2676 /* stack pointer */
2677 vex_state->guest_FTOP = ftop;
2679 /* status word */
2680 vex_state->guest_FC3210 = c3210;
2682 /* handle the control word, setting FPROUND and detecting any
2683 emulation warnings. */
2684 pair = amd64g_check_fldcw ( (ULong)fpucw );
2685 fpround = (UInt)pair & 0xFFFFFFFFULL;
2686 ew = (VexEmNote)(pair >> 32);
2688 vex_state->guest_FPROUND = fpround & 3;
2690 /* emulation warnings --> caller */
2691 return ew;
2695 /*---------------------------------------------------------------*/
2696 /*--- CPUID helpers. ---*/
2697 /*---------------------------------------------------------------*/
2699 /* Claim to be the following CPU, which is probably representative of
2700 the lowliest (earliest) amd64 offerings. It can do neither sse3
2701 nor cx16.
2703 vendor_id : AuthenticAMD
2704 cpu family : 15
2705 model : 5
2706 model name : AMD Opteron (tm) Processor 848
2707 stepping : 10
2708 cpu MHz : 1797.682
2709 cache size : 1024 KB
2710 fpu : yes
2711 fpu_exception : yes
2712 cpuid level : 1
2713 wp : yes
2714 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2715 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2716 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2717 bogomips : 3600.62
2718 TLB size : 1088 4K pages
2719 clflush size : 64
2720 cache_alignment : 64
2721 address sizes : 40 bits physical, 48 bits virtual
2722 power management: ts fid vid ttp
2724 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2725 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2726 and 3dnowext is 80000001.EDX.30.
2727 */
2728 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2730 # define SET_ABCD(_a,_b,_c,_d) \
2731 do { st->guest_RAX = (ULong)(_a); \
2732 st->guest_RBX = (ULong)(_b); \
2733 st->guest_RCX = (ULong)(_c); \
2734 st->guest_RDX = (ULong)(_d); \
2735 } while (0)
2737 switch (0xFFFFFFFF & st->guest_RAX) {
2738 case 0x00000000:
2739 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2740 break;
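/* (Leaf 0 spells out the vendor string: %ebx:%edx:%ecx =
   0x68747541 : 0x69746e65 : 0x444d4163 = "Auth" "enti" "cAMD".) */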
2741 case 0x00000001:
2742 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2743 break;
2744 case 0x80000000:
2745 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2746 break;
2747 case 0x80000001:
2748 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
2749 the original it-is-supported value that the h/w provides.
2750 See #291568. */
2751 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2752 0x21d3fbff);
2753 break;
2754 case 0x80000002:
2755 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2756 break;
2757 case 0x80000003:
2758 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2759 break;
2760 case 0x80000004:
2761 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2762 break;
2763 case 0x80000005:
2764 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2765 break;
2766 case 0x80000006:
2767 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2768 break;
2769 case 0x80000007:
2770 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2771 break;
2772 case 0x80000008:
2773 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2774 break;
2775 default:
2776 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2777 break;
2779 # undef SET_ABCD
2783 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2784 capable.
2786 vendor_id : GenuineIntel
2787 cpu family : 6
2788 model : 15
2789 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2790 stepping : 6
2791 cpu MHz : 2394.000
2792 cache size : 4096 KB
2793 physical id : 0
2794 siblings : 2
2795 core id : 0
2796 cpu cores : 2
2797 fpu : yes
2798 fpu_exception : yes
2799 cpuid level : 10
2800 wp : yes
2801 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2802 mtrr pge mca cmov pat pse36 clflush dts acpi
2803 mmx fxsr sse sse2 ss ht tm syscall nx lm
2804 constant_tsc pni monitor ds_cpl vmx est tm2
2805 cx16 xtpr lahf_lm
2806 bogomips : 4798.78
2807 clflush size : 64
2808 cache_alignment : 64
2809 address sizes : 36 bits physical, 48 bits virtual
2810 power management:
2811 */
2812 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2814 # define SET_ABCD(_a,_b,_c,_d) \
2815 do { st->guest_RAX = (ULong)(_a); \
2816 st->guest_RBX = (ULong)(_b); \
2817 st->guest_RCX = (ULong)(_c); \
2818 st->guest_RDX = (ULong)(_d); \
2819 } while (0)
2821 switch (0xFFFFFFFF & st->guest_RAX) {
2822 case 0x00000000:
2823 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2824 break;
2825 case 0x00000001:
2826 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2827 break;
2828 case 0x00000002:
2829 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2830 break;
2831 case 0x00000003:
2832 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2833 break;
2834 case 0x00000004: {
2835 switch (0xFFFFFFFF & st->guest_RCX) {
2836 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2837 0x0000003f, 0x00000001); break;
2838 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2839 0x0000003f, 0x00000001); break;
2840 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2841 0x00000fff, 0x00000001); break;
2842 default: SET_ABCD(0x00000000, 0x00000000,
2843 0x00000000, 0x00000000); break;
2845 break;
2847 case 0x00000005:
2848 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2849 break;
2850 case 0x00000006:
2851 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2852 break;
2853 case 0x00000007:
2854 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2855 break;
2856 case 0x00000008:
2857 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2858 break;
2859 case 0x00000009:
2860 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2861 break;
2862 case 0x0000000a:
2863 unhandled_eax_value:
2864 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2865 break;
2866 case 0x80000000:
2867 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2868 break;
2869 case 0x80000001:
2870 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2871 break;
2872 case 0x80000002:
2873 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2874 break;
2875 case 0x80000003:
2876 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2877 break;
2878 case 0x80000004:
2879 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2880 break;
2881 case 0x80000005:
2882 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2883 break;
2884 case 0x80000006:
2885 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2886 break;
2887 case 0x80000007:
2888 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2889 break;
2890 case 0x80000008:
2891 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2892 break;
2893 default:
2894 goto unhandled_eax_value;
2896 # undef SET_ABCD
2900 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2901 capable.
2903 vendor_id : GenuineIntel
2904 cpu family : 6
2905 model : 37
2906 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2907 stepping : 2
2908 cpu MHz : 3334.000
2909 cache size : 4096 KB
2910 physical id : 0
2911 siblings : 4
2912 core id : 0
2913 cpu cores : 2
2914 apicid : 0
2915 initial apicid : 0
2916 fpu : yes
2917 fpu_exception : yes
2918 cpuid level : 11
2919 wp : yes
2920 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2921 mtrr pge mca cmov pat pse36 clflush dts acpi
2922 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2923 lm constant_tsc arch_perfmon pebs bts rep_good
2924 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2925 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2926 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2927 arat tpr_shadow vnmi flexpriority ept vpid
2928 bogomips : 6957.57
2929 clflush size : 64
2930 cache_alignment : 64
2931 address sizes : 36 bits physical, 48 bits virtual
2932 power management:
2933 */
2934 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2936 # define SET_ABCD(_a,_b,_c,_d) \
2937 do { st->guest_RAX = (ULong)(_a); \
2938 st->guest_RBX = (ULong)(_b); \
2939 st->guest_RCX = (ULong)(_c); \
2940 st->guest_RDX = (ULong)(_d); \
2941 } while (0)
2943 UInt old_eax = (UInt)st->guest_RAX;
2944 UInt old_ecx = (UInt)st->guest_RCX;
2946 switch (old_eax) {
2947 case 0x00000000:
2948 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2949 break;
2950 case 0x00000001:
2951 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2952 break;
2953 case 0x00000002:
2954 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2955 break;
2956 case 0x00000003:
2957 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2958 break;
2959 case 0x00000004:
2960 switch (old_ecx) {
2961 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2962 0x0000003f, 0x00000000); break;
2963 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2964 0x0000007f, 0x00000000); break;
2965 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2966 0x000001ff, 0x00000000); break;
2967 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2968 0x00000fff, 0x00000002); break;
2969 default: SET_ABCD(0x00000000, 0x00000000,
2970 0x00000000, 0x00000000); break;
2972 break;
2973 case 0x00000005:
2974 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2975 break;
2976 case 0x00000006:
2977 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2978 break;
2979 case 0x00000007:
2980 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2981 break;
2982 case 0x00000008:
2983 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2984 break;
2985 case 0x00000009:
2986 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2987 break;
2988 case 0x0000000a:
2989 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2990 break;
2991 case 0x0000000b:
2992 switch (old_ecx) {
2993 case 0x00000000:
2994 SET_ABCD(0x00000001, 0x00000002,
2995 0x00000100, 0x00000000); break;
2996 case 0x00000001:
2997 SET_ABCD(0x00000004, 0x00000004,
2998 0x00000201, 0x00000000); break;
2999 default:
3000 SET_ABCD(0x00000000, 0x00000000,
3001 old_ecx, 0x00000000); break;
3003 break;
3004 case 0x0000000c:
3005 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3006 break;
3007 case 0x0000000d:
3008 switch (old_ecx) {
3009 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3010 0x00000100, 0x00000000); break;
3011 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3012 0x00000201, 0x00000000); break;
3013 default: SET_ABCD(0x00000000, 0x00000000,
3014 old_ecx, 0x00000000); break;
3016 break;
3017 case 0x80000000:
3018 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3019 break;
3020 case 0x80000001:
3021 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3022 break;
3023 case 0x80000002:
3024 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3025 break;
3026 case 0x80000003:
3027 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3028 break;
3029 case 0x80000004:
3030 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3031 break;
3032 case 0x80000005:
3033 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3034 break;
3035 case 0x80000006:
3036 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3037 break;
3038 case 0x80000007:
3039 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3040 break;
3041 case 0x80000008:
3042 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3043 break;
3044 default:
3045 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3046 break;
3048 # undef SET_ABCD
3052 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3053 capable. Plus (kludge!) it "supports" HTM.
3055 Also with the following change: claim that XSaveOpt is not
3056 available: cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
3057 on the real CPU. Consequently, programs that correctly observe
3058 these CPUID values should only try to use 3 of the 8 XSave-family
3059 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3060 having to implement the compacted or optimised save/restore
3061 variants.
3063 vendor_id : GenuineIntel
3064 cpu family : 6
3065 model : 42
3066 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3067 stepping : 7
3068 cpu MHz : 1600.000
3069 cache size : 6144 KB
3070 physical id : 0
3071 siblings : 4
3072 core id : 3
3073 cpu cores : 4
3074 apicid : 6
3075 initial apicid : 6
3076 fpu : yes
3077 fpu_exception : yes
3078 cpuid level : 13
3079 wp : yes
3080 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3081 mtrr pge mca cmov pat pse36 clflush dts acpi
3082 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3083 lm constant_tsc arch_perfmon pebs bts rep_good
3084 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3085 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3086 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3087 lahf_lm ida arat epb xsaveopt pln pts dts
3088 tpr_shadow vnmi flexpriority ept vpid
3090 bogomips : 5768.94
3091 clflush size : 64
3092 cache_alignment : 64
3093 address sizes : 36 bits physical, 48 bits virtual
3094 power management:
3095 */
3096 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
3098 # define SET_ABCD(_a,_b,_c,_d) \
3099 do { st->guest_RAX = (ULong)(_a); \
3100 st->guest_RBX = (ULong)(_b); \
3101 st->guest_RCX = (ULong)(_c); \
3102 st->guest_RDX = (ULong)(_d); \
3103 } while (0)
3105 UInt old_eax = (UInt)st->guest_RAX;
3106 UInt old_ecx = (UInt)st->guest_RCX;
3108 switch (old_eax) {
3109 case 0x00000000:
3110 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3111 break;
3112 case 0x00000001:
3113 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
3114 break;
3115 case 0x00000002:
3116 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3117 break;
3118 case 0x00000003:
3119 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3120 break;
3121 case 0x00000004:
3122 switch (old_ecx) {
3123 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3124 0x0000003f, 0x00000000); break;
3125 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3126 0x0000003f, 0x00000000); break;
3127 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3128 0x000001ff, 0x00000000); break;
3129 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3130 0x00001fff, 0x00000006); break;
3131 default: SET_ABCD(0x00000000, 0x00000000,
3132 0x00000000, 0x00000000); break;
3134 break;
3135 case 0x00000005:
3136 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3137 break;
3138 case 0x00000006:
3139 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3140 break;
3141 case 0x00000007:
3142 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
3143 break;
3144 case 0x00000008:
3145 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3146 break;
3147 case 0x00000009:
3148 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3149 break;
3150 case 0x0000000a:
3151 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3152 break;
3153 case 0x0000000b:
3154 switch (old_ecx) {
3155 case 0x00000000:
3156 SET_ABCD(0x00000001, 0x00000001,
3157 0x00000100, 0x00000000); break;
3158 case 0x00000001:
3159 SET_ABCD(0x00000004, 0x00000004,
3160 0x00000201, 0x00000000); break;
3161 default:
3162 SET_ABCD(0x00000000, 0x00000000,
3163 old_ecx, 0x00000000); break;
3165 break;
3166 case 0x0000000c:
3167 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3168 break;
3169 case 0x0000000d:
3170 switch (old_ecx) {
3171 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3172 0x00000340, 0x00000000); break;
3173 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3174 0x00000000, 0x00000000); break;
3175 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3176 0x00000000, 0x00000000); break;
3177 default: SET_ABCD(0x00000000, 0x00000000,
3178 0x00000000, 0x00000000); break;
3180 break;
3181 case 0x0000000e:
3182 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3183 break;
3184 case 0x0000000f:
3185 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3186 break;
3187 case 0x80000000:
3188 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3189 break;
3190 case 0x80000001:
3191 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3192 break;
3193 case 0x80000002:
3194 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3195 break;
3196 case 0x80000003:
3197 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3198 break;
3199 case 0x80000004:
3200 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3201 break;
3202 case 0x80000005:
3203 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3204 break;
3205 case 0x80000006:
3206 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3207 break;
3208 case 0x80000007:
3209 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3210 break;
3211 case 0x80000008:
3212 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3213 break;
3214 default:
3215 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3216 break;
3218 # undef SET_ABCD
3222 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3224 With the following change: claim that XSaveOpt is not available:
3225 cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
3226 CPU. Consequently, programs that correctly observe these CPUID
3227 values should only try to use 3 of the 8 XSave-family instructions:
3228 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3229 implement the compacted or optimised save/restore variants.
3231 vendor_id : GenuineIntel
3232 cpu family : 6
3233 model : 60
3234 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3235 stepping : 3
3236 microcode : 0x1c
3237 cpu MHz : 919.957
3238 cache size : 8192 KB
3239 physical id : 0
3240 siblings : 4
3241 core id : 3
3242 cpu cores : 4
3243 apicid : 6
3244 initial apicid : 6
3245 fpu : yes
3246 fpu_exception : yes
3247 cpuid level : 13
3248 wp : yes
3249 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3250 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3251 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3252 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3253 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3254 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3255 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3256 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3257 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3258 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3259 bugs :
3260 bogomips : 5786.68
3261 clflush size : 64
3262 cache_alignment : 64
3263 address sizes : 39 bits physical, 48 bits virtual
3264 power management:
3265 */
3266 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
3268 # define SET_ABCD(_a,_b,_c,_d) \
3269 do { st->guest_RAX = (ULong)(_a); \
3270 st->guest_RBX = (ULong)(_b); \
3271 st->guest_RCX = (ULong)(_c); \
3272 st->guest_RDX = (ULong)(_d); \
3273 } while (0)
3275 UInt old_eax = (UInt)st->guest_RAX;
3276 UInt old_ecx = (UInt)st->guest_RCX;
3278 switch (old_eax) {
3279 case 0x00000000:
3280 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3281 break;
3282 case 0x00000001:
3283 /* Don't advertise RDRAND support, bit 30 in ECX. */
3284 SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff);
3285 break;
3286 case 0x00000002:
3287 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3288 break;
3289 case 0x00000003:
3290 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3291 break;
3292 case 0x00000004:
3293 switch (old_ecx) {
3294 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3295 0x0000003f, 0x00000000); break;
3296 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3297 0x0000003f, 0x00000000); break;
3298 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3299 0x000001ff, 0x00000000); break;
3300 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3301 0x00001fff, 0x00000006); break;
3302 default: SET_ABCD(0x00000000, 0x00000000,
3303 0x00000000, 0x00000000); break;
3305 break;
3306 case 0x00000005:
3307 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3308 break;
3309 case 0x00000006:
3310 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3311 break;
3312 case 0x00000007:
3313 switch (old_ecx) {
3314 case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
3315 0x00000000, 0x00000000); break;
3316 default: SET_ABCD(0x00000000, 0x00000000,
3317 0x00000000, 0x00000000); break;
3319 break;
3320 case 0x00000008:
3321 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3322 break;
3323 case 0x00000009:
3324 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3325 break;
3326 case 0x0000000a:
3327 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3328 break;
3329 case 0x0000000b:
3330 switch (old_ecx) {
3331 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3332 0x00000100, 0x00000002); break;
3333 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3334 0x00000201, 0x00000002); break;
3335 default: SET_ABCD(0x00000000, 0x00000000,
3336 old_ecx, 0x00000002); break;
3338 break;
3339 case 0x0000000c:
3340 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3341 break;
3342 case 0x0000000d:
3343 switch (old_ecx) {
3344 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3345 0x00000340, 0x00000000); break;
3346 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3347 0x00000000, 0x00000000); break;
3348 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3349 0x00000000, 0x00000000); break;
3350 default: SET_ABCD(0x00000000, 0x00000000,
3351 0x00000000, 0x00000000); break;
3353 break;
3354 case 0x80000000:
3355 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3356 break;
3357 case 0x80000001:
3358 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3359 break;
3360 case 0x80000002:
3361 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3362 break;
3363 case 0x80000003:
3364 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3365 break;
3366 case 0x80000004:
3367 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3368 break;
3369 case 0x80000005:
3370 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3371 break;
3372 case 0x80000006:
3373 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3374 break;
3375 case 0x80000007:
3376 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3377 break;
3378 case 0x80000008:
3379 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3380 break;
3381 default:
3382 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3383 break;
3385 # undef SET_ABCD
3389 /*---------------------------------------------------------------*/
3390 /*--- Misc integer helpers, including rotates and crypto. ---*/
3391 /*---------------------------------------------------------------*/
3393 ULong amd64g_calculate_RCR ( ULong arg,
3394 ULong rot_amt,
3395 ULong rflags_in,
3396 Long szIN )
3398 Bool wantRflags = toBool(szIN < 0);
3399 ULong sz = wantRflags ? (-szIN) : szIN;
3400 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3401 ULong cf=0, of=0, tempcf;
3403 switch (sz) {
3404 case 8:
3405 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3406 of = ((arg >> 63) ^ cf) & 1;
3407 while (tempCOUNT > 0) {
3408 tempcf = arg & 1;
3409 arg = (arg >> 1) | (cf << 63);
3410 cf = tempcf;
3411 tempCOUNT--;
3413 break;
3414 case 4:
3415 while (tempCOUNT >= 33) tempCOUNT -= 33;
3416 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3417 of = ((arg >> 31) ^ cf) & 1;
3418 while (tempCOUNT > 0) {
3419 tempcf = arg & 1;
3420 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3421 cf = tempcf;
3422 tempCOUNT--;
3424 break;
3425 case 2:
3426 while (tempCOUNT >= 17) tempCOUNT -= 17;
3427 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3428 of = ((arg >> 15) ^ cf) & 1;
3429 while (tempCOUNT > 0) {
3430 tempcf = arg & 1;
3431 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3432 cf = tempcf;
3433 tempCOUNT--;
3435 break;
3436 case 1:
3437 while (tempCOUNT >= 9) tempCOUNT -= 9;
3438 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3439 of = ((arg >> 7) ^ cf) & 1;
3440 while (tempCOUNT > 0) {
3441 tempcf = arg & 1;
3442 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
3443 cf = tempcf;
3444 tempCOUNT--;
3446 break;
3447 default:
3448 vpanic("calculate_RCR(amd64g): invalid size");
3451 cf &= 1;
3452 of &= 1;
3453 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3454 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3456 /* caller can ask to have back either the resulting flags or
3457 resulting value, but not both */
3458 return wantRflags ? rflags_in : arg;
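/* Worked example: an 8-bit RCR of 0x01 by one place with CF clear on
   entry gives result 0x00 with CF = 1, and OF = (old MSB ^ old CF)
   = 0, matching the hardware definition of RCR. */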
3461 ULong amd64g_calculate_RCL ( ULong arg,
3462 ULong rot_amt,
3463 ULong rflags_in,
3464 Long szIN )
3466 Bool wantRflags = toBool(szIN < 0);
3467 ULong sz = wantRflags ? (-szIN) : szIN;
3468 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3469 ULong cf=0, of=0, tempcf;
3471 switch (sz) {
3472 case 8:
3473 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3474 while (tempCOUNT > 0) {
3475 tempcf = (arg >> 63) & 1;
3476 arg = (arg << 1) | (cf & 1);
3477 cf = tempcf;
3478 tempCOUNT--;
3480 of = ((arg >> 63) ^ cf) & 1;
3481 break;
3482 case 4:
3483 while (tempCOUNT >= 33) tempCOUNT -= 33;
3484 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3485 while (tempCOUNT > 0) {
3486 tempcf = (arg >> 31) & 1;
3487 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3488 cf = tempcf;
3489 tempCOUNT--;
3491 of = ((arg >> 31) ^ cf) & 1;
3492 break;
3493 case 2:
3494 while (tempCOUNT >= 17) tempCOUNT -= 17;
3495 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3496 while (tempCOUNT > 0) {
3497 tempcf = (arg >> 15) & 1;
3498 arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
3499 cf = tempcf;
3500 tempCOUNT--;
3502 of = ((arg >> 15) ^ cf) & 1;
3503 break;
3504 case 1:
3505 while (tempCOUNT >= 9) tempCOUNT -= 9;
3506 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3507 while (tempCOUNT > 0) {
3508 tempcf = (arg >> 7) & 1;
3509 arg = 0xFFULL & ((arg << 1) | (cf & 1));
3510 cf = tempcf;
3511 tempCOUNT--;
3513 of = ((arg >> 7) ^ cf) & 1;
3514 break;
3515 default:
3516 vpanic("calculate_RCL(amd64g): invalid size");
3519 cf &= 1;
3520 of &= 1;
3521 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3522 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3524 return wantRflags ? rflags_in : arg;
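/* Worked example: an 8-bit RCL of 0x80 by one place with CF clear on
   entry gives result 0x00 with CF = 1, and OF = (result MSB ^ new CF)
   = 1. */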
3527 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3528 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3529 */
3530 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3532 ULong hi, lo, tmp, A[16];
3534 A[0] = 0; A[1] = a;
3535 A[2] = A[1] << 1; A[3] = A[2] ^ a;
3536 A[4] = A[2] << 1; A[5] = A[4] ^ a;
3537 A[6] = A[3] << 1; A[7] = A[6] ^ a;
3538 A[8] = A[4] << 1; A[9] = A[8] ^ a;
3539 A[10] = A[5] << 1; A[11] = A[10] ^ a;
3540 A[12] = A[6] << 1; A[13] = A[12] ^ a;
3541 A[14] = A[7] << 1; A[15] = A[14] ^ a;
3543 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3544 hi = lo >> 56;
3545 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3546 hi = (hi << 8) | (lo >> 56);
3547 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3548 hi = (hi << 8) | (lo >> 56);
3549 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3550 hi = (hi << 8) | (lo >> 56);
3551 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3552 hi = (hi << 8) | (lo >> 56);
3553 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3554 hi = (hi << 8) | (lo >> 56);
3555 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3556 hi = (hi << 8) | (lo >> 56);
3557 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3559 ULong m0 = -1;
3560 m0 /= 255;
3561 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3562 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3563 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3564 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3565 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3566 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3567 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3569 return which ? hi : lo;
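/* Carry-less multiply sanity check: with a = b = 0x3 (the polynomial
   x + 1), (x + 1)^2 = x^2 + 1 over GF(2), so the low half ("which"
   == 0) is 0x5 and the high half is 0. */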
3573 /* CALLED FROM GENERATED CODE */
3574 /* DIRTY HELPER (non-referentially-transparent) */
3575 /* Horrible hack. On non-amd64 platforms, return 1. */
3576 ULong amd64g_dirtyhelper_RDTSC ( void )
3578 # if defined(__x86_64__)
3579 UInt eax, edx;
3580 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3581 return (((ULong)edx) << 32) | ((ULong)eax);
3582 # else
3583 return 1ULL;
3584 # endif
3587 /* CALLED FROM GENERATED CODE */
3588 /* DIRTY HELPER (non-referentially-transparent) */
3589 /* Horrible hack. On non-amd64 platforms, return 1. */
3590 /* This uses a different calling convention from _RDTSC just above
3591 only because of the difficulty of returning 96 bits from a C
3592 function -- RDTSC returns 64 bits and so is simple by comparison,
3593 on amd64. */
3594 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3596 # if defined(__x86_64__)
3597 UInt eax, ecx, edx;
3598 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3599 st->guest_RAX = (ULong)eax;
3600 st->guest_RCX = (ULong)ecx;
3601 st->guest_RDX = (ULong)edx;
3602 # else
3603 /* Do nothing. */
3604 # endif
3607 /* CALLED FROM GENERATED CODE */
3608 /* DIRTY HELPER (non-referentially-transparent) */
3609 /* Horrible hack. On non-amd64 platforms, return 0. */
3610 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3612 # if defined(__x86_64__)
3613 ULong r = 0;
3614 portno &= 0xFFFF;
3615 switch (sz) {
3616 case 4:
3617 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3618 : "=a" (r) : "Nd" (portno));
3619 break;
3620 case 2:
3621 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3622 : "=a" (r) : "Nd" (portno));
3623 break;
3624 case 1:
3625 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3626 : "=a" (r) : "Nd" (portno));
3627 break;
3628 default:
3629 break; /* note: no 64-bit version of insn exists */
3631 return r;
3632 # else
3633 return 0;
3634 # endif
3638 /* CALLED FROM GENERATED CODE */
3639 /* DIRTY HELPER (non-referentially-transparent) */
3640 /* Horrible hack. On non-amd64 platforms, do nothing. */
3641 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3643 # if defined(__x86_64__)
3644 portno &= 0xFFFF;
3645 switch (sz) {
3646 case 4:
3647 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3648 : : "a" (data), "Nd" (portno));
3649 break;
3650 case 2:
3651 __asm__ __volatile__("outw %w0, %w1"
3652 : : "a" (data), "Nd" (portno));
3653 break;
3654 case 1:
3655 __asm__ __volatile__("outb %b0, %w1"
3656 : : "a" (data), "Nd" (portno));
3657 break;
3658 default:
3659 break; /* note: no 64-bit version of insn exists */
3661 # else
3662 /* do nothing */
3663 # endif
3666 /* CALLED FROM GENERATED CODE */
3667 /* DIRTY HELPER (non-referentially-transparent) */
3668 /* Horrible hack. On non-amd64 platforms, do nothing. */
3669 /* op = 0: call the native SGDT instruction.
3670 op = 1: call the native SIDT instruction.
3671 */
3672 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3673 # if defined(__x86_64__)
3674 switch (op) {
3675 case 0:
3676 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3677 break;
3678 case 1:
3679 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3680 break;
3681 default:
3682 vpanic("amd64g_dirtyhelper_SxDT");
3684 # else
3685 /* Fake up a defined result: zero the 10 bytes a real SGDT/SIDT would write. */
3686 UChar* p = (UChar*)address;
3687 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3688 p[6] = p[7] = p[8] = p[9] = 0;
3689 # endif
3692 /*---------------------------------------------------------------*/
3693 /*--- Helpers for MMX/SSE/SSE2. ---*/
3694 /*---------------------------------------------------------------*/
3696 static inline UChar abdU8 ( UChar xx, UChar yy ) {
3697 return toUChar(xx>yy ? xx-yy : yy-xx);
3700 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3701 return (((ULong)w1) << 32) | ((ULong)w0);
3704 static inline UShort sel16x4_3 ( ULong w64 ) {
3705 UInt hi32 = toUInt(w64 >> 32);
3706 return toUShort(hi32 >> 16);
3708 static inline UShort sel16x4_2 ( ULong w64 ) {
3709 UInt hi32 = toUInt(w64 >> 32);
3710 return toUShort(hi32);
3712 static inline UShort sel16x4_1 ( ULong w64 ) {
3713 UInt lo32 = toUInt(w64);
3714 return toUShort(lo32 >> 16);
3716 static inline UShort sel16x4_0 ( ULong w64 ) {
3717 UInt lo32 = toUInt(w64);
3718 return toUShort(lo32);
3721 static inline UChar sel8x8_7 ( ULong w64 ) {
3722 UInt hi32 = toUInt(w64 >> 32);
3723 return toUChar(hi32 >> 24);
3725 static inline UChar sel8x8_6 ( ULong w64 ) {
3726 UInt hi32 = toUInt(w64 >> 32);
3727 return toUChar(hi32 >> 16);
3729 static inline UChar sel8x8_5 ( ULong w64 ) {
3730 UInt hi32 = toUInt(w64 >> 32);
3731 return toUChar(hi32 >> 8);
3733 static inline UChar sel8x8_4 ( ULong w64 ) {
3734 UInt hi32 = toUInt(w64 >> 32);
3735 return toUChar(hi32 >> 0);
3737 static inline UChar sel8x8_3 ( ULong w64 ) {
3738 UInt lo32 = toUInt(w64);
3739 return toUChar(lo32 >> 24);
3741 static inline UChar sel8x8_2 ( ULong w64 ) {
3742 UInt lo32 = toUInt(w64);
3743 return toUChar(lo32 >> 16);
3745 static inline UChar sel8x8_1 ( ULong w64 ) {
3746 UInt lo32 = toUInt(w64);
3747 return toUChar(lo32 >> 8);
3749 static inline UChar sel8x8_0 ( ULong w64 ) {
3750 UInt lo32 = toUInt(w64);
3751 return toUChar(lo32 >> 0);
3754 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3755 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3757 return
3758 mk32x2(
3759 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3760 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3761 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3762 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3766 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3767 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3769 UInt t = 0;
3770 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3771 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3772 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3773 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3774 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3775 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3776 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3777 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3778 t &= 0xFFFF;
3779 return (ULong)t;
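/* Illustrative self-checks, not called from generated code: the two
   helpers above with operands chosen so the results can be verified by
   hand.  For PMADDWD, lanes of 2 against lanes of 3 give 2*3 + 2*3 = 12
   in each 32-bit half; for PSADBW, bytes of 5 against bytes of 2 give
   eight absolute differences of 3, i.e. 24. */
#if 0
static void example_pmaddwd_psadbw_checks ( void )
{
   vassert(amd64g_calculate_mmx_pmaddwd(0x0002000200020002ULL,
                                        0x0003000300030003ULL)
           == 0x0000000C0000000CULL);
   vassert(amd64g_calculate_mmx_psadbw(0x0505050505050505ULL,
                                       0x0202020202020202ULL)
           == 24ULL);
}
#endif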
3782 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3783 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3785 UShort t, min;
3786 UInt idx;
3787 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
3788 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3789 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3790 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3791 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3792 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3793 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3794 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3795 return ((ULong)(idx << 16)) | ((ULong)min);
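/* Illustrative self-check, not called from generated code: PHMINPOSUW
   scans the eight unsigned 16-bit lanes (sLo holds lanes 0..3, sHi
   lanes 4..7) and returns the minimum in bits 15:0 with its lane index
   in bits 18:16.  Here the smallest value, 1, sits in lane 7. */
#if 0
static void example_phminposuw_check ( void )
{
   vassert(amd64g_calculate_sse_phminposuw(0x0008000700060005ULL,
                                           0x0001000200030004ULL)
           == 0x00070001ULL);
}
#endif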
3798 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3799 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3801 UInt i;
3802 ULong crc = (b & 0xFFULL) ^ crcIn;
3803 for (i = 0; i < 8; i++)
3804 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3805 return crc;
3808 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3809 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3811 UInt i;
3812 ULong crc = (w & 0xFFFFULL) ^ crcIn;
3813 for (i = 0; i < 16; i++)
3814 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3815 return crc;
3818 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3819 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3821 UInt i;
3822 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3823 for (i = 0; i < 32; i++)
3824 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3825 return crc;
3828 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3829 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3831 ULong crc = amd64g_calc_crc32l(crcIn, q);
3832 return amd64g_calc_crc32l(crc, q >> 32);
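/* Illustrative sketch only, not part of the helper set: computing a
   conventional CRC-32C over a byte buffer with amd64g_calc_crc32b.  As
   with the real CRC32 instruction, the 0xFFFFFFFF pre- and
   post-inversion is the caller's responsibility.  The value quoted in
   the comment is the standard CRC-32C check result for the ASCII
   string "123456789". */
#if 0
static ULong example_crc32c_buffer ( const UChar* buf, UInt len )
{
   ULong crc = 0xFFFFFFFFULL;
   UInt  i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc ^ 0xFFFFFFFFULL;   /* "123456789" -> 0xE3069283 */
}
#endif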
3836 /* .. helper for next fn .. */
3837 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3839 UInt t = 0;
3840 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3841 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3842 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3843 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3844 return (ULong)t;
3847 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3848 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3849 ULong dHi, ULong dLo,
3850 ULong imm_and_return_control_bit )
3852 UInt imm8 = imm_and_return_control_bit & 7;
3853 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
3854 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3855 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3856 /* For src we only need 32 bits, so get them into the
3857 lower half of a 64 bit word. */
3858 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3859 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3860 11 bytes. If calculating the low part of the result, need bytes
3861 dstOffsL * 4 + (0 .. 6); if calculating the high part,
3862 dstOffsL * 4 + (4 .. 10). */
3863 ULong dst;
3864 /* dstOffsL = 0, Lo -> 0 .. 6
3865 dstOffsL = 1, Lo -> 4 .. 10
3866 dstOffsL = 0, Hi -> 4 .. 10
3867 dstOffsL = 1, Hi -> 8 .. 14 */
3869 if (calcHi && dstOffsL) {
3870 /* 8 .. 14 */
3871 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3873 else if (!calcHi && !dstOffsL) {
3874 /* 0 .. 6 */
3875 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3877 else {
3878 /* 4 .. 10 */
3879 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3881 ULong r0 = sad_8x4( dst >> 0, src );
3882 ULong r1 = sad_8x4( dst >> 8, src );
3883 ULong r2 = sad_8x4( dst >> 16, src );
3884 ULong r3 = sad_8x4( dst >> 24, src );
3885 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3886 return res;
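/* Illustrative self-check, not called from generated code: with
   imm8 = 0 the source is the low dword of sLo and the destination
   window is bytes 0..6 of the low destination half.  An all-zero
   destination against an all-0x01 source dword makes each of the four
   SADs equal to 4. */
#if 0
static void example_mpsadbw_check ( void )
{
   vassert(amd64g_calc_mpsadbw(0, 0x01010101ULL, 0, 0, 0)
           == 0x0004000400040004ULL);
}
#endif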
3889 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3890 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3892 ULong dst = 0;
3893 ULong src_bit;
3894 ULong dst_bit = 1;
3895 for (src_bit = 1; src_bit; src_bit <<= 1) {
3896 if (mask & src_bit) {
3897 if (src_masked & src_bit) dst |= dst_bit;
3898 dst_bit <<= 1;
3901 return dst;
3904 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3905 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3907 ULong dst = 0;
3908 ULong dst_bit;
3909 ULong src_bit = 1;
3910 for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3911 if (mask & dst_bit) {
3912 if (src & src_bit) dst |= dst_bit;
3913 src_bit <<= 1;
3916 return dst;
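/* Illustrative self-checks, not called from generated code: PEXT
   gathers the source bits selected by the mask and packs them at the
   bottom, while PDEP scatters low-order source bits back into the mask
   positions.  With mask 0b11001010 (bits 1,3,6,7) the two operations
   are inverses of each other on the masked value. */
#if 0
static void example_pext_pdep_checks ( void )
{
   vassert(amd64g_calculate_pext(0x4AULL /*0b01001010*/,
                                 0xCAULL /*0b11001010*/) == 0x7ULL);
   vassert(amd64g_calculate_pdep(0x7ULL, 0xCAULL) == 0x4AULL);
}
#endif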
3919 /*---------------------------------------------------------------*/
3920 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
3921 /*---------------------------------------------------------------*/
3923 static UInt zmask_from_V128 ( V128* arg )
3925 UInt i, res = 0;
3926 for (i = 0; i < 16; i++) {
3927 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
3929 return res;
3932 static UInt zmask_from_V128_wide ( V128* arg )
3934 UInt i, res = 0;
3935 for (i = 0; i < 8; i++) {
3936 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
3938 return res;
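/* Illustrative self-check, not called from generated code: the zmasks
   above have one bit per lane, set exactly when that lane is zero.  A
   vector whose only non-zero byte is lane 0 therefore gives 0xFFFE in
   the byte case and 0xFE in the 16-bit case. */
#if 0
static void example_zmask_checks ( void )
{
   V128 v;
   UInt i;
   for (i = 0; i < 16; i++) v.w8[i] = 0;
   v.w8[0] = 'a';
   vassert(zmask_from_V128(&v) == 0xFFFE);
   vassert(zmask_from_V128_wide(&v) == 0xFE);
}
#endif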
3941 /* Helps with PCMP{I,E}STR{I,M}.
3943 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
3944 actually it could be a clean helper, but for the fact that we can't
3945 pass by value 2 x V128 to a clean helper, nor have one returned.)
3946 Reads guest state, writes to guest state for the xSTRM cases, no
3947 accesses of memory, is a pure function.
3949 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3950 the callee knows which I/E and I/M variant it is dealing with and
3951 what the specific operation is. 4th byte of opcode is in the range
3952 0x60 to 0x63:
3953 istri 66 0F 3A 63
3954 istrm 66 0F 3A 62
3955 estri 66 0F 3A 61
3956 estrm 66 0F 3A 60
3958 gstOffL and gstOffR are the guest state offsets for the two XMM
3959 register inputs. We never have to deal with the memory case since
3960 that is handled by pre-loading the relevant value into the fake
3961 XMM16 register.
3963 For ESTRx variants, edxIN and eaxIN hold the values of those two
3964 registers.
3966 In all cases, the bottom 16 bits of the result contain the new
3967 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
3968 result hold the new %ecx value. For xSTRM variants, the helper
3969 writes the result directly to the guest XMM0.
3971 Declarable side effects: in all cases, reads guest state at
3972 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
3973 guest_XMM0.
3975 Is expected to be called with opc_and_imm combinations which have
3976 actually been validated, and will assert if otherwise. The front
3977    end should ensure we're only called with verified values. */
3979 ULong amd64g_dirtyhelper_PCMPxSTRx (
3980 VexGuestAMD64State* gst,
3981 HWord opc4_and_imm,
3982 HWord gstOffL, HWord gstOffR,
3983 HWord edxIN, HWord eaxIN
3986 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3987 HWord imm8 = opc4_and_imm & 0xFF;
3988 HWord isISTRx = opc4 & 2;
3989 HWord isxSTRM = (opc4 & 1) ^ 1;
3990 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3991 HWord wide = (imm8 & 1);
3993 // where the args are
3994 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3995 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3997 /* Create the arg validity masks, either from the vectors
3998 themselves or from the supplied edx/eax values. */
3999 // FIXME: this is only right for the 8-bit data cases.
4000 // At least that is asserted above.
4001 UInt zmaskL, zmaskR;
4003 // temp spot for the resulting flags and vector.
4004 V128 resV;
4005 UInt resOSZACP;
4007 // for checking whether case was handled
4008 Bool ok = False;
4010 if (wide) {
4011 if (isISTRx) {
4012 zmaskL = zmask_from_V128_wide(argL);
4013 zmaskR = zmask_from_V128_wide(argR);
4014 } else {
4015 Int tmp;
4016 tmp = edxIN & 0xFFFFFFFF;
4017 if (tmp < -8) tmp = -8;
4018 if (tmp > 8) tmp = 8;
4019 if (tmp < 0) tmp = -tmp;
4020 vassert(tmp >= 0 && tmp <= 8);
4021 zmaskL = (1 << tmp) & 0xFF;
4022 tmp = eaxIN & 0xFFFFFFFF;
4023 if (tmp < -8) tmp = -8;
4024 if (tmp > 8) tmp = 8;
4025 if (tmp < 0) tmp = -tmp;
4026 vassert(tmp >= 0 && tmp <= 8);
4027 zmaskR = (1 << tmp) & 0xFF;
4029 // do the math
4030 ok = compute_PCMPxSTRx_wide (
4031 &resV, &resOSZACP, argL, argR,
4032 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4034 } else {
4035 if (isISTRx) {
4036 zmaskL = zmask_from_V128(argL);
4037 zmaskR = zmask_from_V128(argR);
4038 } else {
4039 Int tmp;
4040 tmp = edxIN & 0xFFFFFFFF;
4041 if (tmp < -16) tmp = -16;
4042 if (tmp > 16) tmp = 16;
4043 if (tmp < 0) tmp = -tmp;
4044 vassert(tmp >= 0 && tmp <= 16);
4045 zmaskL = (1 << tmp) & 0xFFFF;
4046 tmp = eaxIN & 0xFFFFFFFF;
4047 if (tmp < -16) tmp = -16;
4048 if (tmp > 16) tmp = 16;
4049 if (tmp < 0) tmp = -tmp;
4050 vassert(tmp >= 0 && tmp <= 16);
4051 zmaskR = (1 << tmp) & 0xFFFF;
4053 // do the math
4054 ok = compute_PCMPxSTRx (
4055 &resV, &resOSZACP, argL, argR,
4056 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4060 // front end shouldn't pass us any imm8 variants we can't
4061 // handle. Hence:
4062 vassert(ok);
4064 // So, finally we need to get the results back to the caller.
4065 // In all cases, the new OSZACP value is the lowest 16 of
4066 // the return value.
4067 if (isxSTRM) {
4068 gst->guest_YMM0[0] = resV.w32[0];
4069 gst->guest_YMM0[1] = resV.w32[1];
4070 gst->guest_YMM0[2] = resV.w32[2];
4071 gst->guest_YMM0[3] = resV.w32[3];
4072 return resOSZACP & 0x8D5;
4073 } else {
4074 UInt newECX = resV.w32[0] & 0xFFFF;
4075 return (newECX << 16) | (resOSZACP & 0x8D5);
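/* Illustrative sketch only: how a caller of the xSTRI variants might
   unpack the combined return value described in the big comment above
   -- OSZACP flag bits in 15:0, the new %ecx value in 31:16.  The
   example_ name and out-parameters are hypothetical. */
#if 0
static void example_unpack_xstri_result ( ULong ret,
                                          /*OUT*/UInt* newECX,
                                          /*OUT*/UInt* newOSZACP )
{
   *newOSZACP = (UInt)(ret & 0xFFFFULL);
   *newECX    = (UInt)((ret >> 16) & 0xFFFFULL);
}
#endif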
4079 /*---------------------------------------------------------------*/
4080 /*--- AES primitives and helpers ---*/
4081 /*---------------------------------------------------------------*/
4082 /* a 16 x 16 matrix */
4083 static const UChar sbox[256] = { // row nr
4084 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4085 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4086 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4087 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4088 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4089 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4090 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4091 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4092 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4093 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4094 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4095 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4096 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4097 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4098 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4099 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4100 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4101 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4102 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4103 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4104 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4105 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4106 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4107 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4108 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4109 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4110 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4111 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4112 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4113 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4114 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4115 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4117 static void SubBytes (V128* v)
4119 V128 r;
4120 UInt i;
4121 for (i = 0; i < 16; i++)
4122 r.w8[i] = sbox[v->w8[i]];
4123 *v = r;
4126 /* a 16 x 16 matrix */
4127 static const UChar invsbox[256] = { // row nr
4128 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4129 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4130 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4131 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4132 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4133 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4134 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4135 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4136 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4137 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4138 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4139 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4140 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4141 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4142 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4143 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4144 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4145 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4146 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4147 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4148 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4149 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4150 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4151 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4152 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4153 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4154 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4155 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4156 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4157 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4158 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4159 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4161 static void InvSubBytes (V128* v)
4163 V128 r;
4164 UInt i;
4165 for (i = 0; i < 16; i++)
4166 r.w8[i] = invsbox[v->w8[i]];
4167 *v = r;
4170 static const UChar ShiftRows_op[16] =
4171 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4172 static void ShiftRows (V128* v)
4174 V128 r;
4175 UInt i;
4176 for (i = 0; i < 16; i++)
4177 r.w8[i] = v->w8[ShiftRows_op[15-i]];
4178 *v = r;
4181 static const UChar InvShiftRows_op[16] =
4182 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4183 static void InvShiftRows (V128* v)
4185 V128 r;
4186 UInt i;
4187 for (i = 0; i < 16; i++)
4188 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4189 *v = r;
4192 /* Multiplication of the finite fields elements of AES.
4193 See "A Specification for The AES Algorithm Rijndael
4194 (by Joan Daemen & Vincent Rijmen)"
4195 Dr. Brian Gladman, v3.1, 3rd March 2001. */
4196 /* N values so that (hex) xy = 0x03^N.
4197 0x00 cannot be used. We put 0xff for this value.*/
4198 /* a 16 x 16 matrix */
4199 static const UChar Nxy[256] = { // row nr
4200 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4201 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4202 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4203 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4204 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4205 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4206 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4207 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4208 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4209 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4210 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4211 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4212 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4213 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4214 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4215 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4216 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4217 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4218 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4219 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4220 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4221 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4222 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4223 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4224 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4225 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4226 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4227 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4228 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4229 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4230 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4231 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4234 /* E values so that E = 0x03^xy. */
4235 static const UChar Exy[256] = { // row nr
4236 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4237 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4238 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4239 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4240 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4241 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4242 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4243 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4244 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4245 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4246 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4247 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4248 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4249 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4250 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4251 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4252 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4253 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4254 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4255 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4256 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4257 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4258 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4259 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4260 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4261 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4262 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4263 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4264 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4265 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4266 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4267 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4269 static inline UChar ff_mul(UChar u1, UChar u2)
4271 if ((u1 > 0) && (u2 > 0)) {
4272 UInt ui = Nxy[u1] + Nxy[u2];
4273 if (ui >= 255)
4274 ui = ui - 255;
4275 return Exy[ui];
4276 } else {
4277 return 0;
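/* Illustrative self-check, not called from generated code: GF(2^8)
   multiplication through the log/antilog tables above.  {02}*{57} =
   {ae} is the worked example from the AES specification; a zero
   operand short-circuits to zero. */
#if 0
static void example_ff_mul_checks ( void )
{
   vassert(ff_mul(0x02, 0x57) == 0xAE);
   vassert(ff_mul(0x57, 0x00) == 0x00);
}
#endif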
4281 static void MixColumns (V128* v)
4283 V128 r;
4284 Int j;
4285 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4286 for (j = 0; j < 4; j++) {
4287 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4288 ^ P(v,j,2) ^ P(v,j,3);
4289 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4290 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4291 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4292 ^ ff_mul(0x03, P(v,j,3) );
4293 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4294 ^ ff_mul( 0x02, P(v,j,3) );
4296 *v = r;
4297 #undef P
4300 static void InvMixColumns (V128* v)
4302 V128 r;
4303 Int j;
4304 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4305 for (j = 0; j < 4; j++) {
4306 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4307 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4308 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4309 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4310 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4311 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4312 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4313 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4315 *v = r;
4316 #undef P
4320 /* For description, see definition in guest_amd64_defs.h */
4321 void amd64g_dirtyhelper_AES (
4322 VexGuestAMD64State* gst,
4323 HWord opc4, HWord gstOffD,
4324 HWord gstOffL, HWord gstOffR
4327 // where the args are
4328 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4329 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4330 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4331 V128 r;
4333 switch (opc4) {
4334 case 0xDC: /* AESENC */
4335 case 0xDD: /* AESENCLAST */
4336 r = *argR;
4337 ShiftRows (&r);
4338 SubBytes (&r);
4339 if (opc4 == 0xDC)
4340 MixColumns (&r);
4341 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4342 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4343 break;
4345 case 0xDE: /* AESDEC */
4346 case 0xDF: /* AESDECLAST */
4347 r = *argR;
4348 InvShiftRows (&r);
4349 InvSubBytes (&r);
4350 if (opc4 == 0xDE)
4351 InvMixColumns (&r);
4352 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4353 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4354 break;
4356 case 0xDB: /* AESIMC */
4357 *argD = *argL;
4358 InvMixColumns (argD);
4359 break;
4360 default: vassert(0);
4364 static inline UInt RotWord (UInt w32)
4366 return ((w32 >> 8) | (w32 << 24));
4369 static inline UInt SubWord (UInt w32)
4371 UChar *w8;
4372 UChar *r8;
4373 UInt res;
4374 w8 = (UChar*) &w32;
4375 r8 = (UChar*) &res;
4376 r8[0] = sbox[w8[0]];
4377 r8[1] = sbox[w8[1]];
4378 r8[2] = sbox[w8[2]];
4379 r8[3] = sbox[w8[3]];
4380 return res;
4383 /* For description, see definition in guest_amd64_defs.h */
4384 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4385 VexGuestAMD64State* gst,
4386 HWord imm8,
4387 HWord gstOffL, HWord gstOffR
4390 // where the args are
4391 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4392 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4394 // We have to create the result in a temporary in the
4395 // case where the src and dst regs are the same. See #341698.
4396 V128 tmp;
4398 tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4399 tmp.w32[2] = SubWord (argL->w32[3]);
4400 tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4401 tmp.w32[0] = SubWord (argL->w32[1]);
4403 argR->w32[3] = tmp.w32[3];
4404 argR->w32[2] = tmp.w32[2];
4405 argR->w32[1] = tmp.w32[1];
4406 argR->w32[0] = tmp.w32[0];
4411 /*---------------------------------------------------------------*/
4412 /*--- Helpers for dealing with, and describing, ---*/
4413 /*--- guest state as a whole. ---*/
4414 /*---------------------------------------------------------------*/
4416 /* Initialise the entire amd64 guest state. */
4417 /* VISIBLE TO LIBVEX CLIENT */
4418 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4420 vex_state->host_EvC_FAILADDR = 0;
4421 vex_state->host_EvC_COUNTER = 0;
4422 vex_state->pad0 = 0;
4424 vex_state->guest_RAX = 0;
4425 vex_state->guest_RCX = 0;
4426 vex_state->guest_RDX = 0;
4427 vex_state->guest_RBX = 0;
4428 vex_state->guest_RSP = 0;
4429 vex_state->guest_RBP = 0;
4430 vex_state->guest_RSI = 0;
4431 vex_state->guest_RDI = 0;
4432 vex_state->guest_R8 = 0;
4433 vex_state->guest_R9 = 0;
4434 vex_state->guest_R10 = 0;
4435 vex_state->guest_R11 = 0;
4436 vex_state->guest_R12 = 0;
4437 vex_state->guest_R13 = 0;
4438 vex_state->guest_R14 = 0;
4439 vex_state->guest_R15 = 0;
4441 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
4442 vex_state->guest_CC_DEP1 = 0;
4443 vex_state->guest_CC_DEP2 = 0;
4444 vex_state->guest_CC_NDEP = 0;
4446 vex_state->guest_DFLAG = 1; /* forwards */
4447 vex_state->guest_IDFLAG = 0;
4448 vex_state->guest_ACFLAG = 0;
4450 /* HACK: represent the offset associated with a constant %fs.
4451 Typically, on linux, this assumes that %fs is only ever zero (main
4452 thread) or 0x63. */
4453 vex_state->guest_FS_CONST = 0;
4455 vex_state->guest_RIP = 0;
4457 /* Initialise the simulated FPU */
4458 amd64g_dirtyhelper_FINIT( vex_state );
4460 /* Initialise the AVX state. */
4461 # define AVXZERO(_ymm) \
4462 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4463 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4464 } while (0)
4465 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4466 AVXZERO(vex_state->guest_YMM0);
4467 AVXZERO(vex_state->guest_YMM1);
4468 AVXZERO(vex_state->guest_YMM2);
4469 AVXZERO(vex_state->guest_YMM3);
4470 AVXZERO(vex_state->guest_YMM4);
4471 AVXZERO(vex_state->guest_YMM5);
4472 AVXZERO(vex_state->guest_YMM6);
4473 AVXZERO(vex_state->guest_YMM7);
4474 AVXZERO(vex_state->guest_YMM8);
4475 AVXZERO(vex_state->guest_YMM9);
4476 AVXZERO(vex_state->guest_YMM10);
4477 AVXZERO(vex_state->guest_YMM11);
4478 AVXZERO(vex_state->guest_YMM12);
4479 AVXZERO(vex_state->guest_YMM13);
4480 AVXZERO(vex_state->guest_YMM14);
4481 AVXZERO(vex_state->guest_YMM15);
4482 AVXZERO(vex_state->guest_YMM16);
4484 # undef AVXZERO
4486 vex_state->guest_EMNOTE = EmNote_NONE;
4488 /* These should not ever be either read or written, but we
4489 initialise them anyway. */
4490 vex_state->guest_CMSTART = 0;
4491 vex_state->guest_CMLEN = 0;
4493 vex_state->guest_NRADDR = 0;
4494 vex_state->guest_SC_CLASS = 0;
4495 vex_state->guest_GS_CONST = 0;
4497 vex_state->guest_IP_AT_SYSCALL = 0;
4498 vex_state->pad1 = 0;
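/* Illustrative sketch only: how a LibVEX client might prepare a fresh
   guest state before dispatching translated code.  The RSP/RIP values
   below are arbitrary placeholders, not values the library itself
   prescribes. */
#if 0
static void example_client_setup ( VexGuestAMD64State* st )
{
   LibVEX_GuestAMD64_initialise(st);
   st->guest_RSP = 0x00007FFF00000000ULL;  /* hypothetical stack top   */
   st->guest_RIP = 0x0000000000400000ULL;  /* hypothetical entry point */
}
#endif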
4502 /* Figure out if any part of the guest state contained in minoff
4503 .. maxoff requires precise memory exceptions. If in doubt return
4504 True (but this generates significantly slower code).
4506 By default we enforce precise exns for guest %RSP, %RBP and %RIP
4507 only. These are the minimum needed to extract correct stack
4508 backtraces from amd64 code.
4510    Only %RSP is needed in mode VexRegUpdSpAtMemAccess. */
4512 Bool guest_amd64_state_requires_precise_mem_exns (
4513 Int minoff, Int maxoff, VexRegisterUpdates pxControl
4516 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4517 Int rbp_max = rbp_min + 8 - 1;
4518 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4519 Int rsp_max = rsp_min + 8 - 1;
4520 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4521 Int rip_max = rip_min + 8 - 1;
4523 if (maxoff < rsp_min || minoff > rsp_max) {
4524 /* no overlap with rsp */
4525 if (pxControl == VexRegUpdSpAtMemAccess)
4526 return False; // We only need to check stack pointer.
4527 } else {
4528 return True;
4531 if (maxoff < rbp_min || minoff > rbp_max) {
4532 /* no overlap with rbp */
4533 } else {
4534 return True;
4537 if (maxoff < rip_min || minoff > rip_max) {
4538 /* no overlap with rip */
4539 } else {
4540 return True;
4543 return False;
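/* Illustrative sketch only: querying the predicate above for a write
   that covers guest_RAX.  RAX is not one of RSP/RBP/RIP, so in the
   default update mode no precise exception handling is required.
   (Assumes the VexRegUpdUnwindregsAtMemAccess setting from libvex.h.) */
#if 0
static void example_precise_exns_query ( void )
{
   Int off = offsetof(VexGuestAMD64State, guest_RAX);
   vassert( guest_amd64_state_requires_precise_mem_exns(
               off, off + 7, VexRegUpdUnwindregsAtMemAccess )
            == False );
}
#endif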
4547 #define ALWAYSDEFD(field) \
4548 { offsetof(VexGuestAMD64State, field), \
4549 (sizeof ((VexGuestAMD64State*)0)->field) }
4551 VexGuestLayout
4552 amd64guest_layout
4554 /* Total size of the guest state, in bytes. */
4555 .total_sizeB = sizeof(VexGuestAMD64State),
4557 /* Describe the stack pointer. */
4558 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4559 .sizeof_SP = 8,
4561 /* Describe the frame pointer. */
4562 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4563 .sizeof_FP = 8,
4565 /* Describe the instruction pointer. */
4566 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4567 .sizeof_IP = 8,
4569 /* Describe any sections to be regarded by Memcheck as
4570 'always-defined'. */
4571 .n_alwaysDefd = 16,
4573 /* flags thunk: OP and NDEP are always defd, whereas DEP1
4574 and DEP2 have to be tracked. See detailed comment in
4575 gdefs.h on meaning of thunk fields. */
4576 .alwaysDefd
4577 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
4578 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
4579 /* 2 */ ALWAYSDEFD(guest_DFLAG),
4580 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
4581 /* 4 */ ALWAYSDEFD(guest_RIP),
4582 /* 5 */ ALWAYSDEFD(guest_FS_CONST),
4583 /* 6 */ ALWAYSDEFD(guest_FTOP),
4584 /* 7 */ ALWAYSDEFD(guest_FPTAG),
4585 /* 8 */ ALWAYSDEFD(guest_FPROUND),
4586 /* 9 */ ALWAYSDEFD(guest_FC3210),
4587 // /* */ ALWAYSDEFD(guest_CS),
4588 // /* */ ALWAYSDEFD(guest_DS),
4589 // /* */ ALWAYSDEFD(guest_ES),
4590 // /* */ ALWAYSDEFD(guest_FS),
4591 // /* */ ALWAYSDEFD(guest_GS),
4592 // /* */ ALWAYSDEFD(guest_SS),
4593 // /* */ ALWAYSDEFD(guest_LDT),
4594 // /* */ ALWAYSDEFD(guest_GDT),
4595 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4596 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4597 /* 12 */ ALWAYSDEFD(guest_CMSTART),
4598 /* 13 */ ALWAYSDEFD(guest_CMLEN),
4599 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4600 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4605 /*---------------------------------------------------------------*/
4606 /*--- end guest_amd64_helpers.c ---*/
4607 /*---------------------------------------------------------------*/