Bug 497723 - forgot to restore callgrind output cleanup
[valgrind.git] / VEX / priv / guest_generic_x87.c
blob85ebebdc77ed557f6027b6cc225272f0e908bd50
2 /*---------------------------------------------------------------*/
3 /*--- begin guest_generic_x87.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 /* This file contains functions for doing some x87-specific
35 operations. Both the amd64 and x86 front ends (guests) indirectly
36 call these functions via guest helper calls. By putting them here,
37 code duplication is avoided. Some of these functions are tricky
38 and hard to verify, so there is much to be said for only having one
39 copy thereof.
42 #include "libvex_basictypes.h"
44 #include "main_util.h"
45 #include "guest_generic_x87.h"
48 /* 80 and 64-bit floating point formats:
50 80-bit:
52 S 0 0-------0 zero
53 S 0 0X------X denormals
54 S 1-7FFE 1X------X normals (all normals have leading 1)
55 S 7FFF 10------0 infinity
56 S 7FFF 10X-----X snan
57 S 7FFF 11X-----X qnan
59 S is the sign bit. For runs X----X, at least one of the Xs must be
60 nonzero. Exponent is 15 bits, fractional part is 63 bits, and
61 there is an explicitly represented leading 1, and a sign bit,
62 giving 80 in total.
64 64-bit avoids the confusion of an explicitly represented leading 1
65 and so is simpler:
67 S 0 0------0 zero
68 S 0 X------X denormals
69 S 1-7FE any normals
70 S 7FF 0------0 infinity
71 S 7FF 0X-----X snan
72 S 7FF 1X-----X qnan
74 Exponent is 11 bits, fractional part is 52 bits, and there is a
75 sign bit, giving 64 in total.
79 static inline UInt read_bit_array ( UChar* arr, UInt n )
81 UChar c = arr[n >> 3];
82 c >>= (n&7);
83 return c & 1;
86 static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
88 UChar c = arr[n >> 3];
89 c = toUChar( c & ~(1 << (n&7)) );
90 c = toUChar( c | ((b&1) << (n&7)) );
91 arr[n >> 3] = c;
94 /* Convert an IEEE754 double (64-bit) into an x87 extended double
95 (80-bit), mimicing the hardware fairly closely. Both numbers are
96 stored little-endian. Limitations, all of which could be fixed,
97 given some level of hassle:
99 * Identity of NaNs is not preserved.
101 See comments in the code for more details.
103 void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
105 Bool mantissaIsZero;
106 Int bexp, i, j, shift;
107 UChar sign;
109 sign = toUChar( (f64[7] >> 7) & 1 );
110 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
111 bexp &= 0x7FF;
113 mantissaIsZero = False;
114 if (bexp == 0 || bexp == 0x7FF) {
115 /* We'll need to know whether or not the mantissa (bits 51:0) is
116 all zeroes in order to handle these cases. So figure it
117 out. */
118 mantissaIsZero
119 = toBool(
120 (f64[6] & 0x0F) == 0
121 && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
122 && f64[2] == 0 && f64[1] == 0 && f64[0] == 0
126 /* If the exponent is zero, either we have a zero or a denormal.
127 Produce a zero. This is a hack in that it forces denormals to
128 zero. Could do better. */
129 if (bexp == 0) {
130 f80[9] = toUChar( sign << 7 );
131 f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
132 = f80[3] = f80[2] = f80[1] = f80[0] = 0;
134 if (mantissaIsZero)
135 /* It really is zero, so that's all we can do. */
136 return;
138 /* There is at least one 1-bit in the mantissa. So it's a
139 potentially denormalised double -- but we can produce a
140 normalised long double. Count the leading zeroes in the
141 mantissa so as to decide how much to bump the exponent down
142 by. Note, this is SLOW. */
143 shift = 0;
144 for (i = 51; i >= 0; i--) {
145 if (read_bit_array(f64, i))
146 break;
147 shift++;
150 /* and copy into place as many bits as we can get our hands on. */
151 j = 63;
152 for (i = 51 - shift; i >= 0; i--) {
153 write_bit_array( f80, j,
154 read_bit_array( f64, i ) );
155 j--;
158 /* Set the exponent appropriately, and we're done. */
159 bexp -= shift;
160 bexp += (16383 - 1023);
161 f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
162 f80[8] = toUChar( bexp & 0xFF );
163 return;
166 /* If the exponent is 7FF, this is either an Infinity, a SNaN or
167 QNaN, as determined by examining bits 51:0, thus:
168 0 ... 0 Inf
169 0X ... X SNaN
170 1X ... X QNaN
171 where at least one of the Xs is not zero.
173 if (bexp == 0x7FF) {
174 if (mantissaIsZero) {
175 /* Produce an appropriately signed infinity:
176 S 1--1 (15) 1 0--0 (63)
178 f80[9] = toUChar( (sign << 7) | 0x7F );
179 f80[8] = 0xFF;
180 f80[7] = 0x80;
181 f80[6] = f80[5] = f80[4] = f80[3]
182 = f80[2] = f80[1] = f80[0] = 0;
183 return;
185 /* So it's either a QNaN or SNaN. Distinguish by considering
186 bit 51. Note, this destroys all the trailing bits
187 (identity?) of the NaN. IEEE754 doesn't require preserving
188 these (it only requires that there be one QNaN value and one
189 SNaN value), but x87 does seem to have some ability to
190 preserve them. Anyway, here, the NaN's identity is
191 destroyed. Could be improved. */
192 if (f64[6] & 8) {
193 /* QNaN. Make a canonical QNaN:
194 S 1--1 (15) 1 1 0--0 (62)
196 f80[9] = toUChar( (sign << 7) | 0x7F );
197 f80[8] = 0xFF;
198 f80[7] = 0xC0;
199 f80[6] = f80[5] = f80[4] = f80[3]
200 = f80[2] = f80[1] = f80[0] = 0x00;
201 } else {
202 /* SNaN. Make a SNaN:
203 S 1--1 (15) 1 0 1--1 (62)
205 f80[9] = toUChar( (sign << 7) | 0x7F );
206 f80[8] = 0xFF;
207 f80[7] = 0xBF;
208 f80[6] = f80[5] = f80[4] = f80[3]
209 = f80[2] = f80[1] = f80[0] = 0xFF;
211 return;
214 /* It's not a zero, denormal, infinity or nan. So it must be a
215 normalised number. Rebias the exponent and build the new
216 number. */
217 bexp += (16383 - 1023);
219 f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
220 f80[8] = toUChar( bexp & 0xFF );
221 f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)
222 | ((f64[5] >> 5) & 7) );
223 f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );
224 f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );
225 f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );
226 f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );
227 f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );
228 f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );
229 f80[0] = toUChar( 0 );
233 /* Convert an x87 extended double (80-bit) into an IEEE 754 double
234 (64-bit), mimicking the hardware fairly closely. Both numbers are
235 stored little-endian. Limitations, both of which could be fixed,
236 given some level of hassle:
238 * Rounding following truncation could be a bit better.
240 * Identity of NaNs is not preserved.
242 See comments in the code for more details.
244 void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
246 Bool isInf;
247 Int bexp, i, j;
248 UChar sign;
250 sign = toUChar((f80[9] >> 7) & 1);
251 bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
252 bexp &= 0x7FFF;
254 /* If the exponent is zero, either we have a zero or a denormal.
255 But an extended precision denormal becomes a double precision
256 zero, so in either case, just produce the appropriately signed
257 zero. */
258 if (bexp == 0) {
259 f64[7] = toUChar(sign << 7);
260 f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
261 return;
264 /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
265 QNaN, as determined by examining bits 62:0, thus:
266 10 ... 0 Inf
267 10X ... X SNaN
268 11X ... X QNaN
269 where at least one of the Xs is not zero.
271 if (bexp == 0x7FFF) {
272 isInf = toBool(
273 (f80[7] & 0x7F) == 0
274 && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
275 && f80[3] == 0 && f80[2] == 0 && f80[1] == 0
276 && f80[0] == 0
278 if (isInf) {
279 if (0 == (f80[7] & 0x80))
280 goto wierd_NaN;
281 /* Produce an appropriately signed infinity:
282 S 1--1 (11) 0--0 (52)
284 f64[7] = toUChar((sign << 7) | 0x7F);
285 f64[6] = 0xF0;
286 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
287 return;
289 /* So it's either a QNaN or SNaN. Distinguish by considering
290 bit 61. Note, this destroys all the trailing bits
291 (identity?) of the NaN. IEEE754 doesn't require preserving
292 these (it only requires that there be one QNaN value and one
293 SNaN value), but x87 does seem to have some ability to
294 preserve them. Anyway, here, the NaN's identity is
295 destroyed. Could be improved. */
296 if (f80[7] & 0x40) {
297 /* QNaN. Make a canonical QNaN:
298 S 1--1 (11) 1 0--0 (51)
300 f64[7] = toUChar((sign << 7) | 0x7F);
301 f64[6] = 0xF8;
302 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0x00;
303 } else {
304 /* SNaN. Make a SNaN:
305 S 1--1 (11) 0 1--1 (51)
307 f64[7] = toUChar((sign << 7) | 0x7F);
308 f64[6] = 0xF7;
309 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
311 return;
314 /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
315 zero, the x87 FPU appears to consider the number denormalised
316 and converts it to a QNaN. */
317 if (0 == (f80[7] & 0x80)) {
318 wierd_NaN:
319 /* Strange hardware QNaN:
320 S 1--1 (11) 1 0--0 (51)
322 /* On a PIII, these QNaNs always appear with sign==1. I have
323 no idea why. */
324 f64[7] = (1 /*sign*/ << 7) | 0x7F;
325 f64[6] = 0xF8;
326 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
327 return;
330 /* It's not a zero, denormal, infinity or nan. So it must be a
331 normalised number. Rebias the exponent and consider. */
332 bexp -= (16383 - 1023);
333 if (bexp >= 0x7FF) {
334 /* It's too big for a double. Construct an infinity. */
335 f64[7] = toUChar((sign << 7) | 0x7F);
336 f64[6] = 0xF0;
337 f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
338 return;
341 if (bexp <= 0) {
342 /* It's too small for a normalised double. First construct a
343 zero and then see if it can be improved into a denormal. */
344 f64[7] = toUChar(sign << 7);
345 f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
347 if (bexp < -52)
348 /* Too small even for a denormal. */
349 return;
351 /* Ok, let's make a denormal. Note, this is SLOW. */
352 /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
353 indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
354 /* bexp is in range -52 .. 0 inclusive */
355 for (i = 63; i >= 0; i--) {
356 j = i - 12 + bexp;
357 if (j < 0) break;
358 /* We shouldn't really call vassert from generated code. */
359 vassert(j >= 0 && j < 52);
360 write_bit_array ( f64,
362 read_bit_array ( f80, i ) );
364 /* and now we might have to round ... */
365 if (read_bit_array(f80, 10+1 - bexp) == 1)
366 goto do_rounding;
368 return;
371 /* Ok, it's a normalised number which is representable as a double.
372 Copy the exponent and mantissa into place. */
374 for (i = 0; i < 52; i++)
375 write_bit_array ( f64,
377 read_bit_array ( f80, i+11 ) );
379 f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );
380 f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );
381 f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );
382 f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );
383 f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );
384 f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );
386 f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );
388 f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );
390 /* Now consider any rounding that needs to happen as a result of
391 truncating the mantissa. */
392 if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
394 /* If the bottom bits of f80 are "100 0000 0000", then the
395 infinitely precise value is deemed to be mid-way between the
396 two closest representable values. Since we're doing
397 round-to-nearest (the default mode), in that case it is the
398 bit immediately above which indicates whether we should round
399 upwards or not -- if 0, we don't. All that is encapsulated
400 in the following simple test. */
401 if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
402 return;
404 do_rounding:
405 /* Round upwards. This is a kludge. Once in every 2^24
406 roundings (statistically) the bottom three bytes are all 0xFF
407 and so we don't round at all. Could be improved. */
408 if (f64[0] != 0xFF) {
409 f64[0]++;
411 else
412 if (f64[0] == 0xFF && f64[1] != 0xFF) {
413 f64[0] = 0;
414 f64[1]++;
416 else
417 if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
418 f64[0] = 0;
419 f64[1] = 0;
420 f64[2]++;
422 /* else we don't round, but we should. */
427 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
428 /* Extract the signed significand or exponent component as per
429 fxtract. Arg and result are doubles travelling under the guise of
430 ULongs. Returns significand when getExp is zero and exponent
431 otherwise. */
432 ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )
434 ULong uSig, uExp;
435 /* Long sSig; */
436 Int sExp, i;
437 UInt sign, expExp;
440 S 7FF 0------0 infinity
441 S 7FF 0X-----X snan
442 S 7FF 1X-----X qnan
444 const ULong posInf = 0x7FF0000000000000ULL;
445 const ULong negInf = 0xFFF0000000000000ULL;
446 const ULong nanMask = 0x7FF0000000000000ULL;
447 const ULong qNan = 0x7FF8000000000000ULL;
448 const ULong posZero = 0x0000000000000000ULL;
449 const ULong negZero = 0x8000000000000000ULL;
450 const ULong bit51 = 1ULL << 51;
451 const ULong bit52 = 1ULL << 52;
452 const ULong sigMask = bit52 - 1;
454 /* Mimic Core i5 behaviour for special cases. */
455 if (arg == posInf)
456 return posInf; /* Both significand and exponent are posInf. */
457 if (arg == negInf)
458 return getExp ? posInf : negInf;
459 if ((arg & nanMask) == nanMask)
460 return qNan | (arg & (1ULL << 63));
461 if (arg == posZero)
462 return getExp ? negInf : posZero;
463 if (arg == negZero)
464 return getExp ? negInf : negZero;
466 /* Split into sign, exponent and significand. */
467 sign = ((UInt)(arg >> 63)) & 1;
469 /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
470 uSig = arg & sigMask;
472 /* Get the exponent. */
473 sExp = ((Int)(arg >> 52)) & 0x7FF;
475 /* Deal with denormals: if the exponent is zero, then the
476 significand cannot possibly be zero (negZero/posZero are handled
477 above). Shift the significand left until bit 51 of it becomes
478 1, and decrease the exponent accordingly.
480 if (sExp == 0) {
481 for (i = 0; i < 52; i++) {
482 if (uSig & bit51)
483 break;
484 uSig <<= 1;
485 sExp--;
487 uSig <<= 1;
488 } else {
489 /* Add the implied leading-1 in the significand. */
490 uSig |= bit52;
493 /* Roll in the sign. */
494 /* sSig = uSig; */
495 /* if (sign) sSig =- sSig; */
497 /* Convert sig into a double. This should be an exact conversion.
498 Then divide by 2^52, which should give a value in the range 1.0
499 to 2.0-epsilon, at least for normalised args. */
500 /* dSig = (Double)sSig; */
501 /* dSig /= 67108864.0; */ /* 2^26 */
502 /* dSig /= 67108864.0; */ /* 2^26 */
503 uSig &= sigMask;
504 uSig |= 0x3FF0000000000000ULL;
505 if (sign)
506 uSig ^= negZero;
508 /* Convert exp into a double. Also an exact conversion. */
509 /* dExp = (Double)(sExp - 1023); */
510 sExp -= 1023;
511 if (sExp == 0) {
512 uExp = 0;
513 } else {
514 uExp = sExp < 0 ? -sExp : sExp;
515 expExp = 0x3FF +52;
516 /* 1 <= uExp <= 1074 */
517 /* Skip first 42 iterations of normalisation loop as we know they
518 will always happen */
519 uExp <<= 42;
520 expExp -= 42;
521 for (i = 0; i < 52-42; i++) {
522 if (uExp & bit52)
523 break;
524 uExp <<= 1;
525 expExp--;
527 uExp &= sigMask;
528 uExp |= ((ULong)expExp) << 52;
529 if (sExp < 0) uExp ^= negZero;
532 return getExp ? uExp : uSig;
537 /*---------------------------------------------------------*/
538 /*--- SSE4.2 PCMP{E,I}STR{I,M} helpers ---*/
539 /*---------------------------------------------------------*/
/* Bit positions and single-bit masks of the O, S, Z, A, C and P
   flags within eflags/rflags.  #including guest_{amd64,x86}_defs.h
   to get them causes chaos, so the required values are simply
   duplicated here, listed from the lowest bit upwards.  They are not
   going to change in the foreseeable future :-)
*/

#define SHIFT_C 0
#define SHIFT_P 2
#define SHIFT_A 4
#define SHIFT_Z 6
#define SHIFT_S 7
#define SHIFT_O 11

#define MASK_C (1 << SHIFT_C)
#define MASK_P (1 << SHIFT_P)
#define MASK_A (1 << SHIFT_A)
#define MASK_Z (1 << SHIFT_Z)
#define MASK_S (1 << SHIFT_S)
#define MASK_O (1 << SHIFT_O)
562 /* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
563 Delight. */
564 static UInt clz32 ( UInt x )
566 Int y, m, n;
567 y = -(x >> 16);
568 m = (y >> 16) & 16;
569 n = 16 - m;
570 x = x >> m;
571 y = x - 0x100;
572 m = (y >> 16) & 8;
573 n = n + m;
574 x = x << m;
575 y = x - 0x1000;
576 m = (y >> 16) & 4;
577 n = n + m;
578 x = x << m;
579 y = x - 0x4000;
580 m = (y >> 16) & 2;
581 n = n + m;
582 x = x << m;
583 y = x >> 14;
584 m = y & ~(y >> 1);
585 return n + 2 - m;
588 static UInt ctz32 ( UInt x )
590 return 32 - clz32((~x) & (x-1));
593 /* Convert a 4-bit value to a 32-bit value by cloning each bit 8
594 times. There's surely a better way to do this, but I don't know
595 what it is. */
596 static UInt bits4_to_bytes4 ( UInt bits4 )
598 UInt r = 0;
599 r |= (bits4 & 1) ? 0x000000FF : 0;
600 r |= (bits4 & 2) ? 0x0000FF00 : 0;
601 r |= (bits4 & 4) ? 0x00FF0000 : 0;
602 r |= (bits4 & 8) ? 0xFF000000 : 0;
603 return r;
607 /* Convert a 2-bit value to a 32-bit value by cloning each bit 16
608 times. There's surely a better way to do this, but I don't know
609 what it is. */
610 static UInt bits2_to_bytes4 ( UInt bits2 )
612 UInt r = 0;
613 r |= (bits2 & 1) ? 0x0000FFFF : 0;
614 r |= (bits2 & 2) ? 0xFFFF0000 : 0;
615 return r;
619 /* Given partial results from a pcmpXstrX operation (intRes1,
620 basically), generate an I- or M-format output value, also the new
621 OSZACP flags. */
622 static
623 void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,
624 /*OUT*/UInt* resOSZACP,
625 UInt intRes1,
626 UInt zmaskL, UInt zmaskR,
627 UInt validL,
628 UInt pol, UInt idx,
629 Bool isxSTRM )
631 vassert((pol >> 2) == 0);
632 vassert((idx >> 1) == 0);
634 UInt intRes2 = 0;
635 switch (pol) {
636 case 0: intRes2 = intRes1; break; // pol +
637 case 1: intRes2 = ~intRes1; break; // pol -
638 case 2: intRes2 = intRes1; break; // pol m+
639 case 3: intRes2 = intRes1 ^ validL; break; // pol m-
641 intRes2 &= 0xFFFF;
643 if (isxSTRM) {
645 // generate M-format output (a bit or byte mask in XMM0)
646 if (idx) {
647 resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF );
648 resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF );
649 resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF );
650 resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );
651 } else {
652 resV->w32[0] = intRes2 & 0xFFFF;
653 resV->w32[1] = 0;
654 resV->w32[2] = 0;
655 resV->w32[3] = 0;
658 } else {
660 // generate I-format output (an index in ECX)
661 // generate ecx value
662 UInt newECX = 0;
663 if (idx) {
664 // index of ms-1-bit
665 newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
666 } else {
667 // index of ls-1-bit
668 newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
671 resV->w32[0] = newECX;
672 resV->w32[1] = 0;
673 resV->w32[2] = 0;
674 resV->w32[3] = 0;
678 // generate new flags, common to all ISTRI and ISTRM cases
679 *resOSZACP // A, P are zero
680 = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
681 | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
682 | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
683 | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
687 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
688 basically), generate an I- or M-format output value, also the new
689 OSZACP flags. */
690 static
691 void compute_PCMPxSTRx_gen_output_wide (/*OUT*/V128* resV,
692 /*OUT*/UInt* resOSZACP,
693 UInt intRes1,
694 UInt zmaskL, UInt zmaskR,
695 UInt validL,
696 UInt pol, UInt idx,
697 Bool isxSTRM )
699 vassert((pol >> 2) == 0);
700 vassert((idx >> 1) == 0);
702 UInt intRes2 = 0;
703 switch (pol) {
704 case 0: intRes2 = intRes1; break; // pol +
705 case 1: intRes2 = ~intRes1; break; // pol -
706 case 2: intRes2 = intRes1; break; // pol m+
707 case 3: intRes2 = intRes1 ^ validL; break; // pol m-
709 intRes2 &= 0xFF;
711 if (isxSTRM) {
713 // generate M-format output (a bit or byte mask in XMM0)
714 if (idx) {
715 resV->w32[0] = bits2_to_bytes4( (intRes2 >> 0) & 0x3 );
716 resV->w32[1] = bits2_to_bytes4( (intRes2 >> 2) & 0x3 );
717 resV->w32[2] = bits2_to_bytes4( (intRes2 >> 4) & 0x3 );
718 resV->w32[3] = bits2_to_bytes4( (intRes2 >> 6) & 0x3 );
719 } else {
720 resV->w32[0] = intRes2 & 0xFF;
721 resV->w32[1] = 0;
722 resV->w32[2] = 0;
723 resV->w32[3] = 0;
726 } else {
728 // generate I-format output (an index in ECX)
729 // generate ecx value
730 UInt newECX = 0;
731 if (idx) {
732 // index of ms-1-bit
733 newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
734 } else {
735 // index of ls-1-bit
736 newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
739 resV->w32[0] = newECX;
740 resV->w32[1] = 0;
741 resV->w32[2] = 0;
742 resV->w32[3] = 0;
746 // generate new flags, common to all ISTRI and ISTRM cases
747 *resOSZACP // A, P are zero
748 = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
749 | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
750 | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
751 | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
755 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
756 variants on 8-bit data.
758 For xSTRI variants, the new ECX value is placed in the 32 bits
759 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
760 variants, the result is a 128 bit value and is placed at *resV in
761 the obvious way.
763 For all variants, the new OSZACP value is placed at *resOSZACP.
765 argLV and argRV are the vector args. The caller must prepare a
766 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
767 must be 1 for each zero byte of of the respective arg. For ESTRx
768 variants this is derived from the explicit length indication, and
769 must be 0 in all places except at the bit index corresponding to
770 the valid length (0 .. 16). If the valid length is 16 then the
771 mask must be all zeroes. In all cases, bits 31:16 must be zero.
773 imm8 is the original immediate from the instruction. isSTRM
774 indicates whether this is a xSTRM or xSTRI variant, which controls
775 how much of *res is written.
777 If the given imm8 case can be handled, the return value is True.
778 If not, False is returned, and neither *res not *resOSZACP are
779 altered.
782 Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
783 /*OUT*/UInt* resOSZACP,
784 V128* argLV, V128* argRV,
785 UInt zmaskL, UInt zmaskR,
786 UInt imm8, Bool isxSTRM )
788 vassert(imm8 < 0x80);
789 vassert((zmaskL >> 16) == 0);
790 vassert((zmaskR >> 16) == 0);
792 /* Explicitly reject any imm8 values that haven't been validated,
793 even if they would probably work. Life is too short to have
794 unvalidated cases in the code base. */
795 switch (imm8) {
796 case 0x00: case 0x02:
797 case 0x08: case 0x0A: case 0x0C: case 0x0E:
798 case 0x10: case 0x12: case 0x14:
799 case 0x18: case 0x1A:
800 case 0x30: case 0x34:
801 case 0x38: case 0x3A:
802 case 0x40: case 0x42: case 0x44: case 0x46:
803 case 0x4A:
804 case 0x62:
805 case 0x70: case 0x72:
806 break;
807 default:
808 return False;
811 UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format
812 UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn
813 UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity
814 UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask
816 /*----------------------------------------*/
817 /*-- strcmp on byte data --*/
818 /*----------------------------------------*/
820 if (agg == 2/*equal each, aka strcmp*/
821 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
822 Int i;
823 UChar* argL = (UChar*)argLV;
824 UChar* argR = (UChar*)argRV;
825 UInt boolResII = 0;
826 for (i = 15; i >= 0; i--) {
827 UChar cL = argL[i];
828 UChar cR = argR[i];
829 boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
831 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
832 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
834 // do invalidation, common to all equal-each cases
835 UInt intRes1
836 = (boolResII & validL & validR) // if both valid, use cmpres
837 | (~ (validL | validR)); // if both invalid, force 1
838 // else force 0
839 intRes1 &= 0xFFFF;
841 // generate I-format output
842 compute_PCMPxSTRx_gen_output(
843 resV, resOSZACP,
844 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
847 return True;
850 /*----------------------------------------*/
851 /*-- set membership on byte data --*/
852 /*----------------------------------------*/
854 if (agg == 0/*equal any, aka find chars in a set*/
855 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
856 /* argL: the string, argR: charset */
857 UInt si, ci;
858 UChar* argL = (UChar*)argLV;
859 UChar* argR = (UChar*)argRV;
860 UInt boolRes = 0;
861 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
862 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
864 for (si = 0; si < 16; si++) {
865 if ((validL & (1 << si)) == 0)
866 // run off the end of the string.
867 break;
868 UInt m = 0;
869 for (ci = 0; ci < 16; ci++) {
870 if ((validR & (1 << ci)) == 0) break;
871 if (argR[ci] == argL[si]) { m = 1; break; }
873 boolRes |= (m << si);
876 // boolRes is "pre-invalidated"
877 UInt intRes1 = boolRes & 0xFFFF;
879 // generate I-format output
880 compute_PCMPxSTRx_gen_output(
881 resV, resOSZACP,
882 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
885 return True;
888 /*----------------------------------------*/
889 /*-- substring search on byte data --*/
890 /*----------------------------------------*/
892 if (agg == 3/*equal ordered, aka substring search*/
893 && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
895 /* argL: haystack, argR: needle */
896 UInt ni, hi;
897 UChar* argL = (UChar*)argLV;
898 UChar* argR = (UChar*)argRV;
899 UInt boolRes = 0;
900 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
901 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
902 for (hi = 0; hi < 16; hi++) {
903 UInt m = 1;
904 for (ni = 0; ni < 16; ni++) {
905 if ((validR & (1 << ni)) == 0) break;
906 UInt i = ni + hi;
907 if (i >= 16) break;
908 if (argL[i] != argR[ni]) { m = 0; break; }
910 boolRes |= (m << hi);
911 if ((validL & (1 << hi)) == 0)
912 // run off the end of the haystack
913 break;
916 // boolRes is "pre-invalidated"
917 UInt intRes1 = boolRes & 0xFFFF;
919 // generate I-format output
920 compute_PCMPxSTRx_gen_output(
921 resV, resOSZACP,
922 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
925 return True;
928 /*----------------------------------------*/
929 /*-- ranges, unsigned byte data --*/
930 /*----------------------------------------*/
932 if (agg == 1/*ranges*/
933 && fmt == 0/*ub*/) {
935 /* argL: string, argR: range-pairs */
936 UInt ri, si;
937 UChar* argL = (UChar*)argLV;
938 UChar* argR = (UChar*)argRV;
939 UInt boolRes = 0;
940 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
941 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
942 for (si = 0; si < 16; si++) {
943 if ((validL & (1 << si)) == 0)
944 // run off the end of the string
945 break;
946 UInt m = 0;
947 for (ri = 0; ri < 16; ri += 2) {
948 if ((validR & (3 << ri)) != (3 << ri)) break;
949 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
950 m = 1; break;
953 boolRes |= (m << si);
956 // boolRes is "pre-invalidated"
957 UInt intRes1 = boolRes & 0xFFFF;
959 // generate I-format output
960 compute_PCMPxSTRx_gen_output(
961 resV, resOSZACP,
962 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
965 return True;
968 /*----------------------------------------*/
969 /*-- ranges, signed byte data --*/
970 /*----------------------------------------*/
972 if (agg == 1/*ranges*/
973 && fmt == 2/*sb*/) {
975 /* argL: string, argR: range-pairs */
976 UInt ri, si;
977 Char* argL = (Char*)argLV;
978 Char* argR = (Char*)argRV;
979 UInt boolRes = 0;
980 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
981 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
982 for (si = 0; si < 16; si++) {
983 if ((validL & (1 << si)) == 0)
984 // run off the end of the string
985 break;
986 UInt m = 0;
987 for (ri = 0; ri < 16; ri += 2) {
988 if ((validR & (3 << ri)) != (3 << ri)) break;
989 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
990 m = 1; break;
993 boolRes |= (m << si);
996 // boolRes is "pre-invalidated"
997 UInt intRes1 = boolRes & 0xFFFF;
999 // generate I-format output
1000 compute_PCMPxSTRx_gen_output(
1001 resV, resOSZACP,
1002 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1005 return True;
1008 return False;
1012 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
1013 variants on 16-bit characters.
1015 For xSTRI variants, the new ECX value is placed in the 32 bits
1016 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
1017 variants, the result is a 128 bit value and is placed at *resV in
1018 the obvious way.
1020 For all variants, the new OSZACP value is placed at *resOSZACP.
1022 argLV and argRV are the vector args. The caller must prepare an
1023 8-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
1024 must be 1 for each zero 16-bit character of the respective arg. For ESTRx
1025 variants this is derived from the explicit length indication, and
1026 must be 0 in all places except at the bit index corresponding to
1027 the valid length (0 .. 8). If the valid length is 8 then the
1028 mask must be all zeroes. In all cases, bits 31:8 must be zero.
1030 imm8 is the original immediate from the instruction. isxSTRM
1031 indicates whether this is a xSTRM or xSTRI variant, which controls
1032 how much of *resV is written.
1034 If the given imm8 case can be handled, the return value is True.
1035 If not, False is returned, and neither *resV nor *resOSZACP is
1036 altered.
1039 Bool compute_PCMPxSTRx_wide ( /*OUT*/V128* resV,
1040 /*OUT*/UInt* resOSZACP,
1041 V128* argLV, V128* argRV,
1042 UInt zmaskL, UInt zmaskR,
1043 UInt imm8, Bool isxSTRM )
1045 vassert(imm8 < 0x80);
1046 vassert((zmaskL >> 8) == 0);
1047 vassert((zmaskR >> 8) == 0);
1049 /* Explicitly reject any imm8 values that haven't been validated,
1050 even if they would probably work. Life is too short to have
1051 unvalidated cases in the code base. */
1052 switch (imm8) {
1053 // 1,9 3,B 5,D 7,F
1054 case 0x01: case 0x03:
1055 case 0x09: case 0x0B: case 0x0D:
1056 case 0x13:
1057 case 0x19: case 0x1B:
1058 case 0x39: case 0x3B:
1059 case 0x41: case 0x45:
1060 case 0x4B:
1061 break;
1062 default:
1063 return False;
1066 UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format
1067 UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn
1068 UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity
1069 UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask
1071 /*----------------------------------------*/
1072 /*-- strcmp on wide data --*/
1073 /*----------------------------------------*/
1075 if (agg == 2/*equal each, aka strcmp*/
1076 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1077 Int i;
1078 UShort* argL = (UShort*)argLV;
1079 UShort* argR = (UShort*)argRV;
1080 UInt boolResII = 0;
1081 for (i = 7; i >= 0; i--) {
1082 UShort cL = argL[i];
1083 UShort cR = argR[i];
1084 boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
1086 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
1087 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
1089 // do invalidation, common to all equal-each cases
1090 UInt intRes1
1091 = (boolResII & validL & validR) // if both valid, use cmpres
1092 | (~ (validL | validR)); // if both invalid, force 1
1093 // else force 0
1094 intRes1 &= 0xFF;
1096 // generate I-format output
1097 compute_PCMPxSTRx_gen_output_wide(
1098 resV, resOSZACP,
1099 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1102 return True;
1105 /*----------------------------------------*/
1106 /*-- set membership on wide data --*/
1107 /*----------------------------------------*/
1109 if (agg == 0/*equal any, aka find chars in a set*/
1110 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1111 /* argL: the string, argR: charset */
1112 UInt si, ci;
1113 UShort* argL = (UShort*)argLV;
1114 UShort* argR = (UShort*)argRV;
1115 UInt boolRes = 0;
1116 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
1117 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
1119 for (si = 0; si < 8; si++) {
1120 if ((validL & (1 << si)) == 0)
1121 // run off the end of the string.
1122 break;
1123 UInt m = 0;
1124 for (ci = 0; ci < 8; ci++) {
1125 if ((validR & (1 << ci)) == 0) break;
1126 if (argR[ci] == argL[si]) { m = 1; break; }
1128 boolRes |= (m << si);
1131 // boolRes is "pre-invalidated"
1132 UInt intRes1 = boolRes & 0xFF;
1134 // generate I-format output
1135 compute_PCMPxSTRx_gen_output_wide(
1136 resV, resOSZACP,
1137 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1140 return True;
1143 /*----------------------------------------*/
1144 /*-- substring search on wide data --*/
1145 /*----------------------------------------*/
1147 if (agg == 3/*equal ordered, aka substring search*/
1148 && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1150 /* argL: haystack, argR: needle */
1151 UInt ni, hi;
1152 UShort* argL = (UShort*)argLV;
1153 UShort* argR = (UShort*)argRV;
1154 UInt boolRes = 0;
1155 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
1156 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
1157 for (hi = 0; hi < 8; hi++) {
1158 UInt m = 1;
1159 for (ni = 0; ni < 8; ni++) {
1160 if ((validR & (1 << ni)) == 0) break;
1161 UInt i = ni + hi;
1162 if (i >= 8) break;
1163 if (argL[i] != argR[ni]) { m = 0; break; }
1165 boolRes |= (m << hi);
1166 if ((validL & (1 << hi)) == 0)
1167 // run off the end of the haystack
1168 break;
1171 // boolRes is "pre-invalidated"
1172 UInt intRes1 = boolRes & 0xFF;
1174 // generate I-format output
1175 compute_PCMPxSTRx_gen_output_wide(
1176 resV, resOSZACP,
1177 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1180 return True;
1183 /*----------------------------------------*/
1184 /*-- ranges, unsigned wide data --*/
1185 /*----------------------------------------*/
1187 if (agg == 1/*ranges*/
1188 && fmt == 1/*uw*/) {
1190 /* argL: string, argR: range-pairs */
1191 UInt ri, si;
1192 UShort* argL = (UShort*)argLV;
1193 UShort* argR = (UShort*)argRV;
1194 UInt boolRes = 0;
1195 UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
1196 UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
1197 for (si = 0; si < 8; si++) {
1198 if ((validL & (1 << si)) == 0)
1199 // run off the end of the string
1200 break;
1201 UInt m = 0;
1202 for (ri = 0; ri < 8; ri += 2) {
1203 if ((validR & (3 << ri)) != (3 << ri)) break;
1204 if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
1205 m = 1; break;
1208 boolRes |= (m << si);
1211 // boolRes is "pre-invalidated"
1212 UInt intRes1 = boolRes & 0xFF;
1214 // generate I-format output
1215 compute_PCMPxSTRx_gen_output_wide(
1216 resV, resOSZACP,
1217 intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1220 return True;
1223 return False;
1227 /*---------------------------------------------------------------*/
1228 /*--- end guest_generic_x87.c ---*/
1229 /*---------------------------------------------------------------*/