2 /*---------------------------------------------------------------*/
3 /*--- begin guest_generic_x87.c ---*/
4 /*---------------------------------------------------------------*/
7 This file is part of Valgrind, a dynamic binary instrumentation
10 Copyright (C) 2004-2017 OpenWorks LLP
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 /* This file contains functions for doing some x87-specific
35 operations. Both the amd64 and x86 front ends (guests) indirectly
36 call these functions via guest helper calls. By putting them here,
37 code duplication is avoided. Some of these functions are tricky
38 and hard to verify, so there is much to be said for only having one
42 #include "libvex_basictypes.h"
44 #include "main_util.h"
45 #include "guest_generic_x87.h"
48 /* 80 and 64-bit floating point formats:
53 S 0 0X------X denormals
54 S 1-7FFE 1X------X normals (all normals have leading 1)
55 S 7FFF 10------0 infinity
59 S is the sign bit. For runs X----X, at least one of the Xs must be
60 nonzero. Exponent is 15 bits, fractional part is 63 bits, and
61 there is an explicitly represented leading 1, and a sign bit,
64 64-bit avoids the confusion of an explicitly represented leading 1
68 S 0 X------X denormals
70 S 7FF 0------0 infinity
74 Exponent is 11 bits, fractional part is 52 bits, and there is a
75 sign bit, giving 64 in total.
79 static inline UInt
read_bit_array ( UChar
* arr
, UInt n
)
81 UChar c
= arr
[n
>> 3];
86 static inline void write_bit_array ( UChar
* arr
, UInt n
, UInt b
)
88 UChar c
= arr
[n
>> 3];
89 c
= toUChar( c
& ~(1 << (n
&7)) );
90 c
= toUChar( c
| ((b
&1) << (n
&7)) );
94 /* Convert an IEEE754 double (64-bit) into an x87 extended double
95 (80-bit), mimicking the hardware fairly closely. Both numbers are
96 stored little-endian. Limitations, all of which could be fixed,
97 given some level of hassle:
99 * Identity of NaNs is not preserved.
101 See comments in the code for more details.
103 void convert_f64le_to_f80le ( /*IN*/UChar
* f64
, /*OUT*/UChar
* f80
)
106 Int bexp
, i
, j
, shift
;
109 sign
= toUChar( (f64
[7] >> 7) & 1 );
110 bexp
= (f64
[7] << 4) | ((f64
[6] >> 4) & 0x0F);
113 mantissaIsZero
= False
;
114 if (bexp
== 0 || bexp
== 0x7FF) {
115 /* We'll need to know whether or not the mantissa (bits 51:0) is
116 all zeroes in order to handle these cases. So figure it
121 && f64
[5] == 0 && f64
[4] == 0 && f64
[3] == 0
122 && f64
[2] == 0 && f64
[1] == 0 && f64
[0] == 0
126 /* If the exponent is zero, either we have a zero or a denormal.
127 Produce a zero. This is a hack in that it forces denormals to
128 zero. Could do better. */
130 f80
[9] = toUChar( sign
<< 7 );
131 f80
[8] = f80
[7] = f80
[6] = f80
[5] = f80
[4]
132 = f80
[3] = f80
[2] = f80
[1] = f80
[0] = 0;
135 /* It really is zero, so that's all we can do. */
138 /* There is at least one 1-bit in the mantissa. So it's a
139 potentially denormalised double -- but we can produce a
140 normalised long double. Count the leading zeroes in the
141 mantissa so as to decide how much to bump the exponent down
142 by. Note, this is SLOW. */
144 for (i
= 51; i
>= 0; i
--) {
145 if (read_bit_array(f64
, i
))
150 /* and copy into place as many bits as we can get our hands on. */
152 for (i
= 51 - shift
; i
>= 0; i
--) {
153 write_bit_array( f80
, j
,
154 read_bit_array( f64
, i
) );
158 /* Set the exponent appropriately, and we're done. */
160 bexp
+= (16383 - 1023);
161 f80
[9] = toUChar( (sign
<< 7) | ((bexp
>> 8) & 0xFF) );
162 f80
[8] = toUChar( bexp
& 0xFF );
166 /* If the exponent is 7FF, this is either an Infinity, a SNaN or
167 QNaN, as determined by examining bits 51:0, thus:
171 where at least one of the Xs is not zero.
174 if (mantissaIsZero
) {
175 /* Produce an appropriately signed infinity:
176 S 1--1 (15) 1 0--0 (63)
178 f80
[9] = toUChar( (sign
<< 7) | 0x7F );
181 f80
[6] = f80
[5] = f80
[4] = f80
[3]
182 = f80
[2] = f80
[1] = f80
[0] = 0;
185 /* So it's either a QNaN or SNaN. Distinguish by considering
186 bit 51. Note, this destroys all the trailing bits
187 (identity?) of the NaN. IEEE754 doesn't require preserving
188 these (it only requires that there be one QNaN value and one
189 SNaN value), but x87 does seem to have some ability to
190 preserve them. Anyway, here, the NaN's identity is
191 destroyed. Could be improved. */
193 /* QNaN. Make a canonical QNaN:
194 S 1--1 (15) 1 1 0--0 (62)
196 f80
[9] = toUChar( (sign
<< 7) | 0x7F );
199 f80
[6] = f80
[5] = f80
[4] = f80
[3]
200 = f80
[2] = f80
[1] = f80
[0] = 0x00;
202 /* SNaN. Make a SNaN:
203 S 1--1 (15) 1 0 1--1 (62)
205 f80
[9] = toUChar( (sign
<< 7) | 0x7F );
208 f80
[6] = f80
[5] = f80
[4] = f80
[3]
209 = f80
[2] = f80
[1] = f80
[0] = 0xFF;
214 /* It's not a zero, denormal, infinity or nan. So it must be a
215 normalised number. Rebias the exponent and build the new
217 bexp
+= (16383 - 1023);
219 f80
[9] = toUChar( (sign
<< 7) | ((bexp
>> 8) & 0xFF) );
220 f80
[8] = toUChar( bexp
& 0xFF );
221 f80
[7] = toUChar( (1 << 7) | ((f64
[6] << 3) & 0x78)
222 | ((f64
[5] >> 5) & 7) );
223 f80
[6] = toUChar( ((f64
[5] << 3) & 0xF8) | ((f64
[4] >> 5) & 7) );
224 f80
[5] = toUChar( ((f64
[4] << 3) & 0xF8) | ((f64
[3] >> 5) & 7) );
225 f80
[4] = toUChar( ((f64
[3] << 3) & 0xF8) | ((f64
[2] >> 5) & 7) );
226 f80
[3] = toUChar( ((f64
[2] << 3) & 0xF8) | ((f64
[1] >> 5) & 7) );
227 f80
[2] = toUChar( ((f64
[1] << 3) & 0xF8) | ((f64
[0] >> 5) & 7) );
228 f80
[1] = toUChar( ((f64
[0] << 3) & 0xF8) );
229 f80
[0] = toUChar( 0 );
233 /* Convert an x87 extended double (80-bit) into an IEEE 754 double
234 (64-bit), mimicking the hardware fairly closely. Both numbers are
235 stored little-endian. Limitations, both of which could be fixed,
236 given some level of hassle:
238 * Rounding following truncation could be a bit better.
240 * Identity of NaNs is not preserved.
242 See comments in the code for more details.
244 void convert_f80le_to_f64le ( /*IN*/UChar
* f80
, /*OUT*/UChar
* f64
)
250 sign
= toUChar((f80
[9] >> 7) & 1);
251 bexp
= (((UInt
)f80
[9]) << 8) | (UInt
)f80
[8];
254 /* If the exponent is zero, either we have a zero or a denormal.
255 But an extended precision denormal becomes a double precision
256 zero, so in either case, just produce the appropriately signed
259 f64
[7] = toUChar(sign
<< 7);
260 f64
[6] = f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0;
264 /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
265 QNaN, as determined by examining bits 62:0, thus:
269 where at least one of the Xs is not zero.
271 if (bexp
== 0x7FFF) {
274 && f80
[6] == 0 && f80
[5] == 0 && f80
[4] == 0
275 && f80
[3] == 0 && f80
[2] == 0 && f80
[1] == 0
279 if (0 == (f80
[7] & 0x80))
281 /* Produce an appropriately signed infinity:
282 S 1--1 (11) 0--0 (52)
284 f64
[7] = toUChar((sign
<< 7) | 0x7F);
286 f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0;
289 /* So it's either a QNaN or SNaN. Distinguish by considering
290 bit 61. Note, this destroys all the trailing bits
291 (identity?) of the NaN. IEEE754 doesn't require preserving
292 these (it only requires that there be one QNaN value and one
293 SNaN value), but x87 does seem to have some ability to
294 preserve them. Anyway, here, the NaN's identity is
295 destroyed. Could be improved. */
297 /* QNaN. Make a canonical QNaN:
298 S 1--1 (11) 1 0--0 (51)
300 f64
[7] = toUChar((sign
<< 7) | 0x7F);
302 f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0x00;
304 /* SNaN. Make a SNaN:
305 S 1--1 (11) 0 1--1 (51)
307 f64
[7] = toUChar((sign
<< 7) | 0x7F);
309 f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0xFF;
314 /* If it's not a Zero, NaN or Inf, and the integer part (bit 63) is
315 zero, the x87 FPU appears to consider the number denormalised
316 and converts it to a QNaN. */
317 if (0 == (f80
[7] & 0x80)) {
319 /* Strange hardware QNaN:
320 S 1--1 (11) 1 0--0 (51)
322 /* On a PIII, these QNaNs always appear with sign==1. I have
324 f64
[7] = (1 /*sign*/ << 7) | 0x7F;
326 f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0;
330 /* It's not a zero, denormal, infinity or nan. So it must be a
331 normalised number. Rebias the exponent and consider. */
332 bexp
-= (16383 - 1023);
334 /* It's too big for a double. Construct an infinity. */
335 f64
[7] = toUChar((sign
<< 7) | 0x7F);
337 f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0;
342 /* It's too small for a normalised double. First construct a
343 zero and then see if it can be improved into a denormal. */
344 f64
[7] = toUChar(sign
<< 7);
345 f64
[6] = f64
[5] = f64
[4] = f64
[3] = f64
[2] = f64
[1] = f64
[0] = 0;
348 /* Too small even for a denormal. */
351 /* Ok, let's make a denormal. Note, this is SLOW. */
352 /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
353 indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
354 /* bexp is in range -52 .. 0 inclusive */
355 for (i
= 63; i
>= 0; i
--) {
358 /* We shouldn't really call vassert from generated code. */
359 vassert(j
>= 0 && j
< 52);
360 write_bit_array ( f64
,
362 read_bit_array ( f80
, i
) );
364 /* and now we might have to round ... */
365 if (read_bit_array(f80
, 10+1 - bexp
) == 1)
371 /* Ok, it's a normalised number which is representable as a double.
372 Copy the exponent and mantissa into place. */
374 for (i = 0; i < 52; i++)
375 write_bit_array ( f64,
377 read_bit_array ( f80, i+11 ) );
379 f64
[0] = toUChar( (f80
[1] >> 3) | (f80
[2] << 5) );
380 f64
[1] = toUChar( (f80
[2] >> 3) | (f80
[3] << 5) );
381 f64
[2] = toUChar( (f80
[3] >> 3) | (f80
[4] << 5) );
382 f64
[3] = toUChar( (f80
[4] >> 3) | (f80
[5] << 5) );
383 f64
[4] = toUChar( (f80
[5] >> 3) | (f80
[6] << 5) );
384 f64
[5] = toUChar( (f80
[6] >> 3) | (f80
[7] << 5) );
386 f64
[6] = toUChar( ((bexp
<< 4) & 0xF0) | ((f80
[7] >> 3) & 0x0F) );
388 f64
[7] = toUChar( (sign
<< 7) | ((bexp
>> 4) & 0x7F) );
390 /* Now consider any rounding that needs to happen as a result of
391 truncating the mantissa. */
392 if (f80
[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
394 /* If the bottom bits of f80 are "100 0000 0000", then the
395 infinitely precise value is deemed to be mid-way between the
396 two closest representable values. Since we're doing
397 round-to-nearest (the default mode), in that case it is the
398 bit immediately above which indicates whether we should round
399 upwards or not -- if 0, we don't. All that is encapsulated
400 in the following simple test. */
401 if ((f80
[1] & 0xF) == 4/*0100b*/ && f80
[0] == 0)
405 /* Round upwards. This is a kludge. Once in every 2^24
406 roundings (statistically) the bottom three bytes are all 0xFF
407 and so we don't round at all. Could be improved. */
408 if (f64
[0] != 0xFF) {
412 if (f64
[0] == 0xFF && f64
[1] != 0xFF) {
417 if (f64
[0] == 0xFF && f64
[1] == 0xFF && f64
[2] != 0xFF) {
422 /* else we don't round, but we should. */
427 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
428 /* Extract the signed significand or exponent component as per
429 fxtract. Arg and result are doubles travelling under the guise of
430 ULongs. Returns significand when getExp is zero and exponent
432 ULong
x86amd64g_calculate_FXTRACT ( ULong arg
, HWord getExp
)
440 S 7FF 0------0 infinity
444 const ULong posInf
= 0x7FF0000000000000ULL
;
445 const ULong negInf
= 0xFFF0000000000000ULL
;
446 const ULong nanMask
= 0x7FF0000000000000ULL
;
447 const ULong qNan
= 0x7FF8000000000000ULL
;
448 const ULong posZero
= 0x0000000000000000ULL
;
449 const ULong negZero
= 0x8000000000000000ULL
;
450 const ULong bit51
= 1ULL << 51;
451 const ULong bit52
= 1ULL << 52;
452 const ULong sigMask
= bit52
- 1;
454 /* Mimic Core i5 behaviour for special cases. */
456 return posInf
; /* Both significand and exponent are posInf. */
458 return getExp
? posInf
: negInf
;
459 if ((arg
& nanMask
) == nanMask
)
460 return qNan
| (arg
& (1ULL << 63));
462 return getExp
? negInf
: posZero
;
464 return getExp
? negInf
: negZero
;
466 /* Split into sign, exponent and significand. */
467 sign
= ((UInt
)(arg
>> 63)) & 1;
469 /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
470 uSig
= arg
& sigMask
;
472 /* Get the exponent. */
473 sExp
= ((Int
)(arg
>> 52)) & 0x7FF;
475 /* Deal with denormals: if the exponent is zero, then the
476 significand cannot possibly be zero (negZero/posZero are handled
477 above). Shift the significand left until bit 51 of it becomes
478 1, and decrease the exponent accordingly.
481 for (i
= 0; i
< 52; i
++) {
489 /* Add the implied leading-1 in the significand. */
493 /* Roll in the sign. */
495 /* if (sign) sSig =- sSig; */
497 /* Convert sig into a double. This should be an exact conversion.
498 Then divide by 2^52, which should give a value in the range 1.0
499 to 2.0-epsilon, at least for normalised args. */
500 /* dSig = (Double)sSig; */
501 /* dSig /= 67108864.0; */ /* 2^26 */
502 /* dSig /= 67108864.0; */ /* 2^26 */
504 uSig
|= 0x3FF0000000000000ULL
;
508 /* Convert exp into a double. Also an exact conversion. */
509 /* dExp = (Double)(sExp - 1023); */
514 uExp
= sExp
< 0 ? -sExp
: sExp
;
516 /* 1 <= uExp <= 1074 */
517 /* Skip first 42 iterations of normalisation loop as we know they
518 will always happen */
521 for (i
= 0; i
< 52-42; i
++) {
528 uExp
|= ((ULong
)expExp
) << 52;
529 if (sExp
< 0) uExp
^= negZero
;
532 return getExp
? uExp
: uSig
;
537 /*---------------------------------------------------------*/
538 /*--- SSE4.2 PCMP{E,I}STR{I,M} helpers ---*/
539 /*---------------------------------------------------------*/
541 /* We need the definitions for OSZACP eflags/rflags offsets.
542 #including guest_{amd64,x86}_defs.h causes chaos, so just copy the
543 required values directly. They are not going to change in the
544 foreseeable future :-)
554 #define MASK_O (1 << SHIFT_O)
555 #define MASK_S (1 << SHIFT_S)
556 #define MASK_Z (1 << SHIFT_Z)
557 #define MASK_A (1 << SHIFT_A)
558 #define MASK_C (1 << SHIFT_C)
559 #define MASK_P (1 << SHIFT_P)
562 /* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
564 static UInt
clz32 ( UInt x
)
588 static UInt
ctz32 ( UInt x
)
590 return 32 - clz32((~x
) & (x
-1));
593 /* Convert a 4-bit value to a 32-bit value by cloning each bit 8
594 times. There's surely a better way to do this, but I don't know
596 static UInt
bits4_to_bytes4 ( UInt bits4
)
599 r
|= (bits4
& 1) ? 0x000000FF : 0;
600 r
|= (bits4
& 2) ? 0x0000FF00 : 0;
601 r
|= (bits4
& 4) ? 0x00FF0000 : 0;
602 r
|= (bits4
& 8) ? 0xFF000000 : 0;
607 /* Convert a 2-bit value to a 32-bit value by cloning each bit 16
608 times. There's surely a better way to do this, but I don't know
610 static UInt
bits2_to_bytes4 ( UInt bits2
)
613 r
|= (bits2
& 1) ? 0x0000FFFF : 0;
614 r
|= (bits2
& 2) ? 0xFFFF0000 : 0;
619 /* Given partial results from a pcmpXstrX operation (intRes1,
620 basically), generate an I- or M-format output value, also the new
623 void compute_PCMPxSTRx_gen_output (/*OUT*/V128
* resV
,
624 /*OUT*/UInt
* resOSZACP
,
626 UInt zmaskL
, UInt zmaskR
,
631 vassert((pol
>> 2) == 0);
632 vassert((idx
>> 1) == 0);
636 case 0: intRes2
= intRes1
; break; // pol +
637 case 1: intRes2
= ~intRes1
; break; // pol -
638 case 2: intRes2
= intRes1
; break; // pol m+
639 case 3: intRes2
= intRes1
^ validL
; break; // pol m-
645 // generate M-format output (a bit or byte mask in XMM0)
647 resV
->w32
[0] = bits4_to_bytes4( (intRes2
>> 0) & 0xF );
648 resV
->w32
[1] = bits4_to_bytes4( (intRes2
>> 4) & 0xF );
649 resV
->w32
[2] = bits4_to_bytes4( (intRes2
>> 8) & 0xF );
650 resV
->w32
[3] = bits4_to_bytes4( (intRes2
>> 12) & 0xF );
652 resV
->w32
[0] = intRes2
& 0xFFFF;
660 // generate I-format output (an index in ECX)
661 // generate ecx value
665 newECX
= intRes2
== 0 ? 16 : (31 - clz32(intRes2
));
668 newECX
= intRes2
== 0 ? 16 : ctz32(intRes2
);
671 resV
->w32
[0] = newECX
;
678 // generate new flags, common to all ISTRI and ISTRM cases
679 *resOSZACP
// A, P are zero
680 = ((intRes2
== 0) ? 0 : MASK_C
) // C == 0 iff intRes2 == 0
681 | ((zmaskL
== 0) ? 0 : MASK_Z
) // Z == 1 iff any in argL is 0
682 | ((zmaskR
== 0) ? 0 : MASK_S
) // S == 1 iff any in argR is 0
683 | ((intRes2
& 1) << SHIFT_O
); // O == IntRes2[0]
687 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
688 basically), generate an I- or M-format output value, also the new
691 void compute_PCMPxSTRx_gen_output_wide (/*OUT*/V128
* resV
,
692 /*OUT*/UInt
* resOSZACP
,
694 UInt zmaskL
, UInt zmaskR
,
699 vassert((pol
>> 2) == 0);
700 vassert((idx
>> 1) == 0);
704 case 0: intRes2
= intRes1
; break; // pol +
705 case 1: intRes2
= ~intRes1
; break; // pol -
706 case 2: intRes2
= intRes1
; break; // pol m+
707 case 3: intRes2
= intRes1
^ validL
; break; // pol m-
713 // generate M-format output (a bit or byte mask in XMM0)
715 resV
->w32
[0] = bits2_to_bytes4( (intRes2
>> 0) & 0x3 );
716 resV
->w32
[1] = bits2_to_bytes4( (intRes2
>> 2) & 0x3 );
717 resV
->w32
[2] = bits2_to_bytes4( (intRes2
>> 4) & 0x3 );
718 resV
->w32
[3] = bits2_to_bytes4( (intRes2
>> 6) & 0x3 );
720 resV
->w32
[0] = intRes2
& 0xFF;
728 // generate I-format output (an index in ECX)
729 // generate ecx value
733 newECX
= intRes2
== 0 ? 8 : (31 - clz32(intRes2
));
736 newECX
= intRes2
== 0 ? 8 : ctz32(intRes2
);
739 resV
->w32
[0] = newECX
;
746 // generate new flags, common to all ISTRI and ISTRM cases
747 *resOSZACP
// A, P are zero
748 = ((intRes2
== 0) ? 0 : MASK_C
) // C == 0 iff intRes2 == 0
749 | ((zmaskL
== 0) ? 0 : MASK_Z
) // Z == 1 iff any in argL is 0
750 | ((zmaskR
== 0) ? 0 : MASK_S
) // S == 1 iff any in argR is 0
751 | ((intRes2
& 1) << SHIFT_O
); // O == IntRes2[0]
755 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
756 variants on 8-bit data.
758 For xSTRI variants, the new ECX value is placed in the 32 bits
759 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
760 variants, the result is a 128 bit value and is placed at *resV in
763 For all variants, the new OSZACP value is placed at *resOSZACP.
765 argLV and argRV are the vector args. The caller must prepare a
766 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
767 must be 1 for each zero byte of the respective arg. For ESTRx
768 variants this is derived from the explicit length indication, and
769 must be 0 in all places except at the bit index corresponding to
770 the valid length (0 .. 16). If the valid length is 16 then the
771 mask must be all zeroes. In all cases, bits 31:16 must be zero.
773 imm8 is the original immediate from the instruction. isxSTRM
774 indicates whether this is a xSTRM or xSTRI variant, which controls
775 how much of *resV is written.
777 If the given imm8 case can be handled, the return value is True.
778 If not, False is returned, and neither *resV nor *resOSZACP are
782 Bool
compute_PCMPxSTRx ( /*OUT*/V128
* resV
,
783 /*OUT*/UInt
* resOSZACP
,
784 V128
* argLV
, V128
* argRV
,
785 UInt zmaskL
, UInt zmaskR
,
786 UInt imm8
, Bool isxSTRM
)
788 vassert(imm8
< 0x80);
789 vassert((zmaskL
>> 16) == 0);
790 vassert((zmaskR
>> 16) == 0);
792 /* Explicitly reject any imm8 values that haven't been validated,
793 even if they would probably work. Life is too short to have
794 unvalidated cases in the code base. */
796 case 0x00: case 0x02:
797 case 0x08: case 0x0A: case 0x0C: case 0x0E:
798 case 0x10: case 0x12: case 0x14:
799 case 0x18: case 0x1A:
800 case 0x30: case 0x34:
801 case 0x38: case 0x3A:
802 case 0x40: case 0x42: case 0x44: case 0x46:
805 case 0x70: case 0x72:
811 UInt fmt
= (imm8
>> 0) & 3; // imm8[1:0] data format
812 UInt agg
= (imm8
>> 2) & 3; // imm8[3:2] aggregation fn
813 UInt pol
= (imm8
>> 4) & 3; // imm8[5:4] polarity
814 UInt idx
= (imm8
>> 6) & 1; // imm8[6] 1==msb/bytemask
816 /*----------------------------------------*/
817 /*-- strcmp on byte data --*/
818 /*----------------------------------------*/
820 if (agg
== 2/*equal each, aka strcmp*/
821 && (fmt
== 0/*ub*/ || fmt
== 2/*sb*/)) {
823 UChar
* argL
= (UChar
*)argLV
;
824 UChar
* argR
= (UChar
*)argRV
;
826 for (i
= 15; i
>= 0; i
--) {
829 boolResII
= (boolResII
<< 1) | (cL
== cR
? 1 : 0);
831 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
832 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
834 // do invalidation, common to all equal-each cases
836 = (boolResII
& validL
& validR
) // if both valid, use cmpres
837 | (~ (validL
| validR
)); // if both invalid, force 1
841 // generate I-format output
842 compute_PCMPxSTRx_gen_output(
844 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
850 /*----------------------------------------*/
851 /*-- set membership on byte data --*/
852 /*----------------------------------------*/
854 if (agg
== 0/*equal any, aka find chars in a set*/
855 && (fmt
== 0/*ub*/ || fmt
== 2/*sb*/)) {
856 /* argL: the string, argR: charset */
858 UChar
* argL
= (UChar
*)argLV
;
859 UChar
* argR
= (UChar
*)argRV
;
861 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
862 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
864 for (si
= 0; si
< 16; si
++) {
865 if ((validL
& (1 << si
)) == 0)
866 // run off the end of the string.
869 for (ci
= 0; ci
< 16; ci
++) {
870 if ((validR
& (1 << ci
)) == 0) break;
871 if (argR
[ci
] == argL
[si
]) { m
= 1; break; }
873 boolRes
|= (m
<< si
);
876 // boolRes is "pre-invalidated"
877 UInt intRes1
= boolRes
& 0xFFFF;
879 // generate I-format output
880 compute_PCMPxSTRx_gen_output(
882 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
888 /*----------------------------------------*/
889 /*-- substring search on byte data --*/
890 /*----------------------------------------*/
892 if (agg
== 3/*equal ordered, aka substring search*/
893 && (fmt
== 0/*ub*/ || fmt
== 2/*sb*/)) {
895 /* argL: haystack, argR: needle */
897 UChar
* argL
= (UChar
*)argLV
;
898 UChar
* argR
= (UChar
*)argRV
;
900 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
901 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
902 for (hi
= 0; hi
< 16; hi
++) {
904 for (ni
= 0; ni
< 16; ni
++) {
905 if ((validR
& (1 << ni
)) == 0) break;
908 if (argL
[i
] != argR
[ni
]) { m
= 0; break; }
910 boolRes
|= (m
<< hi
);
911 if ((validL
& (1 << hi
)) == 0)
912 // run off the end of the haystack
916 // boolRes is "pre-invalidated"
917 UInt intRes1
= boolRes
& 0xFFFF;
919 // generate I-format output
920 compute_PCMPxSTRx_gen_output(
922 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
928 /*----------------------------------------*/
929 /*-- ranges, unsigned byte data --*/
930 /*----------------------------------------*/
932 if (agg
== 1/*ranges*/
935 /* argL: string, argR: range-pairs */
937 UChar
* argL
= (UChar
*)argLV
;
938 UChar
* argR
= (UChar
*)argRV
;
940 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
941 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
942 for (si
= 0; si
< 16; si
++) {
943 if ((validL
& (1 << si
)) == 0)
944 // run off the end of the string
947 for (ri
= 0; ri
< 16; ri
+= 2) {
948 if ((validR
& (3 << ri
)) != (3 << ri
)) break;
949 if (argR
[ri
] <= argL
[si
] && argL
[si
] <= argR
[ri
+1]) {
953 boolRes
|= (m
<< si
);
956 // boolRes is "pre-invalidated"
957 UInt intRes1
= boolRes
& 0xFFFF;
959 // generate I-format output
960 compute_PCMPxSTRx_gen_output(
962 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
968 /*----------------------------------------*/
969 /*-- ranges, signed byte data --*/
970 /*----------------------------------------*/
972 if (agg
== 1/*ranges*/
975 /* argL: string, argR: range-pairs */
977 Char
* argL
= (Char
*)argLV
;
978 Char
* argR
= (Char
*)argRV
;
980 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
981 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
982 for (si
= 0; si
< 16; si
++) {
983 if ((validL
& (1 << si
)) == 0)
984 // run off the end of the string
987 for (ri
= 0; ri
< 16; ri
+= 2) {
988 if ((validR
& (3 << ri
)) != (3 << ri
)) break;
989 if (argR
[ri
] <= argL
[si
] && argL
[si
] <= argR
[ri
+1]) {
993 boolRes
|= (m
<< si
);
996 // boolRes is "pre-invalidated"
997 UInt intRes1
= boolRes
& 0xFFFF;
999 // generate I-format output
1000 compute_PCMPxSTRx_gen_output(
1002 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
1012 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
1013 variants on 16-bit characters.
1015 For xSTRI variants, the new ECX value is placed in the 32 bits
1016 pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
1017 variants, the result is a 128 bit value and is placed at *resV in
1020 For all variants, the new OSZACP value is placed at *resOSZACP.
1022 argLV and argRV are the vector args. The caller must prepare a
1023 8-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
1024 must be 1 for each zero byte of the respective arg. For ESTRx
1025 variants this is derived from the explicit length indication, and
1026 must be 0 in all places except at the bit index corresponding to
1027 the valid length (0 .. 8). If the valid length is 8 then the
1028 mask must be all zeroes. In all cases, bits 31:8 must be zero.
1030 imm8 is the original immediate from the instruction. isxSTRM
1031 indicates whether this is a xSTRM or xSTRI variant, which controls
1032 how much of *resV is written.
1034 If the given imm8 case can be handled, the return value is True.
1035 If not, False is returned, and neither *resV nor *resOSZACP are
1039 Bool
compute_PCMPxSTRx_wide ( /*OUT*/V128
* resV
,
1040 /*OUT*/UInt
* resOSZACP
,
1041 V128
* argLV
, V128
* argRV
,
1042 UInt zmaskL
, UInt zmaskR
,
1043 UInt imm8
, Bool isxSTRM
)
1045 vassert(imm8
< 0x80);
1046 vassert((zmaskL
>> 8) == 0);
1047 vassert((zmaskR
>> 8) == 0);
1049 /* Explicitly reject any imm8 values that haven't been validated,
1050 even if they would probably work. Life is too short to have
1051 unvalidated cases in the code base. */
1054 case 0x01: case 0x03:
1055 case 0x09: case 0x0B: case 0x0D:
1057 case 0x19: case 0x1B:
1058 case 0x39: case 0x3B:
1059 case 0x41: case 0x45:
1066 UInt fmt
= (imm8
>> 0) & 3; // imm8[1:0] data format
1067 UInt agg
= (imm8
>> 2) & 3; // imm8[3:2] aggregation fn
1068 UInt pol
= (imm8
>> 4) & 3; // imm8[5:4] polarity
1069 UInt idx
= (imm8
>> 6) & 1; // imm8[6] 1==msb/bytemask
1071 /*----------------------------------------*/
1072 /*-- strcmp on wide data --*/
1073 /*----------------------------------------*/
1075 if (agg
== 2/*equal each, aka strcmp*/
1076 && (fmt
== 1/*uw*/ || fmt
== 3/*sw*/)) {
1078 UShort
* argL
= (UShort
*)argLV
;
1079 UShort
* argR
= (UShort
*)argRV
;
1081 for (i
= 7; i
>= 0; i
--) {
1082 UShort cL
= argL
[i
];
1083 UShort cR
= argR
[i
];
1084 boolResII
= (boolResII
<< 1) | (cL
== cR
? 1 : 0);
1086 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
1087 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
1089 // do invalidation, common to all equal-each cases
1091 = (boolResII
& validL
& validR
) // if both valid, use cmpres
1092 | (~ (validL
| validR
)); // if both invalid, force 1
1096 // generate I-format output
1097 compute_PCMPxSTRx_gen_output_wide(
1099 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
1105 /*----------------------------------------*/
1106 /*-- set membership on wide data --*/
1107 /*----------------------------------------*/
1109 if (agg
== 0/*equal any, aka find chars in a set*/
1110 && (fmt
== 1/*uw*/ || fmt
== 3/*sw*/)) {
1111 /* argL: the string, argR: charset */
1113 UShort
* argL
= (UShort
*)argLV
;
1114 UShort
* argR
= (UShort
*)argRV
;
1116 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
1117 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
1119 for (si
= 0; si
< 8; si
++) {
1120 if ((validL
& (1 << si
)) == 0)
1121 // run off the end of the string.
1124 for (ci
= 0; ci
< 8; ci
++) {
1125 if ((validR
& (1 << ci
)) == 0) break;
1126 if (argR
[ci
] == argL
[si
]) { m
= 1; break; }
1128 boolRes
|= (m
<< si
);
1131 // boolRes is "pre-invalidated"
1132 UInt intRes1
= boolRes
& 0xFF;
1134 // generate I-format output
1135 compute_PCMPxSTRx_gen_output_wide(
1137 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
1143 /*----------------------------------------*/
1144 /*-- substring search on wide data --*/
1145 /*----------------------------------------*/
1147 if (agg
== 3/*equal ordered, aka substring search*/
1148 && (fmt
== 1/*uw*/ || fmt
== 3/*sw*/)) {
1150 /* argL: haystack, argR: needle */
1152 UShort
* argL
= (UShort
*)argLV
;
1153 UShort
* argR
= (UShort
*)argRV
;
1155 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
1156 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
1157 for (hi
= 0; hi
< 8; hi
++) {
1159 for (ni
= 0; ni
< 8; ni
++) {
1160 if ((validR
& (1 << ni
)) == 0) break;
1163 if (argL
[i
] != argR
[ni
]) { m
= 0; break; }
1165 boolRes
|= (m
<< hi
);
1166 if ((validL
& (1 << hi
)) == 0)
1167 // run off the end of the haystack
1171 // boolRes is "pre-invalidated"
1172 UInt intRes1
= boolRes
& 0xFF;
1174 // generate I-format output
1175 compute_PCMPxSTRx_gen_output_wide(
1177 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
1183 /*----------------------------------------*/
1184 /*-- ranges, unsigned wide data --*/
1185 /*----------------------------------------*/
1187 if (agg
== 1/*ranges*/
1188 && fmt
== 1/*uw*/) {
1190 /* argL: string, argR: range-pairs */
1192 UShort
* argL
= (UShort
*)argLV
;
1193 UShort
* argR
= (UShort
*)argRV
;
1195 UInt validL
= ~(zmaskL
| -zmaskL
); // not(left(zmaskL))
1196 UInt validR
= ~(zmaskR
| -zmaskR
); // not(left(zmaskR))
1197 for (si
= 0; si
< 8; si
++) {
1198 if ((validL
& (1 << si
)) == 0)
1199 // run off the end of the string
1202 for (ri
= 0; ri
< 8; ri
+= 2) {
1203 if ((validR
& (3 << ri
)) != (3 << ri
)) break;
1204 if (argR
[ri
] <= argL
[si
] && argL
[si
] <= argR
[ri
+1]) {
1208 boolRes
|= (m
<< si
);
1211 // boolRes is "pre-invalidated"
1212 UInt intRes1
= boolRes
& 0xFF;
1214 // generate I-format output
1215 compute_PCMPxSTRx_gen_output_wide(
1217 intRes1
, zmaskL
, zmaskR
, validL
, pol
, idx
, isxSTRM
1227 /*---------------------------------------------------------------*/
1228 /*--- end guest_generic_x87.c ---*/
1229 /*---------------------------------------------------------------*/