/*---------------------------------------------------------------*/
/*--- begin                               guest_ppc_helpers.c ---*/
/*---------------------------------------------------------------*/
/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_ppc32.h"
#include "libvex_guest_ppc64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_ppc_defs.h"
/* This file contains helper functions for ppc32 and ppc64 guest code.
   Calls to these functions are generated by the back end.  These
   calls are of course in the host machine code and this file will be
   compiled to host machine code, so that all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest_ppc_toIR.c.
*/
/*---------------------------------------------------------------*/
/*--- Misc integer helpers.                                   ---*/
/*---------------------------------------------------------------*/
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-ppc platforms, return 1. */
/* Reads a complete, consistent 64-bit TB value. */
ULong ppcg_dirtyhelper_MFTB ( void )
{
#  if defined(__powerpc__)
   ULong res;
   UInt  lo, hi1, hi2;
   while (1) {
      /* Re-read the upper half until it is stable across the read of
         the lower half. */
      __asm__ __volatile__ ("\n"
         "\tmftbu %0\n"
         "\tmftb %1\n"
         "\tmftbu %2\n"
         : "=r" (hi1), "=r" (lo), "=r" (hi2));
      if (hi1 == hi2) break;
   }
   res = ((ULong)hi1) << 32;
   res |= (ULong)lo;
   return res;
#  else
   return 1ULL;
#  endif
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially transparent) */
UInt ppc32g_dirtyhelper_MFSPR_268_269 ( UInt r269 )
{
#  if defined(__powerpc__)
   UInt spr;
   if (r269) {
      __asm__ __volatile__("mfspr %0,269" : "=b"(spr));
   } else {
      __asm__ __volatile__("mfspr %0,268" : "=b"(spr));
   }
   return spr;
#  else
   return 0;
#  endif
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (I'm not really sure what the side effects are) */
UInt ppc32g_dirtyhelper_MFSPR_287 ( void )
{
#  if defined(__powerpc__)
   UInt spr;
   __asm__ __volatile__("mfspr %0,287" : "=b"(spr));
   return spr;
#  else
   return 0;
#  endif
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
void ppc32g_dirtyhelper_LVS ( VexGuestPPC32State* gst,
                              UInt vD_off, UInt sh, UInt shift_right )
{
  UChar ref[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
  U128* pU128_src;
  U128* pU128_dst;

  vassert( vD_off      <= sizeof(VexGuestPPC32State)-8 );
  vassert( sh          <= 15 );
  vassert( shift_right <= 1 );

  if (shift_right)
     sh = 16 - sh;
  /* else shift left */

  pU128_src = (U128*)&ref[sh];
  pU128_dst = (U128*)( ((UChar*)gst) + vD_off );

  (*pU128_dst)[0] = (*pU128_src)[0];
  (*pU128_dst)[1] = (*pU128_src)[1];
  (*pU128_dst)[2] = (*pU128_src)[2];
  (*pU128_dst)[3] = (*pU128_src)[3];
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
void ppc64g_dirtyhelper_LVS ( VexGuestPPC64State* gst,
                              UInt vD_off, UInt sh, UInt shift_right,
                              UInt endness )
{
  UChar ref[32];
  ULong i;
  U128* pU128_src;
  U128* pU128_dst;

  /* ref[] used to be a static const array, but this doesn't work on
     ppc64 because VEX doesn't load the TOC pointer for the call here,
     and so we wind up picking up some totally random other data.
     (It's a wonder we don't segfault.)  So, just to be clear, this
     "fix" (vex r2073) is really a kludgearound for the fact that
     VEX's 64-bit ppc code generation doesn't provide a valid TOC
     pointer for helper function calls.  Ick.  (Bug 250038) */
  for (i = 0; i < 32; i++) ref[i] = i;

  vassert( vD_off      <= sizeof(VexGuestPPC64State)-8 );
  vassert( sh          <= 15 );
  vassert( shift_right <= 1 );

  if (shift_right)
     sh = 16 - sh;
  /* else shift left */

  pU128_src = (U128*)&ref[sh];
  pU128_dst = (U128*)( ((UChar*)gst) + vD_off );

  if ((0x1 & endness) == 0x0) {
     /* Little-endian host: copy with the byte order reversed. */
     unsigned char *srcp, *dstp;
     Int k;
     srcp = (unsigned char *)pU128_src;
     dstp = (unsigned char *)pU128_dst;
     for (k = 15; k >= 0; k--, srcp++)
        dstp[k] = *srcp;
  } else {
     (*pU128_dst)[0] = (*pU128_src)[0];
     (*pU128_dst)[1] = (*pU128_src)[1];
     (*pU128_dst)[2] = (*pU128_src)[2];
     (*pU128_dst)[3] = (*pU128_src)[3];
  }
}
/* Helper-function specialiser. */

IRExpr* guest_ppc32_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int n_precedingStmts )
{
   return NULL;
}

IRExpr* guest_ppc64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int n_precedingStmts )
{
   return NULL;
}
/* 16-bit floating point number is stored in the lower 16-bits of 32-bit value */
#define I16_EXP_MASK       0x7C00
#define I16_FRACTION_MASK  0x03FF
#define I32_EXP_MASK       0x7F800000
#define I32_FRACTION_MASK  0x007FFFFF
#define I64_EXP_MASK       0x7FF0000000000000ULL
#define I64_FRACTION_MASK  0x000FFFFFFFFFFFFFULL
#define V128_EXP_MASK      0x7FFF000000000000ULL
#define V128_FRACTION_MASK 0x0000FFFFFFFFFFFFULL  /* upper 64-bit fractional mask */
ULong generate_C_FPCC_helper( ULong irType, ULong src_hi, ULong src )
{
   UInt NaN, inf, zero, norm, dnorm, pos;
   UInt bit0, bit1, bit2, bit3;
   UInt sign_bit = 0;
   ULong exp_mask = 0, exp_part = 0, frac_part = 0;
   ULong fpcc, c;

   if ( irType == Ity_I16 ) {
      frac_part = I16_FRACTION_MASK & src;
      exp_mask = I16_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 15;

   } else if ( irType == Ity_I32 ) {
      frac_part = I32_FRACTION_MASK & src;
      exp_mask = I32_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 31;

   } else if ( irType == Ity_I64 ) {
      frac_part = I64_FRACTION_MASK & src;
      exp_mask = I64_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 63;

   } else if ( irType == Ity_F128 ) {
      /* only care if the frac part is zero or non-zero */
      frac_part = (V128_FRACTION_MASK & src_hi) | src;
      exp_mask = V128_EXP_MASK;
      exp_part = exp_mask & src_hi;
      sign_bit = src_hi >> 63;

   } else {
      vassert(0);  // Unknown value of irType
   }

   pos = (sign_bit == 0) ? 1 : 0;

   /* NaN: exponent is all ones, fractional part not zero */
   if ((exp_part == exp_mask) && (frac_part != 0))
      NaN = 1;
   else
      NaN = 0;

   /* inf: exponent all 1's, fraction part is zero */
   if ((exp_part == exp_mask) && (frac_part == 0))
      inf = 1;
   else
      inf = 0;

   /* zero: exponent is 0, fraction part is zero */
   if ((exp_part == 0) && (frac_part == 0))
      zero = 1;
   else
      zero = 0;

   /* norm: exponent is not 0, exponent is not all 1's */
   if ((exp_part != 0) && (exp_part != exp_mask))
      norm = 1;
   else
      norm = 0;

   /* dnorm: exponent is all 0's, fraction is not 0 */
   if ((exp_part == 0) && (frac_part != 0))
      dnorm = 1;
   else
      dnorm = 0;

   bit0 = NaN | inf;

   /* If the result is NaN then must force bits 1, 2 and 3 to zero
    * to get correct result.
    */
   bit1 = (!NaN) & zero;
   bit2 = (!NaN) & ((pos & dnorm) | (pos & norm) | (pos & inf))
      & ((!zero) & (!NaN));
   bit3 = (!NaN) & (((!pos) & dnorm) | ((!pos) & norm) | ((!pos) & inf))
      & ((!zero) & (!NaN));

   fpcc = (bit3 << 3) | (bit2 << 2) | (bit1 << 1) | bit0;

   c = NaN | ((!pos) & dnorm) | ((!pos) & zero) | (pos & dnorm );

   /* return C in the upper 32-bits and FPCC in the lower 32 bits */
   return (c << 32) | fpcc;
}
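
/* Illustrative sketch (not part of the original source): a 64-bit
   +infinity (0x7FF0000000000000) has an all-ones exponent and a zero
   fraction, so the helper above should report C = 0 and FPCC = 0b0101.
   The example function name is hypothetical.

   void example_generate_C_FPCC ( void )
   {
      vassert( generate_C_FPCC_helper( Ity_I64, 0,
                                       0x7FF0000000000000ULL ) == 0x5 );
   }
*/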
UInt generate_DFP_FPRF_value_helper( UInt gfield,
                                     Int  exponent,
                                     UInt exponent_bias,
                                     Int  min_norm_exp,
                                     UInt sign,
                                     UInt T_value_is_zero )
{
   UInt fprf_value;
   UInt gfield_5_bit_mask = 0xF8000000;
   UInt gfield_upper_5_bits = (gfield & gfield_5_bit_mask) >> (32 - 5);
   UInt gfield_6_bit_mask = 0xFC000000;
   UInt gfield_upper_6_bits = (gfield & gfield_6_bit_mask) >> (32 - 6);
   Int unbiased_exponent = exponent - exponent_bias;

   /* The assumption is the gfield bits are left justified.  Mask off
      the most significant 5-bits in the 32-bit wide field.  */
   if ( T_value_is_zero == 1) {
      if (sign == 0)
         fprf_value = 0b00010;   // positive zero
      else
         fprf_value = 0b10010;   // negative zero

   } else if ( unbiased_exponent < min_norm_exp ) {
      if (sign == 0)
         fprf_value = 0b10100;   // positive subnormal
      else
         fprf_value = 0b11000;   // negative subnormal

   } else if ( gfield_upper_5_bits == 0b11110 ) {   // infinity
      if (sign == 0)
         fprf_value = 0b00101;   // positive infinity
      else
         fprf_value = 0b01001;   // negative infinity

   } else if ( gfield_upper_6_bits == 0b111110 ) {
      fprf_value = 0b10001;      // Quiet NaN

   } else if ( gfield_upper_6_bits == 0b111111 ) {
      fprf_value = 0b10001;      // Signaling NaN

   } else {
      if (sign == 0)
         fprf_value = 0b00100;   // positive normal
      else
         fprf_value = 0b01000;   // negative normal
   }

   return fprf_value;
}
/*---------------------------------------------------------------*/
/*--- Misc BCD clean helpers.                                 ---*/
/*---------------------------------------------------------------*/

/* NOTE, the clean and dirty helpers need to be called using the
 * fnptr_to_fnentry() function wrapper to handle the Big Endian
 * pointer-to-function ABI and the Little Endian ABI.
 */
/* This C-helper takes a 128-bit BCD value as two 64-bit pieces.
 * It checks the string to see if it is a valid 128-bit BCD value.
 * A valid BCD value has a sign value in bits [3:0] between 0xA
 * and 0xF inclusive.  Each of the BCD digits, represented as a 4-bit
 * hex number in bits BCD value[128:4], must be between 0 and 9
 * inclusive.  Returns an unsigned 64-bit value: 1 if valid, 0 if not.
 */
ULong is_BCDstring128_helper( ULong Signed, ULong bcd_string_hi,
                              ULong bcd_string_low ) {
   UInt i;
   ULong valid_bcd, sign_valid = False;
   ULong digit;
   ULong sign;

   if ( Signed == True ) {
      sign = bcd_string_low & 0xF;
      if( ( sign >= 0xA ) && ( sign <= 0xF ) )
         sign_valid = True;

      /* Change the sign digit to a zero
       * so the for loop below works the same
       * for signed and unsigned BCD strings
       */
      bcd_string_low &= 0xFFFFFFFFFFFFFFF0ULL;

   } else {
      sign_valid = True;  /* set sign to True so result is only
                             based on the validity of the digits */
   }

   valid_bcd = True;  // Assume true to start
   for( i = 0; i < 32; i++ ) {
      /* check high and low 64-bit strings in parallel */
      digit = bcd_string_low & 0xF;
      if ( digit > 0x9 )
         valid_bcd = False;
      bcd_string_low = bcd_string_low >> 4;

      digit = bcd_string_hi & 0xF;
      if ( digit > 0x9 )
         valid_bcd = False;
      bcd_string_hi = bcd_string_hi >> 4;
   }

   return valid_bcd & sign_valid;
}
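
/* Illustrative sketch (not part of the original source): a minimal
   caller-side check of is_BCDstring128_helper.  0xD is a valid negative
   sign nibble, while any digit nibble above 9 invalidates the string.
   The example function name is hypothetical.

   void example_is_BCDstring128 ( void )
   {
      vassert( is_BCDstring128_helper( True, 0, 0x123DULL ) == 1 );
      vassert( is_BCDstring128_helper( True, 0, 0x1B3DULL ) == 0 );
   }
*/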
/* This clean helper takes a signed 32-bit BCD value and a carry in
 * and adds 1 to the value of the BCD value.  The BCD value is passed
 * in as a single 64-bit value.  The incremented value is returned in
 * the lower 32 bits of the result.  If the input was signed the sign of
 * the result is the same as the input.  The carry out is returned in
 * bits [35:32] of the result.
 */
ULong increment_BCDstring32_helper( ULong Signed,
                                    ULong bcd_string, ULong carry_in ) {
   UInt i, num_digits = 8;
   ULong bcd_value, result = 0;
   ULong carry, digit, new_digit;

   carry = carry_in;

   if ( Signed == True ) {
      bcd_value = bcd_string >> 4;   /* remove sign */
      num_digits = num_digits - 1;
   } else {
      bcd_value = bcd_string;
   }

   for( i = 0; i < num_digits; i++ ) {
      digit = bcd_value & 0xF;
      bcd_value = bcd_value >> 4;
      new_digit = digit + carry;

      if ( new_digit >= 10 ) {
         carry = 1;
         new_digit = new_digit - 10;
      } else {
         carry = 0;
      }
      result = result | (new_digit << (i*4) );
   }

   if ( Signed == True ) {
      result = ( carry << 32) | ( result << 4 ) | ( bcd_string & 0xF );
   } else {
      result = ( carry << 32) | result;
   }

   return result;
}
/*---------------------------------------------------------------*/
/*--- Misc packed decimal clean helpers.                      ---*/
/*---------------------------------------------------------------*/
/* This C-helper takes a 64-bit packed decimal value stored in a
 * 64-bit value.  It converts it to the zoned decimal format.  The lower
 * byte may contain a sign value, set it to zero.  If return_upper
 * is zero, return lower 64 bits of result, otherwise return upper
 * 64 bits of the result.
 */
ULong convert_to_zoned_helper( ULong src_hi, ULong src_low,
                               ULong upper_byte, ULong return_upper ) {
   UInt i, sh;
   ULong tmp = 0, new_value;

   /* Remove the sign from the source.  Put in the upper byte of result.
    * Sign inserted later.
    */
   if ( return_upper == 0 ) {  /* return lower 64-bit result */
      for(i = 0; i < 7; i++) {
         sh = ( 8 - i ) * 4;
         new_value = ( ( src_low >> sh ) & 0xf ) | upper_byte;
         tmp = tmp | ( new_value << ( ( 7 - i ) * 8 ) );
      }

   } else {
      /* Byte for i=0 is in upper 64-bit of the source, do it separately */
      new_value = ( src_hi & 0xf ) | upper_byte;
      tmp = tmp | new_value << 56;

      for( i = 1; i < 8; i++ ) {
         sh = ( 16 - i ) * 4;
         new_value = ( ( src_low >> sh ) & 0xf ) | upper_byte;
         tmp = tmp | ( new_value << ( ( 7 - i ) * 8 ) );
      }
   }
   return tmp;
}
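
/* Illustrative sketch (not part of the original source): with
   return_upper == 1 the digit in src_hi bits [3:0] lands in byte 7 of
   the upper result, ORed with the zone byte (here 0x30, an ASCII-style
   zone).  The example function name is hypothetical.

   void example_convert_to_zoned ( void )
   {
      ULong upper = convert_to_zoned_helper( 0x1ULL, 0x0ULL, 0x30ULL, 1 );
      vassert( (upper >> 56) == 0x31 );
   }
*/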
/* This C-helper takes the lower 64-bits of the 128-bit packed decimal
 * src value.  It converts the src value to a 128-bit national format.
 * If return_upper is zero, the helper returns lower 64 bits of result,
 * otherwise it returns the upper 64-bits of the result.
 */
ULong convert_to_national_helper( ULong src, ULong return_upper ) {
   UInt i;
   UInt sh = 3, max = 4, min = 0;  /* initialize max, min for return upper */
   ULong tmp = 0, new_value;

   if ( return_upper == 0 ) {  /* return lower 64-bit result */
      min = 4;
      max = 8;
      sh  = 7;
   }

   for( i = min; i < max; i++ ) {
      new_value = ( ( src >> ( ( 7 - i ) * 4 ) ) & 0xf ) | 0x0030;
      tmp = tmp | ( new_value << ( ( sh - i ) * 16 ) );
   }
   return tmp;
}
/* This C-helper takes a 128-bit zoned value stored in a 128-bit
 * value.  It converts it to the packed 64-bit decimal format without a
 * sign value.  The sign is supposed to be in bits [3:0] and the packed
 * value in bits [67:4].  This helper leaves it to the caller to put the
 * result into a V128 and shift the returned value over and put the sign
 * in.
 */
ULong convert_from_zoned_helper( ULong src_hi, ULong src_low ) {
   UInt i;
   ULong tmp = 0, nibble;

   /* Unroll the i = 0 iteration so the sizes of the loop for the upper
    * and lower extraction match.  Skip sign in least significant byte.
    */
   nibble = ( src_hi >> 56 ) & 0xF;
   tmp = tmp | ( nibble << 60 );

   for( i = 1; i < 8; i++ ) {
      /* get the high nibbles, put into result */
      nibble = ( src_hi >> ( ( 7 - i ) * 8 ) ) & 0xF;
      tmp = tmp | ( nibble << ( ( 15 - i ) * 4 ) );

      /* get the low nibbles, put into result */
      nibble = ( src_low >> ( ( 8 - i ) * 8 ) ) & 0xF;
      tmp = tmp | ( nibble << ( ( 8 - i ) * 4 ) );
   }
   return tmp;
}
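
/* Illustrative sketch (not part of the original source): the zoned
   bytes 0x31..0x37 in the low half pack to digits 1..7; the lowest byte
   (the sign position) is skipped and nibble 0 of the result stays zero.
   The example function name is hypothetical.

   void example_convert_from_zoned ( void )
   {
      vassert( convert_from_zoned_helper( 0x0ULL, 0x3132333435363738ULL )
               == 0x12345670ULL );
   }
*/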
/* This C-helper takes a 128-bit national value stored in a 128-bit
 * value.  It converts it to a signless packed 64-bit decimal format.
 */
ULong convert_from_national_helper( ULong src_hi, ULong src_low ) {
   UInt i;
   ULong tmp = 0, hword;

   src_low = src_low & 0xFFFFFFFFFFFFFFF0ULL;  /* remove the sign */

   for( i = 0; i < 4; i++ ) {
      /* get the high half-word, put into result */
      hword = ( src_hi >> ( ( 3 - i ) * 16 ) ) & 0xF;
      tmp = tmp | ( hword << ( ( 7 - i ) * 4 ) );

      /* get the low half-word, put into result */
      hword = ( src_low >> ( ( 3 - i ) * 16 ) ) & 0xF;
      tmp = tmp | ( hword << ( ( 3 - i ) * 4 ) );
   }
   return tmp;
}
/*------------------------------------------------*/
/*--- Population count ---------------------------*/
/*------------------------------------------------*/
ULong population_count64_helper( ULong src ) {
   /* Fast population count based on the algorithm in "Hacker's Delight"
      by Henry S. Warren. */
   src = (src & 0x5555555555555555) + ((src >> 1) & 0x5555555555555555);
   src = (src & 0x3333333333333333) + ((src >> 2) & 0x3333333333333333);
   src = (src & 0x0F0F0F0F0F0F0F0F) + ((src >> 4) & 0x0F0F0F0F0F0F0F0F);
   src = (src & 0x00FF00FF00FF00FF) + ((src >> 8) & 0x00FF00FF00FF00FF);
   src = (src & 0x0000FFFF0000FFFF) + ((src >> 16) & 0x0000FFFF0000FFFF);
   src = (src & 0x00000000FFFFFFFF) + ((src >> 32) & 0x00000000FFFFFFFF);
   return src;
}
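
/* Illustrative sketch (not part of the original source): each step of
   the reduction above doubles the field width while adding neighbouring
   fields, so six steps leave the full 64-bit population count in the
   low bits.  The example function name is hypothetical.

   void example_population_count64 ( void )
   {
      vassert( population_count64_helper( 0x0ULL ) == 0 );
      vassert( population_count64_helper( 0xFFULL ) == 8 );
      vassert( population_count64_helper( 0x8000000000000001ULL ) == 2 );
   }
*/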
/*------------------------------------------------*/
/*---- Extract/Deposit bits under mask helpers ---*/
/*------------------------------------------------*/
ULong extract_bits_under_mask_helper( ULong src, ULong mask, UInt flag ) {
   UInt i;
   ULong ones, zeros, mask_bit, bit_src;

   zeros = 0;
   ones = 0;

   for (i=0; i<64; i++){
      mask_bit = 0x1 & (mask >> (63-i));
      bit_src = 0x1 & (src >> (63-i));

      /* Collect the bits under the mask into ones */
      ones = ones << mask_bit;
      ones = ones | (mask_bit & bit_src);

      /* Collect the bits not under the mask into zeros */
      zeros = zeros << (1^mask_bit);
      zeros = zeros | ((1^mask_bit) & bit_src);
   }

   if (flag == 1)
      return ones;
   else
      return zeros;
}

UInt count_bits_under_mask_helper( ULong src, ULong mask, UInt flag ) {
   UInt i, count_extracted_1, count_extracted_0;
   ULong mask_bit;

   count_extracted_1 = 0;
   count_extracted_0 = 0;

   for (i=0; i<64; i++){
      mask_bit = 0x1 & (mask >> (63-i));

      if (mask_bit == 1)
         count_extracted_1++;

      if ((1^mask_bit) == 1)
         count_extracted_0++;
   }

   if (flag == 1)
      return count_extracted_1;

   return count_extracted_0;
}
ULong deposit_bits_under_mask_helper( ULong src, ULong mask ) {
   UInt i, src_bit_pos;
   ULong result, mask_bit, bit_src;

   result = 0;
   src_bit_pos = 0;

   for (i=0; i<64; i++){
      mask_bit = 0x1 & (mask >> i);

      if (mask_bit == 1) {
         bit_src = 0x1 & (src >> src_bit_pos);
         result = result | (bit_src << i);
         src_bit_pos++;
      }
   }

   return result;
}
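
/* Illustrative sketch (not part of the original source): deposit is the
   inverse of extract for the bits selected by the mask.  The example
   function name is hypothetical.

   void example_extract_deposit ( void )
   {
      // scatter the low 4 source bits into bit positions [7:4]
      vassert( deposit_bits_under_mask_helper( 0xFULL, 0xF0ULL ) == 0xF0ULL );
      // gather them back
      vassert( extract_bits_under_mask_helper( 0xF0ULL, 0xF0ULL, 1 ) == 0xFULL );
   }
*/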
/*----------------------------------------------*/
/*--- Vector Evaluate Inst helper --------------*/
/*----------------------------------------------*/
/* This is a 64-bit version of the VSX Vector Evaluate
   instruction xxeval.  */

ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC,
                                ULong IMM ) {
#define MAX_BITS 64
#define MAX_IMM_BITS 8
   UInt i;
   ULong select, bitIMM;
   ULong bitA, bitB, bitC, result;

   result = 0;

   for (i=0; i<MAX_BITS; i++){
      bitA = 0x1 & (srcA >> i);
      bitB = 0x1 & (srcB >> i);
      bitC = 0x1 & (srcC >> i);

      /* The value of select is IBM numbering based, i.e. MSB is bit 0 */
      select = (bitA << 2) | (bitB << 1) | bitC;
      bitIMM = (IMM >> (MAX_IMM_BITS - 1 - select)) & 0x1;
      result = result | (bitIMM << i);
   }
   return result;
}
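
/* Illustrative sketch (not part of the original source): IMM is an
   8-entry truth table indexed by (bitA, bitB, bitC) in IBM bit order,
   so IMM = 0x01 selects only the A=B=C=1 entry, i.e. a three-way AND.
   The example function name is hypothetical.

   void example_vector_evaluate64 ( void )
   {
      vassert( vector_evaluate64_helper( 0xF0ULL, 0xCCULL, 0xAAULL, 0x01 )
               == (0xF0ULL & 0xCCULL & 0xAAULL) );
   }
*/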
/*---------------------------------------------------------------*/
/* --- Clean helper for vbpermq instruction                  --- */
/*---------------------------------------------------------------*/
UInt vbpermq_clean_helper( ULong vA_high, ULong vA_low, ULong vB ) {
   UInt i;
   ULong index, bit, result = 0x0;

   /* IBM numbering: bit 0 is MSB, bit 63 is LSB */
   for ( i = 0; i < 8; i++) {
      index = 0xFFULL & (vB >> (56 - 8*i) );

      if (index < 64) {
         bit = 0x1 & (vA_high >> (63 - index));

      } else if (index < 128) {
         bit = 0x1 & (vA_low >> (127 - index));

      } else {
         bit = 0;
      }

      result |= bit << (7 - i);
   }
   return result;
}
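
/* Illustrative sketch (not part of the original source): each byte of
   vB is an IBM-numbered bit index into the 128-bit vA value; with all
   eight indices 0, every selected bit is the MSB of vA_high.  The
   example function name is hypothetical.

   void example_vbpermq ( void )
   {
      vassert( vbpermq_clean_helper( 0x8000000000000000ULL, 0x0ULL,
                                     0x0ULL ) == 0xFF );
   }
*/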
/*--------------------------------------------------*/
/*---- VSX Vector Generate PCV from Mask helpers ---*/
/*--------------------------------------------------*/
static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset,
                             ULong *vsx_entry)
{
   U128* pU128_dst;

   pU128_dst = (U128*) (((UChar*) gst) + reg_offset);

   /* The U128 type is defined as an array of unsigned integers. */
   /* Writing in LE order */
   (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF);
   (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32);
   (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF);
   (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
                                            ULong src_hi, ULong src_lo,
                                            UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register.  */
   UInt i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering zero on left
      and N-1 on right.  The loop index is converted to "i" to match the
      algorithm for clarity of matching the C code to the algorithm in
      the ISA.  */

   if (imm == 0b00) {    // big endian expansion
      for( index = 0; index < 16; index++) {
         i = 15 - index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= j << shift_by;
            j++;
         } else {
            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse byte elements in a source vector specified
         by the byte-element mask in VSR[VRB+32] into the leftmost byte
         elements of a result vector.  */
      for( index = 0; index < 16; index++) {
         i = 15 - index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 8)
               result[1] |= (index) << (15 - j)*8;
            else
               result[0] |= (index) << (7 - j)*8;
            j++;
         }
      }
      /* The algorithm says set to undefined, leave as 0
      for( index = 3 - j; index < 4; index++) {
         result |= (0 << (index*8));
      }  */

   } else if (imm == 0b10) {   //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost byte elements of a source vector into the
         byte elements of a result vector specified by the byte-element mask
         in VSR[VRB+32].  */
      for( index = 0; index < 16; index++) {
         i = index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         /* mod shift amount by 8 since src is either the upper or lower
            64-bits.  */
         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= j << shift_by;
            j++;
         } else {
            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
         }
      }

   } else if (imm == 0b11) {   //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse byte elements in a source vector specified
         by the byte-element mask in VSR[VRB+32] into the rightmost byte
         elements of a result vector.  */
      for( index = 0; index < 16; index++) {
         i = index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 8)
               result[0] |= (index) << (j-8)*8;
            else
               result[1] |= (index) << j*8;
            j++;
         }
      }

      /* The algorithm says set to undefined, leave as 0
      for( index = 3 - j; index < 4; index++) {
         result |= (0 << (index*8));
      }  */

   } else {
      vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
                                             ULong src_hi, ULong src_lo,
                                             UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register.  */
   UInt i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering zero on left
      and N-1 on right.  The loop index is converted to "i" to match the
      algorithm for clarity of matching the C code to the algorithm in
      the ISA.  */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost halfword elements of a source vector into
         the halfword elements of a result vector specified by the halfword-
         element mask in VSR[VRB+32].  */
      for( index = 0; index < 8; index++) {
         i = 7 - index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            // half-word i, byte 0
            result[half_sel] |= (2*j + 0x0) << (shift_by+8);
            // half-word i, byte 1
            result[half_sel] |= (2*j + 0x1) << shift_by;
            j++;
         } else {
            result[half_sel] |= (2*index + 0x10) << (shift_by+8);
            result[half_sel] |= (2*index + 0x11) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse halfword elements in a source vector
         specified by the halfword-element mask in VSR[VRB+32] into the
         leftmost halfword elements of a result vector.  */
      for( index = 0; index < 8; index++) {
         i = 7 - index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 4) {
               // half-word i, byte 0
               result[1] |= (2*index + 0x0) << ((7 - j)*16 + 8);
               // half-word i, byte 1
               result[1] |= (2*index + 0x1) << ((7 - j)*16);
            } else {
               // half-word i, byte 0
               result[0] |= (2*index + 0x0) << ((3 - j)*16 + 8);
               // half-word i, byte 1
               result[0] |= (2*index + 0x1) << ((3 - j)*16);
            }
            j++;
         }
      }

   } else if (imm == 0b10) {   //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost halfword elements of a source vector into
         the halfword elements of a result vector specified by the halfword-
         element mask in VSR[VRB+32].  */
      for( index = 0; index < 8; index++) {
         i = index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            // half-word i, byte 0
            result[half_sel] |= (2*j + 0x00) << shift_by;
            // half-word i, byte 1
            result[half_sel] |= (2*j + 0x01) << (shift_by+8);
            j++;
         } else {
            // half-word i, byte 0
            result[half_sel] |= (2*index + 0x10) << shift_by;
            // half-word i, byte 1
            result[half_sel] |= (2*index + 0x11) << (shift_by+8);
         }
      }

   } else if (imm == 0b11) {   //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse halfword elements in a source vector
         specified by the halfword-element mask in VSR[VRB+32] into the
         rightmost halfword elements of a result vector.  */
      for( index = 0; index < 8; index++) {
         i = index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 4) {
               // half-word j, byte 0
               result[0] |= (2*index + 0x0) << ((j-4)*16);
               // half-word j, byte 1
               result[0] |= (2*index + 0x1) << ((j-4)*16+8);
            } else {
               // half-word j, byte 0
               result[1] |= (2*index + 0x0) << (j*16);
               // half-word j, byte 1
               result[1] |= (2*index + 0x1) << ((j*16)+8);
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_hword_dirty_mask_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
                                            ULong src_hi, ULong src_lo,
                                            UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register.  */
   UInt i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering zero on left
      and N-1 on right.  The loop index is converted to "i" to match the
      algorithm for clarity of matching the C code to the algorithm in
      the ISA.  */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost word elements of a source vector into the
         word elements of a result vector specified by the word-element mask
         in VSR[VRB+32].  */
      for( index = 0; index < 4; index++) {
         i = 3 - index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (4*j+0) << (shift_by+24);   // word i, byte 0
            result[half_sel] |= (4*j+1) << (shift_by+16);   // word i, byte 1
            result[half_sel] |= (4*j+2) << (shift_by+8);    // word i, byte 2
            result[half_sel] |= (4*j+3) << shift_by;        // word i, byte 3
            j++;
         } else {
            result[half_sel] |= (4*index + 0x10) << (shift_by+24);
            result[half_sel] |= (4*index + 0x11) << (shift_by+16);
            result[half_sel] |= (4*index + 0x12) << (shift_by+8);
            result[half_sel] |= (4*index + 0x13) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse word elements in a source vector specified
         by the word-element mask in VSR[VRB+32] into the leftmost word
         elements of a result vector.  */
      for( index = 0; index < 4; index++) {
         i = 3 - index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 2) {
               // word j, byte 0
               result[1] |= (4*index+0) << ((3 - j)*32 + 24);
               // word j, byte 1
               result[1] |= (4*index+1) << ((3 - j)*32 + 16);
               // word j, byte 2
               result[1] |= (4*index+2) << ((3 - j)*32 + 8);
               // word j, byte 3
               result[1] |= (4*index+3) << ((3 - j)*32 + 0);
            } else {
               result[0] |= (4*index+0) << ((1 - j)*32 + 24);
               result[0] |= (4*index+1) << ((1 - j)*32 + 16);
               result[0] |= (4*index+2) << ((1 - j)*32 + 8);
               result[0] |= (4*index+3) << ((1 - j)*32 + 0);
            }
            j++;
         }
      }

   } else if (imm == 0b10) {   //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost word elements of a source vector into the
         word elements of a result vector specified by the word-element mask
         in VSR[VRB+32].  */
      for( index = 0; index < 4; index++) {
         i = index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (4*j+0) << (shift_by + 0);    // word j, byte 0
            result[half_sel] |= (4*j+1) << (shift_by + 8);    // word j, byte 1
            result[half_sel] |= (4*j+2) << (shift_by + 16);   // word j, byte 2
            result[half_sel] |= (4*j+3) << (shift_by + 24);   // word j, byte 3
            j++;
         } else {
            result[half_sel] |= (4*index + 0x10) << (shift_by + 0);
            result[half_sel] |= (4*index + 0x11) << (shift_by + 8);
            result[half_sel] |= (4*index + 0x12) << (shift_by + 16);
            result[half_sel] |= (4*index + 0x13) << (shift_by + 24);
         }
      }

   } else if (imm == 0b11) {   //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse word elements in a source vector specified
         by the word-element mask in VSR[VRB+32] into the rightmost word
         elements of a result vector.  */
      for( index = 0; index < 4; index++) {
         i = index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 2) {
               // word j, byte 0
               result[0] |= (4*index + 0x0) << ((j-2)*32+0);
               // word j, byte 1
               result[0] |= (4*index + 0x1) << ((j-2)*32+8);
               // word j, byte 2
               result[0] |= (4*index + 0x2) << ((j-2)*32+16);
               // word j, byte 3
               result[0] |= (4*index + 0x3) << ((j-2)*32+24);
            } else {
               result[1] |= (4*index + 0x0) << (j*32+0);
               result[1] |= (4*index + 0x1) << (j*32+8);
               result[1] |= (4*index + 0x2) << (j*32+16);
               result[1] |= (4*index + 0x3) << (j*32+24);
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
                                             ULong src_hi, ULong src_lo,
                                             UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register.  */
   UInt sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong i, j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering zero on left
      and N-1 on right.  The loop index is converted to "i" to match the
      algorithm for clarity of matching the C code to the algorithm in
      the ISA.  */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost doubleword elements of a source vector into
         the doubleword elements of a result vector specified by the
         doubleword-element mask in VSR[VRB+32].  */
      for( index = 0; index < 2; index++) {
         i = 1 - index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (8*j + 0x0) << 56;   // dword i, byte 0
            result[half_sel] |= (8*j + 0x1) << 48;   // dword i, byte 1
            result[half_sel] |= (8*j + 0x2) << 40;   // dword i, byte 2
            result[half_sel] |= (8*j + 0x3) << 32;   // dword i, byte 3
            result[half_sel] |= (8*j + 0x4) << 24;   // dword i, byte 4
            result[half_sel] |= (8*j + 0x5) << 16;   // dword i, byte 5
            result[half_sel] |= (8*j + 0x6) << 8;    // dword i, byte 6
            result[half_sel] |= (8*j + 0x7) << 0;    // dword i, byte 7
            j++;
         } else {
            result[half_sel] |= (8*index + 0x10) << 56;
            result[half_sel] |= (8*index + 0x11) << 48;
            result[half_sel] |= (8*index + 0x12) << 40;
            result[half_sel] |= (8*index + 0x13) << 32;
            result[half_sel] |= (8*index + 0x14) << 24;
            result[half_sel] |= (8*index + 0x15) << 16;
            result[half_sel] |= (8*index + 0x16) << 8;
            result[half_sel] |= (8*index + 0x17) << 0;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse doubleword elements in a source vector
         specified by the doubleword-element mask in VSR[VRB+32] into the
         leftmost doubleword elements of a result vector.  */
      for( index = 0; index < 2; index++) {
         i = 1 - index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j == 1) {
               result[1] |= (8*index + 0x0) << 56;   // double-word j, byte 0
               result[1] |= (8*index + 0x1) << 48;   // double-word j, byte 1
               result[1] |= (8*index + 0x2) << 40;   // double-word j, byte 2
               result[1] |= (8*index + 0x3) << 32;   // double-word j, byte 3
               result[1] |= (8*index + 0x4) << 24;   // double-word j, byte 4
               result[1] |= (8*index + 0x5) << 16;   // double-word j, byte 5
               result[1] |= (8*index + 0x6) << 8;    // double-word j, byte 6
               result[1] |= (8*index + 0x7) << 0;    // double-word j, byte 7
            } else {
               result[0] |= (8*index + 0x0) << 56;   // double-word j, byte 0
               result[0] |= (8*index + 0x1) << 48;   // double-word j, byte 1
               result[0] |= (8*index + 0x2) << 40;   // double-word j, byte 2
               result[0] |= (8*index + 0x3) << 32;   // double-word j, byte 3
               result[0] |= (8*index + 0x4) << 24;   // double-word j, byte 4
               result[0] |= (8*index + 0x5) << 16;   // double-word j, byte 5
               result[0] |= (8*index + 0x6) << 8;    // double-word j, byte 6
               result[0] |= (8*index + 0x7) << 0;    // double-word j, byte 7
            }
            j++;
         }
      }

   } else if (imm == 0b10) {   //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost doubleword elements of a source vector
         into the doubleword elements of a result vector specified by the
         doubleword-element mask in VSR[VRB+32].  */
      for( index = 0; index < 2; index++) {
         i = index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (8*j+0) << 0;    // double-word i, byte 0
            result[half_sel] |= (8*j+1) << 8;    // double-word i, byte 1
            result[half_sel] |= (8*j+2) << 16;   // double-word i, byte 2
            result[half_sel] |= (8*j+3) << 24;   // double-word i, byte 3
            result[half_sel] |= (8*j+4) << 32;   // double-word i, byte 4
            result[half_sel] |= (8*j+5) << 40;   // double-word i, byte 5
            result[half_sel] |= (8*j+6) << 48;   // double-word i, byte 6
            result[half_sel] |= (8*j+7) << 56;   // double-word i, byte 7
            j++;
         } else {
            result[half_sel] |= (8*index + 0x10) << 0;
            result[half_sel] |= (8*index + 0x11) << 8;
            result[half_sel] |= (8*index + 0x12) << 16;
            result[half_sel] |= (8*index + 0x13) << 24;
            result[half_sel] |= (8*index + 0x14) << 32;
            result[half_sel] |= (8*index + 0x15) << 40;
            result[half_sel] |= (8*index + 0x16) << 48;
            result[half_sel] |= (8*index + 0x17) << 56;
         }
      }

   } else if (imm == 0b11) {   //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse doubleword elements in a source vector
         specified by the doubleword-element mask in VSR[VRB+32] into the
         rightmost doubleword elements of a result vector.  */
      for( index = 0; index < 2; index++) {
         i = index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j == 1) {
               result[0] |= (8*index + 0x0) << 0;    // double-word j, byte 0
               result[0] |= (8*index + 0x1) << 8;    // double-word j, byte 1
               result[0] |= (8*index + 0x2) << 16;   // double-word j, byte 2
               result[0] |= (8*index + 0x3) << 24;   // double-word j, byte 3
               result[0] |= (8*index + 0x4) << 32;   // double-word j, byte 4
               result[0] |= (8*index + 0x5) << 40;   // double-word j, byte 5
               result[0] |= (8*index + 0x6) << 48;   // double-word j, byte 6
               result[0] |= (8*index + 0x7) << 56;   // double-word j, byte 7
            } else {
               result[1] |= (8*index + 0x0) << 0;
               result[1] |= (8*index + 0x1) << 8;
               result[1] |= (8*index + 0x2) << 16;
               result[1] |= (8*index + 0x3) << 24;
               result[1] |= (8*index + 0x4) << 32;
               result[1] |= (8*index + 0x5) << 40;
               result[1] |= (8*index + 0x6) << 48;
               result[1] |= (8*index + 0x7) << 56;
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_dword_mask_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/*------------------------------------------------*/
/*---- VSX Matrix signed integer GER functions ---*/
/*------------------------------------------------*/
static UInt exts4( UInt src )
{
   /* Input is a 4-bit value.  Extend bit 3 to bits [31:4] */
   if (( src >> 3 ) & 0x1)
      return src | 0xFFFFFFF0;   /* sign bit is a 1, extend */
   else
      return src & 0xF;   /* make sure high order bits are zero */
}

static ULong exts8( UInt src )
{
   /* Input is an 8-bit value.  Extend bit 7 to bits [63:8] */
   if (( src >> 7 ) & 0x1)
      return src | 0xFFFFFFFFFFFFFF00ULL;   /* sign bit is a 1, extend */
   else
      return src & 0xFF;   /* make sure high order bits are zero */
}

static ULong extz8( UInt src )
{
   /* Input is an 8-bit value.  Extend src on the left with zeros.  */
   return src & 0xFF;   /* make sure high order bits are zero */
}

static ULong exts16to64( UInt src )
{
   /* Input is a 16-bit value.  Extend bit 15 to bits [63:16] */
   if (( src >> 15 ) & 0x1)
      return ((ULong) src) | 0xFFFFFFFFFFFF0000ULL;   /* sign is 1, extend */
   else
      /* make sure high order bits are zero */
      return ((ULong) src) & 0xFFFFULL;
}

static UInt chop64to32( Long src ) {
   /* Take a 64-bit input, return the lower 32-bits */
   return (UInt)(0xFFFFFFFF & src);
}

static UInt clampS64toS32( Long src ) {
   /* Take a 64-bit signed input, clamp positive values at 2^31-1 and
      negative values at the negated bound.  Return the result in an
      unsigned 32-bit value.  */
   Long max_val = 2147483647;   // 2^31-1

   if ( src > max_val )
      return (UInt)max_val;

   if ( src < -max_val )
      return (UInt)-max_val;

   return (UInt)src;
}
void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, UInt reg,
                      UInt *acc_word)
{
   U128* pU128_dst;

   vassert( acc < 8 );
   vassert( reg < 4 );

   pU128_dst = (U128*) (((UChar*)gst) + offset + acc*4*sizeof(U128)
                        + reg*sizeof(U128));

   /* The U128 type is defined as an array of unsigned integers. */
   (*pU128_dst)[0] = acc_word[0];
   (*pU128_dst)[1] = acc_word[1];
   (*pU128_dst)[2] = acc_word[2];
   (*pU128_dst)[3] = acc_word[3];
}

void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, UInt reg,
                    UInt *acc_word)
{
   U128* pU128_src;

   acc_word[3] = 0xDEAD;
   acc_word[2] = 0xBEEF;
   acc_word[1] = 0xBAD;
   acc_word[0] = 0xBEEF;

   vassert( acc < 8 );
   vassert( reg < 4 );

   pU128_src = (U128*) (((UChar*)gst) + offset + acc*4*sizeof(U128)
                        + reg*sizeof(U128));

   /* The U128 type is defined as an array of unsigned integers. */
   acc_word[0] = (*pU128_src)[0];
   acc_word[1] = (*pU128_src)[1];
   acc_word[2] = (*pU128_src)[2];
   acc_word[3] = (*pU128_src)[3];
}
void vsx_matrix_4bit_ger_dirty_helper ( VexGuestPPC64State* gst,
                                        UInt offset_ACC,
                                        ULong srcA_hi, ULong srcA_lo,
                                        ULong srcB_hi, ULong srcB_lo,
                                        UInt masks_inst )
{
   /* This helper calculates the result for one of the four ACC entries.
      It is called twice, to get the hi and then the low 64-bit of the
      128-bit result.  */
   UInt i, j, mask, sum, inst, acc_entry, prefix_inst;

   UInt srcA_nibbles[4][8];   /* word, nibble */
   UInt srcB_nibbles[4][8];   /* word, nibble */
   UInt acc_word[4];
   UInt prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
   UInt result[4];
   UInt pmsk, xmsk, ymsk;

   mask = 0xF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      pmsk = 0b11111111;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 22) & 0xFF;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address nibbles using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      // input is in double words
      for( j = 0; j < 8; j++) {
         srcA_nibbles[3][j] = (srcA_hi >> (60-4*j)) & mask;   // hi bits [63:32]
         srcA_nibbles[2][j] = (srcA_hi >> (28-4*j)) & mask;   // hi bits [31:0]
         srcA_nibbles[1][j] = (srcA_lo >> (60-4*j)) & mask;   // lo bits [63:32]
         srcA_nibbles[0][j] = (srcA_lo >> (28-4*j)) & mask;   // lo bits [31:0]

         srcB_nibbles[3][j] = (srcB_hi >> (60-4*j)) & mask;
         srcB_nibbles[2][j] = (srcB_hi >> (28-4*j)) & mask;
         srcB_nibbles[1][j] = (srcB_lo >> (60-4*j)) & mask;
         srcB_nibbles[0][j] = (srcB_lo >> (28-4*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 7) & 0x1) == 0)
               prod0 = 0;
            else
               prod0 = exts4( srcA_nibbles[i][0] )
                  * exts4( srcB_nibbles[j][0] );

            if (((pmsk >> 6) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 = exts4( srcA_nibbles[i][1] )
                  * exts4( srcB_nibbles[j][1] );

            if (((pmsk >> 5) & 0x1) == 0)
               prod2 = 0;
            else
               prod2 = exts4( srcA_nibbles[i][2] )
                  * exts4( srcB_nibbles[j][2] );

            if (((pmsk >> 4) & 0x1) == 0)
               prod3 = 0;
            else
               prod3 = exts4( srcA_nibbles[i][3] )
                  * exts4( srcB_nibbles[j][3] );

            if (((pmsk >> 3) & 0x1) == 0)
               prod4 = 0;
            else
               prod4 = exts4( srcA_nibbles[i][4] )
                  * exts4( srcB_nibbles[j][4] );

            if (((pmsk >> 2) & 0x1) == 0)
               prod5 = 0;
            else
               prod5 = exts4( srcA_nibbles[i][5] )
                  * exts4( srcB_nibbles[j][5] );

            if (((pmsk >> 1) & 0x1) == 0)
               prod6 = 0;
            else
               prod6 = exts4( srcA_nibbles[i][6] )
                  * exts4( srcB_nibbles[j][6] );

            if ((pmsk & 0x1) == 0)
               prod7 = 0;
            else
               prod7 = exts4( srcA_nibbles[i][7] )
                  * exts4( srcB_nibbles[j][7] );

            /* sum is UInt so the result is chopped to 32-bits */
            sum = prod0 + prod1 + prod2 + prod3 + prod4
               + prod5 + prod6 + prod7;

            if ( inst == XVI4GER8 )
               result[j] = sum;

            else if ( inst == XVI4GER8PP )
               result[j] = sum + acc_word[j];

         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
                                       UInt offset_ACC,
                                       ULong srcA_hi, ULong srcA_lo,
                                       ULong srcB_hi, ULong srcB_lo,
                                       UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   UInt srcA_bytes[4][4];   /* word, byte */
   UInt srcB_bytes[4][4];   /* word, byte */
   UInt acc_word[4];
   ULong prod0, prod1, prod2, prod3, sum;
   UInt result[4];
   UInt pmsk, xmsk, ymsk;

   mask = 0xFF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      pmsk = 0b1111;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 26) & 0xF;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address bytes using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      for( j = 0; j < 4; j++) {
         srcA_bytes[3][j] = (srcA_hi >> (56-8*j)) & mask;
         srcA_bytes[2][j] = (srcA_hi >> (24-8*j)) & mask;
         srcA_bytes[1][j] = (srcA_lo >> (56-8*j)) & mask;
         srcA_bytes[0][j] = (srcA_lo >> (24-8*j)) & mask;

         srcB_bytes[3][j] = (srcB_hi >> (56-8*j)) & mask;
         srcB_bytes[2][j] = (srcB_hi >> (24-8*j)) & mask;
         srcB_bytes[1][j] = (srcB_lo >> (56-8*j)) & mask;
         srcB_bytes[0][j] = (srcB_lo >> (24-8*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 3) & 0x1) == 0)
               prod0 = 0;
            else
               prod0 =
                  exts8( srcA_bytes[i][0] )
                  * extz8( srcB_bytes[j][0] );

            if (((pmsk >> 2) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 =
                  exts8( srcA_bytes[i][1] )
                  * extz8( srcB_bytes[j][1] );

            if (((pmsk >> 1) & 0x1) == 0)
               prod2 = 0;
            else
               prod2 =
                  exts8( srcA_bytes[i][2] )
                  * extz8( srcB_bytes[j][2] );

            if (((pmsk >> 0) & 0x1) == 0)
               prod3 = 0;
            else
               prod3 =
                  exts8( srcA_bytes[i][3] )
                  * extz8( srcB_bytes[j][3] );

            /* sum is 64-bit; the result is chopped back to 32-bits */
            sum = prod0 + prod1 + prod2 + prod3;

            if ( inst == XVI8GER4 )
               result[j] = chop64to32( sum );

            else if ( inst == XVI8GER4PP )
               result[j] = chop64to32( sum + acc_word[j] );

            else if ( inst == XVI8GER4SPP )
               result[j] = clampS64toS32(sum + acc_word[j]);

            // @todo PJF Coverity complains that if none of the above ifs
            // are true then result gets used uninitialized

         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
                                        UInt offset_ACC,
                                        ULong srcA_hi, ULong srcA_lo,
                                        ULong srcB_hi, ULong srcB_lo,
                                        UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   UInt srcA_word[4][2];   /* word, hword */
   UInt srcB_word[4][2];   /* word, hword */
   UInt acc_word[4];
   ULong prod0, prod1, sum;
   UInt result[4];
   UInt pmsk, xmsk, ymsk;

   mask = 0xFFFF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks for non prefix instructions */
      pmsk = 0b11;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 28) & 0x3;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address half-words using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      for( j = 0; j < 2; j++) {
         srcA_word[3][j] = (srcA_hi >> (48-16*j)) & mask;
         srcA_word[2][j] = (srcA_hi >> (16-16*j)) & mask;
         srcA_word[1][j] = (srcA_lo >> (48-16*j)) & mask;
         srcA_word[0][j] = (srcA_lo >> (16-16*j)) & mask;

         srcB_word[3][j] = (srcB_hi >> (48-16*j)) & mask;
         srcB_word[2][j] = (srcB_hi >> (16-16*j)) & mask;
         srcB_word[1][j] = (srcB_lo >> (48-16*j)) & mask;
         srcB_word[0][j] = (srcB_lo >> (16-16*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 1) & 0x1) == 0)
               prod0 = 0;
            else
               prod0 = exts16to64( srcA_word[i][0] )
                  * exts16to64( srcB_word[j][0] );

            if (((pmsk >> 0) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 = exts16to64( srcA_word[i][1] )
                  * exts16to64( srcB_word[j][1] );

            sum = prod0 + prod1;

            if ( inst == XVI16GER2 )
               result[j] = chop64to32( sum );

            else if ( inst == XVI16GER2S )
               result[j] = clampS64toS32( sum );

            else if ( inst == XVI16GER2PP )
               result[j] = chop64to32( sum + acc_word[j] );

            else if ( inst == XVI16GER2SPP )
               result[j] = clampS64toS32( sum + acc_word[j] );

         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
//matrix 16 float stuff

union convert_t {
   UInt   u32;
   ULong  u64;
   Float  f;
   Double d;
};

static Float reinterpret_int_as_float( UInt input )
{
   /* Reinterpret the bit pattern of an int as a float. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.u32 = input;
   return conv.f;
}

static UInt reinterpret_float_as_int( Float input )
{
   /* Reinterpret the bit pattern of a float as an int. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.f = input;
   return conv.u32;
}

static Double reinterpret_long_as_double( ULong input )
{
   /* Reinterpret the bit pattern of a long as a double. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.u64 = input;
   return conv.d;
}

static ULong reinterpret_double_as_long( Double input )
{
   /* Reinterpret the bit pattern of a double as a long. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.d = input;
   return conv.u64;
}

static Double conv_f16_to_double( ULong input )
{
#  if defined (HAS_XSCVHPDP)
   // This all seems to be very alignment sensitive??
   __attribute__ ((aligned (64))) ULong src;
   __attribute__ ((aligned (64))) Double result;
   src = input;
   __asm__ __volatile__ (".machine push;\n" ".machine power9;\n" \
                         "xscvhpdp %x0,%x1 ;\n .machine pop" \
                         : "=wa" (result) : "wa" (src) );
   return result;
#  else
   return 0.0;
#  endif
}
#define BF16_SIGN_MASK   0x8000
#define BF16_EXP_MASK    0x7F80
#define BF16_FRAC_MASK   0x007F
#define BF16_BIAS        127
#define BF16_MAX_UNBIASED_EXP 127
#define BF16_MIN_UNBIASED_EXP -126
#define FLOAT_SIGN_MASK  0x80000000
#define FLOAT_EXP_MASK   0x7F800000
#define FLOAT_FRAC_MASK  0x007FFFFF
#define FLOAT_FRAC_BIT8  0x00008000
#define FLOAT_BIAS       127
static Float conv_bf16_to_float( UInt input )
{
  /* input is 16-bit bfloat.
     bias +127, exponent 8-bits, fraction 7-bits

     output is 32-bit float.
     bias +127, exponent 8-bits, fraction 23-bits
  */

  UInt input_exp, input_fraction, unbiased_exp;
  UInt output_exp, output_fraction;
  UInt sign;
  union convert_t conv;

  sign = (UInt)(input & BF16_SIGN_MASK);
  input_exp = input & BF16_EXP_MASK;
  unbiased_exp = (input_exp >> 7) - (UInt)BF16_BIAS;
  input_fraction = input & BF16_FRAC_MASK;

  if (((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
      (input_fraction != 0)) {
     /* input is NaN or SNaN, exp all 1's, fraction != 0 */
     output_exp = FLOAT_EXP_MASK;
     output_fraction = input_fraction;

  } else if(((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
            ( input_fraction == 0)) {
     /* input is infinity, exp all 1's, fraction = 0 */
     output_exp = FLOAT_EXP_MASK;
     output_fraction = 0;

  } else if((input_exp == 0) && (input_fraction == 0)) {
     /* input is zero */
     output_exp = 0;
     output_fraction = 0;

  } else if((input_exp == 0) && (input_fraction != 0)) {
     /* input is denormal */
     output_fraction = input_fraction;
     output_exp = (-(Int)BF16_BIAS + (Int)FLOAT_BIAS ) << 23;

  } else {
     /* result is normal */
     output_exp = (unbiased_exp + FLOAT_BIAS) << 23;
     output_fraction = input_fraction;
  }

  conv.u32 = sign << (31 - 15) | output_exp | (output_fraction << (23-7));
  return conv.f;
}
static UInt conv_float_to_bf16( UInt input )
{
   /* input is 32-bit float stored as unsigned 32-bit.
      bias +127, exponent 8-bits, fraction 23-bits

      output is 16-bit bfloat.
      bias +127, exponent 8-bits, fraction 7-bits

      If the unbiased exponent of the input is greater than the max floating
      point unbiased exponent value, the result of the floating point 16-bit
      value is infinity.
   */

   UInt input_exp, input_fraction;
   UInt output_exp, output_fraction;
   UInt sign, result;

   sign = input & FLOAT_SIGN_MASK;
   input_exp = input & FLOAT_EXP_MASK;
   input_fraction = input & FLOAT_FRAC_MASK;

   if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
       (input_fraction != 0)) {
      /* input is NaN or SNaN, exp all 1's, fraction != 0 */
      output_exp = BF16_EXP_MASK;
      output_fraction = (ULong)input_fraction >> (23 - 7);

   } else if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
              ( input_fraction == 0)) {
      /* input is infinity, exp all 1's, fraction = 0 */
      output_exp = BF16_EXP_MASK;
      output_fraction = 0;

   } else if ((input_exp == 0) && (input_fraction == 0)) {
      /* input is zero */
      output_exp = 0;
      output_fraction = 0;

   } else if ((input_exp == 0) && (input_fraction != 0)) {
      /* input is denormal */
      output_exp = 0;
      output_fraction = (ULong)input_fraction >> (23 - 7);

   } else {
      /* result is normal */
      output_exp = (input_exp - BF16_BIAS + FLOAT_BIAS) >> (23 - 7);
      output_fraction = (ULong)input_fraction >> (23 - 7);

      /* Round result.  Look at the 8th bit position of the 32-bit floating
         point fraction.  The F16 fraction is only 7 bits wide so if the 8th
         bit of the F32 is a 1 we need to round up by adding 1 to the output
         fraction.  */
      if ((input_fraction & FLOAT_FRAC_BIT8) == FLOAT_FRAC_BIT8)
         /* Round the F16 fraction up by 1 */
         output_fraction = output_fraction + 1;
   }

   result = sign >> (31 - 15) | output_exp | output_fraction;
   return result;
}
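
/* Illustrative sketch (not part of the original source): 1.0f is
   0x3F800000 as a 32-bit float; its top 16 bits are the bfloat16
   pattern 0x3F80, and converting back is exact.  The example function
   name is hypothetical.

   void example_bf16_round_trip ( void )
   {
      vassert( conv_float_to_bf16( 0x3F800000 ) == 0x3F80 );
      vassert( reinterpret_float_as_int( conv_bf16_to_float( 0x3F80 ) )
               == 0x3F800000 );
   }
*/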
static Float conv_double_to_float( Double src )
{
   return (float) src;
}

static Double negate_double( Double input )
{
   /* Don't negate a NaN value.  A NaN has an exponent
      of all 1's, non zero fraction. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.d = input;

   if ( ( ( conv.u64 & I64_EXP_MASK ) == I64_EXP_MASK )
        && ( ( conv.u64 & I64_FRACTION_MASK ) != 0 ) )
      return input;
   else
      return -input;
}

static Float negate_float( Float input )
{
   /* Don't negate a NaN value.  A NaN has an exponent
      of all 1's, non zero fraction. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.f = input;

   if ( ( ( conv.u32 & I32_EXP_MASK ) == I32_EXP_MASK )
        && ( ( conv.u32 & I32_FRACTION_MASK ) != 0 ) )
      return input;
   else
      return -input;
}
/* This C-helper takes a vector of two 32-bit floating point values
 * and returns a vector containing two 16-bit bfloats.
 *    output: 0x0 hword1 0x0 hword3
 * Called from generated code.
 */
ULong convert_from_floattobf16_helper( ULong src ) {
   ULong resultHi, resultLo;

   resultHi = (ULong)conv_float_to_bf16( (UInt)(src >> 32));
   resultLo = (ULong)conv_float_to_bf16( (UInt)(src & 0xFFFFFFFF));
   return (resultHi << 32) | resultLo;
}

/* This C-helper takes a vector of two 16-bit bfloating point values
 * and returns a vector containing two 32-bit floats.
 *    input: 0x0 hword1 0x0 hword3
 */
ULong convert_from_bf16tofloat_helper( ULong src ) {
   ULong result;
   union convert_t conv;

   conv.f = conv_bf16_to_float( (UInt)(src >> 32) );
   result = (ULong) conv.u32;
   conv.f = conv_bf16_to_float( (UInt)(src & 0xFFFFFFFF));
   result = (result << 32) | (ULong) conv.u32;
   return result;
}
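
/* Illustrative sketch (not part of the original source): two packed
   1.0f words convert to two bfloat16 halfwords, each zero-extended to a
   word.  The example function name is hypothetical.

   void example_convert_from_floattobf16 ( void )
   {
      vassert( convert_from_floattobf16_helper( 0x3F8000003F800000ULL )
               == 0x00003F8000003F80ULL );
   }
*/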
void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
                                              UInt offset_ACC,
                                              ULong srcA_hi, ULong srcA_lo,
                                              ULong srcB_hi, ULong srcB_lo,
                                              UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   UInt srcA_word[4][2];    /* word, hword */
   UInt srcB_word[4][2];    /* word, hword */
   Double src10, src11, src20, src21;
   UInt acc_word_input[4];
   Float acc_word[4];
   UInt result[4];
   UInt pmsk = 0;
   UInt xmsk = 0;
   UInt ymsk = 0;
   Double prod, msum;

   mask = 0xFFFF;

   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      pmsk = 0x3;
      xmsk = 0xF;
      ymsk = 0xF;

   } else {
      /* Use mask supplied with prefix inst */
      pmsk = (masks_inst >> 28) & 0x3;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address half-words using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i,
                     acc_word_input);

      acc_word[3] = reinterpret_int_as_float( acc_word_input[3] );
      acc_word[2] = reinterpret_int_as_float( acc_word_input[2] );
      acc_word[1] = reinterpret_int_as_float( acc_word_input[1] );
      acc_word[0] = reinterpret_int_as_float( acc_word_input[0] );

      for( j = 0; j < 2; j++) {   // input is in double words
         srcA_word[3][j] = (UInt)((srcA_hi >> (48-16*j)) & mask);
         srcA_word[2][j] = (UInt)((srcA_hi >> (16-16*j)) & mask);
         srcA_word[1][j] = (UInt)((srcA_lo >> (48-16*j)) & mask);
         srcA_word[0][j] = (UInt)((srcA_lo >> (16-16*j)) & mask);

         srcB_word[3][j] = (UInt)((srcB_hi >> (48-16*j)) & mask);
         srcB_word[2][j] = (UInt)((srcB_hi >> (16-16*j)) & mask);
         srcB_word[1][j] = (UInt)((srcB_lo >> (48-16*j)) & mask);
         srcB_word[0][j] = (UInt)((srcB_lo >> (16-16*j)) & mask);
      }

      /* Note the isa is not consistent in the src naming.  Will use the
         naming src10, src11, src20, src21 used with xvf16ger2 instructions.
      */
      for( j = 0; j < 4; j++) {
         if (((pmsk >> 1) & 0x1) == 0) {
            src10 = 0;
            src20 = 0;

         } else {
            if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
                || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
                || ( inst == XVF16GER2NN )) {
               src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
               src20 = conv_f16_to_double((ULong)srcB_word[j][0]);

            } else {
               /* Input is in bfloat format, result is stored in the
                  "traditional" 64-bit float format. */
               src10 = (double)conv_bf16_to_float((ULong)srcA_word[i][0]);
               src20 = (double)conv_bf16_to_float((ULong)srcB_word[j][0]);
            }
         }

         if ((pmsk & 0x1) == 0) {
            src11 = 0;
            src21 = 0;

         } else {
            if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
                || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
                || ( inst == XVF16GER2NN )) {
               src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
               src21 = conv_f16_to_double((ULong)srcB_word[j][1]);

            } else {
               /* Input is in bfloat format, result is stored in the
                  "traditional" 64-bit float format. */
               src11 = (double)conv_bf16_to_float((ULong)srcA_word[i][1]);
               src21 = (double)conv_bf16_to_float((ULong)srcB_word[j][1]);
            }
         }

         prod = src10 * src20;
         msum = prod + src11 * src21;

         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            /* Note, we do not track the exception handling bits
               ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */

            if (( inst == XVF16GER2 ) || ( inst == XVBF16GER2 ) )
               result[j] = reinterpret_float_as_int(
                              conv_double_to_float(msum) );

            else if (( inst == XVF16GER2PP ) || (inst == XVBF16GER2PP ))
               result[j] = reinterpret_float_as_int(
                              conv_double_to_float(msum)
                              + acc_word[j] );

            else if (( inst == XVF16GER2PN ) || ( inst == XVBF16GER2PN ))
               result[j] = reinterpret_float_as_int(
                              conv_double_to_float(msum)
                              + negate_float( acc_word[j] ) );

            else if (( inst == XVF16GER2NP ) || ( inst == XVBF16GER2NP ))
               result[j] = reinterpret_float_as_int(
                              conv_double_to_float( negate_double( msum ) )
                              + acc_word[j] );

            else if (( inst == XVF16GER2NN ) || ( inst == XVBF16GER2NN ))
               result[j] = reinterpret_float_as_int(
                              conv_double_to_float( negate_double( msum ) )
                              + negate_float( acc_word[j] ) );
         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i,
                       result);
   }
}
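/* Illustrative decode of the masks_inst argument for the 16-bit GER
   helper, read off from the extractions above (the instruction enum
   values themselves are defined elsewhere):

      bits  0..3   acc_entry (target ACC number)
      bits  5..12  inst      (XVF16GER2, XVBF16GER2PP, ...)
      bit   13     prefix_inst
      bits 14..17  ymsk
      bits 18..21  xmsk
      bits 28..29  pmsk

   For example, a prefixed instruction with pmsk = 0x3, xmsk = 0xF,
   ymsk = 0x5 targeting ACC 2 would be packed as

      masks_inst = (0x3 << 28) | (0xF << 18) | (0x5 << 14)
                   | (0x1 << 13) | (inst << 5) | 0x2;

   and the shifts/masks above recover exactly those fields. */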
void vsx_matrix_32bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
                                              UInt offset_ACC,
                                              ULong srcA_hi, ULong srcA_lo,
                                              ULong srcB_hi, ULong srcB_lo,
                                              UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   Float srcA_word[4];
   Float srcB_word[4];
   UInt acc_word_input[4];
   Float acc_word[4];
   UInt result[4];
   UInt xmsk = 0;
   UInt ymsk = 0;
   Float src1, src2, acc;

   mask = 0xFFFFFFFF;

   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      xmsk = 0xF;
      ymsk = 0xF;

   } else {
      /* Use mask supplied with prefix inst */
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   srcA_word[3] = reinterpret_int_as_float( (srcA_hi >> 32) & mask );
   srcA_word[2] = reinterpret_int_as_float( srcA_hi & mask );
   srcA_word[1] = reinterpret_int_as_float( (srcA_lo >> 32) & mask );
   srcA_word[0] = reinterpret_int_as_float( srcA_lo & mask );

   srcB_word[3] = reinterpret_int_as_float( (srcB_hi >> 32) & mask );
   srcB_word[2] = reinterpret_int_as_float( srcB_hi & mask );
   srcB_word[1] = reinterpret_int_as_float( (srcB_lo >> 32) & mask );
   srcB_word[0] = reinterpret_int_as_float( srcB_lo & mask );

   /* Address bytes using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i,
                     acc_word_input);

      acc_word[3] = reinterpret_int_as_float( acc_word_input[3] );
      acc_word[2] = reinterpret_int_as_float( acc_word_input[2] );
      acc_word[1] = reinterpret_int_as_float( acc_word_input[1] );
      acc_word[0] = reinterpret_int_as_float( acc_word_input[0] );

      for( j = 0; j < 4; j++) {

         if ((((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) == 0x1) {
            /* Note, we do not track the exception handling bits
               ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */

            src1 = srcA_word[i];
            src2 = srcB_word[j];
            acc  = acc_word[j];

            if ( inst == XVF32GER )
               result[j] = reinterpret_float_as_int( src1 * src2 );

            else if ( inst == XVF32GERPP )
               result[j] = reinterpret_float_as_int( ( src1 * src2 ) + acc );

            else if ( inst == XVF32GERPN )
               result[j] = reinterpret_float_as_int( ( src1 * src2 )
                                                     + negate_float( acc ) );

            else if ( inst == XVF32GERNP )
               result[j] = reinterpret_float_as_int(
                              negate_float( src1 * src2 ) + acc );

            else if ( inst == XVF32GERNN )
               result[j] = reinterpret_float_as_int(
                              negate_float( src1 * src2 ) + negate_float( acc ) );
         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i,
                       result);
   }
}
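/* Illustrative masking example: with xmsk = 0x3 and ymsk = 0x1 only
   rows i = 0,1 and column j = 0 satisfy the gating test above, so
   only ACC words (0,0) and (1,0) receive computed values; every
   other result word is written back as zero. */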
void vsx_matrix_64bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
                                              UInt offset_ACC,
                                              ULong srcX_hi, ULong srcX_lo,
                                              ULong srcY_hi, ULong srcY_lo,
                                              UInt masks_inst )
{
   /* This function just computes the result for one entry in the ACC. */
   UInt i, j, inst, acc_entry, prefix_inst;

   Double srcX_dword[4];
   Double srcY_dword[2];
   Double result[2];
   UInt result_uint[4];
   ULong result_ulong[2];
   Double acc_dword[4];
   ULong acc_word_ulong[2];
   UInt acc_word_input[4];
   UInt xmsk = 0;
   UInt ymsk = 0;
   UInt start_i;
   Double src1, src2, acc;

   inst = (masks_inst >> 8) & 0xFF;
   prefix_inst = (masks_inst >> 16) & 0x1;
   start_i = (masks_inst >> 4) & 0xF;
   acc_entry = masks_inst & 0xF;

   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      xmsk = 0xF;
      ymsk = 0x3;

   } else {
      /* Use mask supplied with prefix inst */
      xmsk = (masks_inst >> 21) & 0xF;
      ymsk = (masks_inst >> 19) & 0x3;
   }

   /* Need to store the srcX_dword in the correct index for the following
      code to work. */
   srcX_dword[1+start_i] = reinterpret_long_as_double( srcX_lo );
   srcX_dword[0+start_i] = reinterpret_long_as_double( srcX_hi );
   srcY_dword[1] = reinterpret_long_as_double( srcY_lo );
   srcY_dword[0] = reinterpret_long_as_double( srcY_hi );

   for( i = start_i; i < start_i+2; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3 - i,
                     acc_word_input);

      acc_word_ulong[1] = acc_word_input[3];
      acc_word_ulong[1] = (acc_word_ulong[1] << 32) | acc_word_input[2];
      acc_word_ulong[0] = acc_word_input[1];
      acc_word_ulong[0] = (acc_word_ulong[0] << 32) | acc_word_input[0];
      acc_dword[0] = reinterpret_long_as_double( acc_word_ulong[0] );
      acc_dword[1] = reinterpret_long_as_double( acc_word_ulong[1] );

      for( j = 0; j < 2; j++) {

         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            /* Note, we do not track the exception handling bits
               ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */

            src1 = srcX_dword[i];
            src2 = srcY_dword[j];
            acc  = acc_dword[j];

            if ( inst == XVF64GER )
               result[j] = src1 * src2;

            else if ( inst == XVF64GERPP )
               result[j] = ( src1 * src2 ) + acc;

            else if ( inst == XVF64GERPN )
               result[j] = ( src1 * src2 ) + negate_double( acc );

            else if ( inst == XVF64GERNP )
               result[j] = negate_double( src1 * src2 ) + acc;

            else if ( inst == XVF64GERNN )
               result[j] = negate_double( src1 * src2 ) + negate_double( acc );

         } else {
            result[j] = 0;
         }
      }

      /* Need to store the two double float values as two unsigned ints in
         order to store them to the ACC. */
      result_ulong[0] = reinterpret_double_as_long ( result[0] );
      result_ulong[1] = reinterpret_double_as_long ( result[1] );

      result_uint[0] = result_ulong[0] & 0xFFFFFFFF;
      result_uint[1] = (result_ulong[0] >> 32) & 0xFFFFFFFF;
      result_uint[2] = result_ulong[1] & 0xFFFFFFFF;
      result_uint[3] = (result_ulong[1] >> 32) & 0xFFFFFFFF;

      write_ACC_entry (gst, offset_ACC, acc_entry, 3 - i,
                       result_uint);
   }
}
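/* Worked example of the unpacking above: if result[0] is 2.0
   (0x4000000000000000), then result_uint[0] = 0x00000000 and
   result_uint[1] = 0x40000000, i.e. each 64-bit double is handed to
   write_ACC_entry() as a low word followed by a high word. */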
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER uses inline assembly to call random number instruction on
   the host machine.  Note, the dirty helper takes the value returned from
   the host and returns it.  The helper does not change the guest state
   or guest memory. */
ULong darn_dirty_helper ( UInt L )
{
   ULong val = 0xFFFFFFFFFFFFFFFFULL;    /* error */

#  if defined (HAS_DARN)
   if ( L == 0 )
      __asm__ __volatile__(".machine push; .machine power9;" \
                           "darn %0,0; .machine pop;" : "=r"(val));
   else if ( L == 1 )
      __asm__ __volatile__(".machine push; .machine power9;" \
                           "darn %0,1; .machine pop;" : "=r"(val));
   else if ( L == 2 )
      __asm__ __volatile__(".machine push; .machine power9;"
                           "darn %0,2; .machine pop;" : "=r"(val));
#  endif

   return val;
}
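/* Note on the L argument (per ISA 3.0): L = 0 requests a conditioned
   32-bit random value, L = 1 a conditioned 64-bit value, and L = 2 a
   raw 64-bit value.  The all-ones pattern the helper starts with is
   also what the hardware returns when no random number is available,
   so callers see the architected error value either way. */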
/*----------------------------------------------*/
/*--- The exported fns ..                    ---*/
/*----------------------------------------------*/
/* VISIBLE TO LIBVEX CLIENT */
UInt LibVEX_GuestPPC32_get_CR ( /*IN*/const VexGuestPPC32State* vex_state )
{
#  define FIELD(_n)                                    \
      ( ( (UInt)                                       \
           ( (vex_state->guest_CR##_n##_321 & (7<<1))  \
             | (vex_state->guest_CR##_n##_0 & 1)       \
           )                                           \
        )                                              \
        << (4 * (7-(_n)))                              \
      )

   return
      FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
      | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);

#  undef FIELD
}
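/* Worked example: CR field 0 occupies bits 31..28 of the native CR.
   With guest_CR0_321 = 0x4 (gt set) and guest_CR0_0 = 1 (so set),
   FIELD(0) = (0x4 | 0x1) << 28 = 0x50000000, and the remaining
   fields OR in at successively lower 4-bit positions. */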
/* VISIBLE TO LIBVEX CLIENT */
/* Note: %CR is 32 bits even for ppc64 */
UInt LibVEX_GuestPPC64_get_CR ( /*IN*/const VexGuestPPC64State* vex_state )
{
#  define FIELD(_n)                                    \
      ( ( (UInt)                                       \
           ( (vex_state->guest_CR##_n##_321 & (7<<1))  \
             | (vex_state->guest_CR##_n##_0 & 1)       \
           )                                           \
        )                                              \
        << (4 * (7-(_n)))                              \
      )

   return
      FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
      | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);

#  undef FIELD
}
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestPPC32_put_CR ( UInt cr_native,
                                /*OUT*/VexGuestPPC32State* vex_state )
{
   UInt t;

#  define FIELD(_n)                                           \
      do {                                                    \
         t = cr_native >> (4*(7-(_n)));                       \
         vex_state->guest_CR##_n##_0   = toUChar(t & 1);      \
         vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
      } while (0)

   FIELD(0);
   FIELD(1);
   FIELD(2);
   FIELD(3);
   FIELD(4);
   FIELD(5);
   FIELD(6);
   FIELD(7);

#  undef FIELD
}
/* VISIBLE TO LIBVEX CLIENT */
/* Note: %CR is 32 bits even for ppc64 */
void LibVEX_GuestPPC64_put_CR ( UInt cr_native,
                                /*OUT*/VexGuestPPC64State* vex_state )
{
   UInt t;

#  define FIELD(_n)                                           \
      do {                                                    \
         t = cr_native >> (4*(7-(_n)));                       \
         vex_state->guest_CR##_n##_0   = toUChar(t & 1);      \
         vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
      } while (0)

   FIELD(0);
   FIELD(1);
   FIELD(2);
   FIELD(3);
   FIELD(4);
   FIELD(5);
   FIELD(6);
   FIELD(7);

#  undef FIELD
}
/* VISIBLE TO LIBVEX CLIENT */
UInt LibVEX_GuestPPC32_get_XER ( /*IN*/const VexGuestPPC32State* vex_state )
{
   UInt w = 0;
   w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
   w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
   w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
   w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
   w |= ( (((UInt)vex_state->guest_XER_OV32) & 0x1) << 19 );
   w |= ( (((UInt)vex_state->guest_XER_CA32) & 0x1) << 18 );
   return w;
}
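/* Worked example: the modeled XER layout is SO = bit 31, OV = bit 30,
   CA = bit 29, OV32 = bit 19, CA32 = bit 18 and the byte count in
   bits 7..0.  So SO = 1, CA = 1 and a byte count of 8 assemble to
   0x80000000 | 0x20000000 | 0x8 = 0xA0000008. */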
/* VISIBLE TO LIBVEX CLIENT */
/* Note: %XER is 32 bits even for ppc64 */
UInt LibVEX_GuestPPC64_get_XER ( /*IN*/const VexGuestPPC64State* vex_state )
{
   UInt w = 0;
   w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
   w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
   w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
   w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
   w |= ( (((UInt)vex_state->guest_XER_OV32) & 0x1) << 19 );
   w |= ( (((UInt)vex_state->guest_XER_CA32) & 0x1) << 18 );
   return w;
}
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestPPC32_put_XER ( UInt xer_native,
                                 /*OUT*/VexGuestPPC32State* vex_state )
{
   vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
   vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
   vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
   vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
   vex_state->guest_XER_OV32 = toUChar((xer_native >> 19) & 0x1);
   vex_state->guest_XER_CA32 = toUChar((xer_native >> 18) & 0x1);
}
/* VISIBLE TO LIBVEX CLIENT */
/* Note: %XER is 32 bits even for ppc64 */
void LibVEX_GuestPPC64_put_XER ( UInt xer_native,
                                 /*OUT*/VexGuestPPC64State* vex_state )
{
   vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
   vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
   vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
   vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
   vex_state->guest_XER_OV32 = toUChar((xer_native >> 19) & 0x1);
   vex_state->guest_XER_CA32 = toUChar((xer_native >> 18) & 0x1);
}
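/* Round-trip note: put_XER only stores the six modeled fields and
   get_XER only reassembles them, so any other bits written through
   put_XER (e.g. the reserved bits 28..20) read back as zero. */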
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestPPC32_initialise ( /*OUT*/VexGuestPPC32State* vex_state )
{
   Int i;

   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;
   vex_state->pad3 = 0;
   vex_state->pad4 = 0;

   vex_state->guest_GPR0  = 0;
   vex_state->guest_GPR1  = 0;
   vex_state->guest_GPR2  = 0;
   vex_state->guest_GPR3  = 0;
   vex_state->guest_GPR4  = 0;
   vex_state->guest_GPR5  = 0;
   vex_state->guest_GPR6  = 0;
   vex_state->guest_GPR7  = 0;
   vex_state->guest_GPR8  = 0;
   vex_state->guest_GPR9  = 0;
   vex_state->guest_GPR10 = 0;
   vex_state->guest_GPR11 = 0;
   vex_state->guest_GPR12 = 0;
   vex_state->guest_GPR13 = 0;
   vex_state->guest_GPR14 = 0;
   vex_state->guest_GPR15 = 0;
   vex_state->guest_GPR16 = 0;
   vex_state->guest_GPR17 = 0;
   vex_state->guest_GPR18 = 0;
   vex_state->guest_GPR19 = 0;
   vex_state->guest_GPR20 = 0;
   vex_state->guest_GPR21 = 0;
   vex_state->guest_GPR22 = 0;
   vex_state->guest_GPR23 = 0;
   vex_state->guest_GPR24 = 0;
   vex_state->guest_GPR25 = 0;
   vex_state->guest_GPR26 = 0;
   vex_state->guest_GPR27 = 0;
   vex_state->guest_GPR28 = 0;
   vex_state->guest_GPR29 = 0;
   vex_state->guest_GPR30 = 0;
   vex_state->guest_GPR31 = 0;

   /* Initialise the vector state. */
#  define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;

   VECZERO(vex_state->guest_VSR0);
   VECZERO(vex_state->guest_VSR1);
   VECZERO(vex_state->guest_VSR2);
   VECZERO(vex_state->guest_VSR3);
   VECZERO(vex_state->guest_VSR4);
   VECZERO(vex_state->guest_VSR5);
   VECZERO(vex_state->guest_VSR6);
   VECZERO(vex_state->guest_VSR7);
   VECZERO(vex_state->guest_VSR8);
   VECZERO(vex_state->guest_VSR9);
   VECZERO(vex_state->guest_VSR10);
   VECZERO(vex_state->guest_VSR11);
   VECZERO(vex_state->guest_VSR12);
   VECZERO(vex_state->guest_VSR13);
   VECZERO(vex_state->guest_VSR14);
   VECZERO(vex_state->guest_VSR15);
   VECZERO(vex_state->guest_VSR16);
   VECZERO(vex_state->guest_VSR17);
   VECZERO(vex_state->guest_VSR18);
   VECZERO(vex_state->guest_VSR19);
   VECZERO(vex_state->guest_VSR20);
   VECZERO(vex_state->guest_VSR21);
   VECZERO(vex_state->guest_VSR22);
   VECZERO(vex_state->guest_VSR23);
   VECZERO(vex_state->guest_VSR24);
   VECZERO(vex_state->guest_VSR25);
   VECZERO(vex_state->guest_VSR26);
   VECZERO(vex_state->guest_VSR27);
   VECZERO(vex_state->guest_VSR28);
   VECZERO(vex_state->guest_VSR29);
   VECZERO(vex_state->guest_VSR30);
   VECZERO(vex_state->guest_VSR31);
   VECZERO(vex_state->guest_VSR32);
   VECZERO(vex_state->guest_VSR33);
   VECZERO(vex_state->guest_VSR34);
   VECZERO(vex_state->guest_VSR35);
   VECZERO(vex_state->guest_VSR36);
   VECZERO(vex_state->guest_VSR37);
   VECZERO(vex_state->guest_VSR38);
   VECZERO(vex_state->guest_VSR39);
   VECZERO(vex_state->guest_VSR40);
   VECZERO(vex_state->guest_VSR41);
   VECZERO(vex_state->guest_VSR42);
   VECZERO(vex_state->guest_VSR43);
   VECZERO(vex_state->guest_VSR44);
   VECZERO(vex_state->guest_VSR45);
   VECZERO(vex_state->guest_VSR46);
   VECZERO(vex_state->guest_VSR47);
   VECZERO(vex_state->guest_VSR48);
   VECZERO(vex_state->guest_VSR49);
   VECZERO(vex_state->guest_VSR50);
   VECZERO(vex_state->guest_VSR51);
   VECZERO(vex_state->guest_VSR52);
   VECZERO(vex_state->guest_VSR53);
   VECZERO(vex_state->guest_VSR54);
   VECZERO(vex_state->guest_VSR55);
   VECZERO(vex_state->guest_VSR56);
   VECZERO(vex_state->guest_VSR57);
   VECZERO(vex_state->guest_VSR58);
   VECZERO(vex_state->guest_VSR59);
   VECZERO(vex_state->guest_VSR60);
   VECZERO(vex_state->guest_VSR61);
   VECZERO(vex_state->guest_VSR62);
   VECZERO(vex_state->guest_VSR63);

   VECZERO( vex_state->guest_ACC_0_r0 );
   VECZERO( vex_state->guest_ACC_0_r1 );
   VECZERO( vex_state->guest_ACC_0_r2 );
   VECZERO( vex_state->guest_ACC_0_r3 );
   VECZERO( vex_state->guest_ACC_1_r0 );
   VECZERO( vex_state->guest_ACC_1_r1 );
   VECZERO( vex_state->guest_ACC_1_r2 );
   VECZERO( vex_state->guest_ACC_1_r3 );
   VECZERO( vex_state->guest_ACC_2_r0 );
   VECZERO( vex_state->guest_ACC_2_r1 );
   VECZERO( vex_state->guest_ACC_2_r2 );
   VECZERO( vex_state->guest_ACC_2_r3 );
   VECZERO( vex_state->guest_ACC_3_r0 );
   VECZERO( vex_state->guest_ACC_3_r1 );
   VECZERO( vex_state->guest_ACC_3_r2 );
   VECZERO( vex_state->guest_ACC_3_r3 );
   VECZERO( vex_state->guest_ACC_4_r0 );
   VECZERO( vex_state->guest_ACC_4_r1 );
   VECZERO( vex_state->guest_ACC_4_r2 );
   VECZERO( vex_state->guest_ACC_4_r3 );
   VECZERO( vex_state->guest_ACC_5_r0 );
   VECZERO( vex_state->guest_ACC_5_r1 );
   VECZERO( vex_state->guest_ACC_5_r2 );
   VECZERO( vex_state->guest_ACC_5_r3 );
   VECZERO( vex_state->guest_ACC_6_r0 );
   VECZERO( vex_state->guest_ACC_6_r1 );
   VECZERO( vex_state->guest_ACC_6_r2 );
   VECZERO( vex_state->guest_ACC_6_r3 );
   VECZERO( vex_state->guest_ACC_7_r0 );
   VECZERO( vex_state->guest_ACC_7_r1 );
   VECZERO( vex_state->guest_ACC_7_r2 );
   VECZERO( vex_state->guest_ACC_7_r3 );

#  undef VECZERO

   vex_state->guest_CIA = 0;
   vex_state->guest_LR  = 0;
   vex_state->guest_CTR = 0;

   vex_state->guest_XER_SO = 0;
   vex_state->guest_XER_OV = 0;
   vex_state->guest_XER_CA = 0;
   vex_state->guest_XER_BC = 0;

   vex_state->guest_XER_OV32 = 0;
   vex_state->guest_XER_CA32 = 0;

   vex_state->guest_CR0_321 = 0;
   vex_state->guest_CR0_0   = 0;
   vex_state->guest_CR1_321 = 0;
   vex_state->guest_CR1_0   = 0;
   vex_state->guest_CR2_321 = 0;
   vex_state->guest_CR2_0   = 0;
   vex_state->guest_CR3_321 = 0;
   vex_state->guest_CR3_0   = 0;
   vex_state->guest_CR4_321 = 0;
   vex_state->guest_CR4_0   = 0;
   vex_state->guest_CR5_321 = 0;
   vex_state->guest_CR5_0   = 0;
   vex_state->guest_CR6_321 = 0;
   vex_state->guest_CR6_0   = 0;
   vex_state->guest_CR7_321 = 0;
   vex_state->guest_CR7_0   = 0;

   vex_state->guest_FPROUND  = PPCrm_NEAREST;
   vex_state->guest_DFPROUND = PPCrm_NEAREST;
   vex_state->guest_C_FPCC   = 0;
   vex_state->pad2 = 0;

   vex_state->guest_VRSAVE = 0;

#  if defined(VGP_ppc64be_linux)
   /* By default, the HW for BE sets the VSCR[NJ] bit to 1.
      VSR is a 128-bit register, NJ bit is bit 111 (IBM numbering).
      However, VSCR is modeled as a 64-bit register. */
   vex_state->guest_VSCR = 0x1 << (127 - 111);
#  else
   /* LE API requires NJ be set to 0. */
   vex_state->guest_VSCR = 0x0;
#  endif
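   /* Arithmetic sanity check: IBM bit 111 of the 128-bit VSCR is
      little-endian bit 127 - 111 = 16 of the modeled value, so the
      BE initial value above is 0x10000. */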
   vex_state->guest_EMNOTE = EmNote_NONE;

   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   vex_state->guest_NRADDR = 0;
   vex_state->guest_NRADDR_GPR2 = 0;

   vex_state->guest_REDIR_SP = -1;
   for (i = 0; i < VEX_GUEST_PPC32_REDIR_STACK_SIZE; i++)
      vex_state->guest_REDIR_STACK[i] = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->guest_SPRG3_RO = 0;
   vex_state->guest_PPR = 0x4ULL << 50;  // medium priority
   vex_state->guest_PSPB = 0x100;  // an arbitrary non-zero value to start with

   vex_state->padding1 = 0;
   /* vex_state->padding2 = 0;  currently not used */
}
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestPPC64_initialise ( /*OUT*/VexGuestPPC64State* vex_state )
{
   Int i;

   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;
   vex_state->pad0 = 0;
   vex_state->guest_GPR0  = 0;
   vex_state->guest_GPR1  = 0;
   vex_state->guest_GPR2  = 0;
   vex_state->guest_GPR3  = 0;
   vex_state->guest_GPR4  = 0;
   vex_state->guest_GPR5  = 0;
   vex_state->guest_GPR6  = 0;
   vex_state->guest_GPR7  = 0;
   vex_state->guest_GPR8  = 0;
   vex_state->guest_GPR9  = 0;
   vex_state->guest_GPR10 = 0;
   vex_state->guest_GPR11 = 0;
   vex_state->guest_GPR12 = 0;
   vex_state->guest_GPR13 = 0;
   vex_state->guest_GPR14 = 0;
   vex_state->guest_GPR15 = 0;
   vex_state->guest_GPR16 = 0;
   vex_state->guest_GPR17 = 0;
   vex_state->guest_GPR18 = 0;
   vex_state->guest_GPR19 = 0;
   vex_state->guest_GPR20 = 0;
   vex_state->guest_GPR21 = 0;
   vex_state->guest_GPR22 = 0;
   vex_state->guest_GPR23 = 0;
   vex_state->guest_GPR24 = 0;
   vex_state->guest_GPR25 = 0;
   vex_state->guest_GPR26 = 0;
   vex_state->guest_GPR27 = 0;
   vex_state->guest_GPR28 = 0;
   vex_state->guest_GPR29 = 0;
   vex_state->guest_GPR30 = 0;
   vex_state->guest_GPR31 = 0;

   /* Initialise the vector state. */
#  define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;

   VECZERO(vex_state->guest_VSR0);
   VECZERO(vex_state->guest_VSR1);
   VECZERO(vex_state->guest_VSR2);
   VECZERO(vex_state->guest_VSR3);
   VECZERO(vex_state->guest_VSR4);
   VECZERO(vex_state->guest_VSR5);
   VECZERO(vex_state->guest_VSR6);
   VECZERO(vex_state->guest_VSR7);
   VECZERO(vex_state->guest_VSR8);
   VECZERO(vex_state->guest_VSR9);
   VECZERO(vex_state->guest_VSR10);
   VECZERO(vex_state->guest_VSR11);
   VECZERO(vex_state->guest_VSR12);
   VECZERO(vex_state->guest_VSR13);
   VECZERO(vex_state->guest_VSR14);
   VECZERO(vex_state->guest_VSR15);
   VECZERO(vex_state->guest_VSR16);
   VECZERO(vex_state->guest_VSR17);
   VECZERO(vex_state->guest_VSR18);
   VECZERO(vex_state->guest_VSR19);
   VECZERO(vex_state->guest_VSR20);
   VECZERO(vex_state->guest_VSR21);
   VECZERO(vex_state->guest_VSR22);
   VECZERO(vex_state->guest_VSR23);
   VECZERO(vex_state->guest_VSR24);
   VECZERO(vex_state->guest_VSR25);
   VECZERO(vex_state->guest_VSR26);
   VECZERO(vex_state->guest_VSR27);
   VECZERO(vex_state->guest_VSR28);
   VECZERO(vex_state->guest_VSR29);
   VECZERO(vex_state->guest_VSR30);
   VECZERO(vex_state->guest_VSR31);
   VECZERO(vex_state->guest_VSR32);
   VECZERO(vex_state->guest_VSR33);
   VECZERO(vex_state->guest_VSR34);
   VECZERO(vex_state->guest_VSR35);
   VECZERO(vex_state->guest_VSR36);
   VECZERO(vex_state->guest_VSR37);
   VECZERO(vex_state->guest_VSR38);
   VECZERO(vex_state->guest_VSR39);
   VECZERO(vex_state->guest_VSR40);
   VECZERO(vex_state->guest_VSR41);
   VECZERO(vex_state->guest_VSR42);
   VECZERO(vex_state->guest_VSR43);
   VECZERO(vex_state->guest_VSR44);
   VECZERO(vex_state->guest_VSR45);
   VECZERO(vex_state->guest_VSR46);
   VECZERO(vex_state->guest_VSR47);
   VECZERO(vex_state->guest_VSR48);
   VECZERO(vex_state->guest_VSR49);
   VECZERO(vex_state->guest_VSR50);
   VECZERO(vex_state->guest_VSR51);
   VECZERO(vex_state->guest_VSR52);
   VECZERO(vex_state->guest_VSR53);
   VECZERO(vex_state->guest_VSR54);
   VECZERO(vex_state->guest_VSR55);
   VECZERO(vex_state->guest_VSR56);
   VECZERO(vex_state->guest_VSR57);
   VECZERO(vex_state->guest_VSR58);
   VECZERO(vex_state->guest_VSR59);
   VECZERO(vex_state->guest_VSR60);
   VECZERO(vex_state->guest_VSR61);
   VECZERO(vex_state->guest_VSR62);
   VECZERO(vex_state->guest_VSR63);

#  undef VECZERO

   vex_state->guest_CIA = 0;
   vex_state->guest_LR  = 0;
   vex_state->guest_CTR = 0;

   vex_state->guest_XER_SO = 0;
   vex_state->guest_XER_OV = 0;
   vex_state->guest_XER_CA = 0;
   vex_state->guest_XER_BC = 0;

   vex_state->guest_CR0_321 = 0;
   vex_state->guest_CR0_0   = 0;
   vex_state->guest_CR1_321 = 0;
   vex_state->guest_CR1_0   = 0;
   vex_state->guest_CR2_321 = 0;
   vex_state->guest_CR2_0   = 0;
   vex_state->guest_CR3_321 = 0;
   vex_state->guest_CR3_0   = 0;
   vex_state->guest_CR4_321 = 0;
   vex_state->guest_CR4_0   = 0;
   vex_state->guest_CR5_321 = 0;
   vex_state->guest_CR5_0   = 0;
   vex_state->guest_CR6_321 = 0;
   vex_state->guest_CR6_0   = 0;
   vex_state->guest_CR7_321 = 0;
   vex_state->guest_CR7_0   = 0;

   vex_state->guest_FPROUND  = PPCrm_NEAREST;
   vex_state->guest_DFPROUND = PPCrm_NEAREST;
   vex_state->guest_C_FPCC   = 0;
   vex_state->pad2 = 0;

   vex_state->guest_VRSAVE = 0;

#  if defined(VGP_ppc64be_linux)
   /* By default, the HW for BE sets the VSCR[NJ] bit to 1.
      VSR is a 128-bit register, NJ bit is bit 111 (IBM numbering).
      However, VSCR is modeled as a 64-bit register. */
   vex_state->guest_VSCR = 0x1 << (127 - 111);
#  else
   /* LE API requires NJ be set to 0. */
   vex_state->guest_VSCR = 0x0;
#  endif

   vex_state->guest_EMNOTE = EmNote_NONE;

   vex_state->padding = 0;

   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   vex_state->guest_NRADDR = 0;
   vex_state->guest_NRADDR_GPR2 = 0;

   vex_state->guest_REDIR_SP = -1;
   for (i = 0; i < VEX_GUEST_PPC64_REDIR_STACK_SIZE; i++)
      vex_state->guest_REDIR_STACK[i] = 0;

   vex_state->guest_IP_AT_SYSCALL = 0;
   vex_state->guest_SPRG3_RO = 0;
   vex_state->guest_TFHAR  = 0;
   vex_state->guest_TFIAR  = 0;
   vex_state->guest_TEXASR = 0;
   vex_state->guest_PPR = 0x4ULL << 50;  // medium priority
   vex_state->guest_PSPB = 0x100;  // an arbitrary non-zero value to start with
   vex_state->guest_DSCR = 0;
}
/*-----------------------------------------------------------*/
/*--- Describing the ppc guest state, for the benefit     ---*/
/*--- of iropt and instrumenters.                         ---*/
/*-----------------------------------------------------------*/

/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   By default we enforce precise exns for guest R1 (stack pointer),
   CIA (current insn address) and LR (link register).  These are the
   minimum needed to extract correct stack backtraces from ppc
   code. [[NB: not sure if keeping LR up to date is actually
   necessary.]]

   Only R1 is needed in mode VexRegUpdSpAtMemAccess.
*/
Bool guest_ppc32_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   Int lr_min  = offsetof(VexGuestPPC32State, guest_LR);
   Int lr_max  = lr_min + 4 - 1;
   Int r1_min  = offsetof(VexGuestPPC32State, guest_GPR1);
   Int r1_max  = r1_min + 4 - 1;
   Int cia_min = offsetof(VexGuestPPC32State, guest_CIA);
   Int cia_max = cia_min + 4 - 1;

   if (maxoff < r1_min || minoff > r1_max) {
      /* no overlap with R1 */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False;  // We only need to check stack pointer.
   } else {
      return True;
   }

   if (maxoff < lr_min || minoff > lr_max) {
      /* no overlap with LR */
   } else {
      return True;
   }

   if (maxoff < cia_min || minoff > cia_max) {
      /* no overlap with CIA */
   } else {
      return True;
   }

   return False;
}
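/* Worked example of the overlap tests above: a 4-byte write covering
   exactly guest_GPR1 has minoff == r1_min and maxoff == r1_max, so
   the first test's condition is false and the function returns True;
   a write that touches none of R1, LR or CIA falls through all three
   tests and returns False. */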
Bool guest_ppc64_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   /* Given that R2 is a Big Deal in the ELF ppc64 ABI, it seems
      prudent to be conservative with it, even though thus far there
      is no evidence to suggest that it actually needs to be kept up
      to date wrt possible exceptions. */
   Int lr_min  = offsetof(VexGuestPPC64State, guest_LR);
   Int lr_max  = lr_min + 8 - 1;
   Int r1_min  = offsetof(VexGuestPPC64State, guest_GPR1);
   Int r1_max  = r1_min + 8 - 1;
   Int r2_min  = offsetof(VexGuestPPC64State, guest_GPR2);
   Int r2_max  = r2_min + 8 - 1;
   Int cia_min = offsetof(VexGuestPPC64State, guest_CIA);
   Int cia_max = cia_min + 8 - 1;

   if (maxoff < r1_min || minoff > r1_max) {
      /* no overlap with R1 */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False;  // We only need to check stack pointer.
   } else {
      return True;
   }

   if (maxoff < lr_min || minoff > lr_max) {
      /* no overlap with LR */
   } else {
      return True;
   }

   if (maxoff < r2_min || minoff > r2_max) {
      /* no overlap with R2 */
   } else {
      return True;
   }

   if (maxoff < cia_min || minoff > cia_max) {
      /* no overlap with CIA */
   } else {
      return True;
   }

   return False;
}
#define ALWAYSDEFD32(field)                            \
    { offsetof(VexGuestPPC32State, field),             \
      (sizeof ((VexGuestPPC32State*)0)->field) }

VexGuestLayout
   ppc32Guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestPPC32State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestPPC32State,guest_GPR1),
          .sizeof_SP = 4,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestPPC32State,guest_GPR1),
          .sizeof_FP = 4,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestPPC32State,guest_CIA),
          .sizeof_IP = 4,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 12,

          .alwaysDefd
          = { /*  0 */ ALWAYSDEFD32(guest_CIA),
              /*  1 */ ALWAYSDEFD32(guest_EMNOTE),
              /*  2 */ ALWAYSDEFD32(guest_CMSTART),
              /*  3 */ ALWAYSDEFD32(guest_CMLEN),
              /*  4 */ ALWAYSDEFD32(guest_VSCR),
              /*  5 */ ALWAYSDEFD32(guest_FPROUND),
              /*  6 */ ALWAYSDEFD32(guest_NRADDR),
              /*  7 */ ALWAYSDEFD32(guest_NRADDR_GPR2),
              /*  8 */ ALWAYSDEFD32(guest_REDIR_SP),
              /*  9 */ ALWAYSDEFD32(guest_REDIR_STACK),
              /* 10 */ ALWAYSDEFD32(guest_IP_AT_SYSCALL),
              /* 11 */ ALWAYSDEFD32(guest_C_FPCC)
            }
        };
#define ALWAYSDEFD64(field)                            \
    { offsetof(VexGuestPPC64State, field),             \
      (sizeof ((VexGuestPPC64State*)0)->field) }

VexGuestLayout
   ppc64Guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestPPC64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestPPC64State,guest_GPR1),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestPPC64State,guest_GPR1),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestPPC64State,guest_CIA),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 12,

          .alwaysDefd
          = { /*  0 */ ALWAYSDEFD64(guest_CIA),
              /*  1 */ ALWAYSDEFD64(guest_EMNOTE),
              /*  2 */ ALWAYSDEFD64(guest_CMSTART),
              /*  3 */ ALWAYSDEFD64(guest_CMLEN),
              /*  4 */ ALWAYSDEFD64(guest_VSCR),
              /*  5 */ ALWAYSDEFD64(guest_FPROUND),
              /*  6 */ ALWAYSDEFD64(guest_NRADDR),
              /*  7 */ ALWAYSDEFD64(guest_NRADDR_GPR2),
              /*  8 */ ALWAYSDEFD64(guest_REDIR_SP),
              /*  9 */ ALWAYSDEFD64(guest_REDIR_STACK),
              /* 10 */ ALWAYSDEFD64(guest_IP_AT_SYSCALL),
              /* 11 */ ALWAYSDEFD64(guest_C_FPCC)
            }
        };
UInt copy_paste_abort_dirty_helper(UInt addr, UInt op) {
#  if defined(__powerpc__) && defined(HAS_ISA_3_00)
   /* The copy, paste. and cpabort instructions were introduced in
      ISA 3.0. */
   UInt ret;
   UInt cr;

   if (op == COPY_INST)
      __asm__ __volatile__ (".machine push;\n"
                            ".machine power9;\n"
                            "copy 0,%0;\n"
                            ".machine pop" :: "r" (addr));

   else if (op == PASTE_INST)
      __asm__ __volatile__ (".machine push;\n"
                            ".machine power9;\n"
                            "paste. 0,%0;\n"
                            ".machine pop" :: "r" (addr));

   else if (op == CPABORT_INST)
      __asm__ __volatile__ (".machine push;\n"
                            ".machine power9;\n"
                            "cpabort;\n"
                            ".machine pop");

   else
      /* Unknown operation */
      vassert(0);

   /* Return the CR0 value.  Contains status for the paste instruction. */
   __asm__ __volatile__ ("mfocrf %0,128" : "=r" (cr));
   __asm__ __volatile__ ("srawi %0,%1,28" : "=r" (ret) : "r" (cr));
   /* Make sure the upper bits of the return value are zero per the hack
      described in function dis_copy_paste(). */
   return 0xF & ret;

#  else
   return 0;
#  endif
}
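/* How the CR0 read-back above works: the mfocrf mask 128 (0b10000000)
   selects CR field 0, which lands in bits 31..28 of 'cr'.  The srawi
   by 28 is an arithmetic shift, so a set bit 31 smears into the
   upper bits; the final '0xF &' strips those copies, leaving just
   the 4-bit CR0 value. */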
/*---------------------------------------------------------------*/
/*--- end                                 guest_ppc_helpers.c ---*/
/*---------------------------------------------------------------*/