/*---------------------------------------------------------------*/
/*--- begin                               guest_ppc_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_ppc32.h"
#include "libvex_guest_ppc64.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_ppc_defs.h"
/* This file contains helper functions for ppc32 and ppc64 guest code.
   Calls to these functions are generated by the back end.  These
   calls are of course in the host machine code and this file will be
   compiled to host machine code, so that all makes sense.

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-ppc/toIR.c.
*/
/*---------------------------------------------------------------*/
/*--- Misc integer helpers.                                   ---*/
/*---------------------------------------------------------------*/
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-ppc platforms, return 1. */
/* Reads a complete, consistent 64-bit TB value. */
ULong ppcg_dirtyhelper_MFTB ( void )
{
#  if defined(__powerpc__)
   ULong res;
   UInt  lo, hi1, hi2;
   while (1) {
      __asm__ __volatile__ ("\n"
         "\tmftbu %0\n"
         "\tmftb %1\n"
         "\tmftbu %2\n"
         : "=r" (hi1), "=r" (lo), "=r" (hi2)
      );
      if (hi1 == hi2) break;
   }
   res = ((ULong)hi1) << 32;
   res |= (ULong)lo;
   return res;
#  else
   return 1ULL;
#  endif
}
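
/* Example of the torn read the hi/lo/hi loop above guards against
   (illustrative, not from the ISA text): suppose TBU:TBL is
   0x00000001:0xFFFFFFFF and TBL wraps between the two mftb reads.
   We would read TBU=1 then TBL=0x00000000 and combine them into the
   bogus value 0x1_00000000; re-reading TBU then gives 2 != 1, so the
   loop retries until both TBU samples agree and the combined 64-bit
   value is consistent. */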
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially transparent) */
UInt ppc32g_dirtyhelper_MFSPR_268_269 ( UInt r269 )
{
#  if defined(__powerpc__)
   UInt spr;
   if (r269) {
      __asm__ __volatile__("mfspr %0,269" : "=b"(spr));
   } else {
      __asm__ __volatile__("mfspr %0,268" : "=b"(spr));
   }
   return spr;
#  else
   return 0;
#  endif
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (I'm not really sure what the side effects are) */
UInt ppc32g_dirtyhelper_MFSPR_287 ( void )
{
#  if defined(__powerpc__)
   UInt spr;
   __asm__ __volatile__("mfspr %0,287" : "=b"(spr));
   return spr;
#  else
   return 0;
#  endif
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
void ppc32g_dirtyhelper_LVS ( VexGuestPPC32State* gst,
                              UInt vD_off, UInt sh, UInt shift_right )
{
   static
   UChar ref[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                     0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                     0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
   U128* pU128_src;
   U128* pU128_dst;

   vassert( vD_off      <= sizeof(VexGuestPPC32State)-8 );
   vassert( sh          <= 15 );
   vassert( shift_right <= 1 );

   if (shift_right)
      sh = 16-sh;
   /* else shift left */

   pU128_src = (U128*)&ref[sh];
   pU128_dst = (U128*)( ((UChar*)gst) + vD_off );

   (*pU128_dst)[0] = (*pU128_src)[0];
   (*pU128_dst)[1] = (*pU128_src)[1];
   (*pU128_dst)[2] = (*pU128_src)[2];
   (*pU128_dst)[3] = (*pU128_src)[3];
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
void ppc64g_dirtyhelper_LVS ( VexGuestPPC64State* gst,
                              UInt vD_off, UInt sh, UInt shift_right,
                              UInt endness )
{
   UChar ref[32];
   ULong i;
   Int k;
   /* ref[] used to be a static const array, but this doesn't work on
      ppc64 because VEX doesn't load the TOC pointer for the call here,
      and so we wind up picking up some totally random other data.
      (It's a wonder we don't segfault.)  So, just to be clear, this
      "fix" (vex r2073) is really a kludgearound for the fact that
      VEX's 64-bit ppc code generation doesn't provide a valid TOC
      pointer for helper function calls.  Ick.  (Bug 250038) */
   for (i = 0; i < 32; i++) ref[i] = i;

   U128* pU128_src;
   U128* pU128_dst;

   vassert( vD_off      <= sizeof(VexGuestPPC64State)-8 );
   vassert( sh          <= 15 );
   vassert( shift_right <= 1 );

   if (shift_right)
      sh = 16-sh;
   /* else shift left */

   pU128_src = (U128*)&ref[sh];
   pU128_dst = (U128*)( ((UChar*)gst) + vD_off );

   if ((0x1 & endness) == 0x0) {
      /* Little endian */
      unsigned char *srcp, *dstp;
      srcp = (unsigned char *)pU128_src;
      dstp = (unsigned char *)pU128_dst;
      for (k = 15; k >= 0; k--, srcp++)
         dstp[k] = *srcp;
   } else {
      (*pU128_dst)[0] = (*pU128_src)[0];
      (*pU128_dst)[1] = (*pU128_src)[1];
      (*pU128_dst)[2] = (*pU128_src)[2];
      (*pU128_dst)[3] = (*pU128_src)[3];
   }
}
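
/* Worked example (illustrative): for an lvsl with sh == 3 and
   shift_right == 0, pU128_src points at ref[3], so the permute
   control vector written to vD is the byte sequence
   0x03 0x04 ... 0x12 -- exactly the left-shift-by-3 selector that
   lvsl is defined to produce; lvsr uses the 16-sh mirror image. */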
/* Helper-function specialiser. */

IRExpr* guest_ppc32_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int n_precedingStmts )
{
   return NULL;
}

IRExpr* guest_ppc64_spechelper ( const HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int n_precedingStmts )
{
   return NULL;
}
/* 16-bit floating point number is stored in the lower 16-bits of the
   32-bit value */
#define I16_EXP_MASK       0x7C00
#define I16_FRACTION_MASK  0x03FF
#define I32_EXP_MASK       0x7F800000
#define I32_FRACTION_MASK  0x007FFFFF
#define I64_EXP_MASK       0x7FF0000000000000ULL
#define I64_FRACTION_MASK  0x000FFFFFFFFFFFFFULL
#define V128_EXP_MASK      0x7FFF000000000000ULL
#define V128_FRACTION_MASK 0x0000FFFFFFFFFFFFULL  /* upper 64-bit fractional mask */
ULong generate_C_FPCC_helper( ULong irType, ULong src_hi, ULong src )
{
   UInt NaN, inf, zero, norm, dnorm, pos;
   UInt bit0, bit1, bit2, bit3;
   UInt sign_bit = 0;
   ULong exp_mask = 0, exp_part = 0, frac_part = 0;
   ULong fpcc, c;

   if ( irType == Ity_I16 ) {
      frac_part = I16_FRACTION_MASK & src;
      exp_mask = I16_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 15;

   } else if ( irType == Ity_I32 ) {
      frac_part = I32_FRACTION_MASK & src;
      exp_mask = I32_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 31;

   } else if ( irType == Ity_I64 ) {
      frac_part = I64_FRACTION_MASK & src;
      exp_mask = I64_EXP_MASK;
      exp_part = exp_mask & src;
      sign_bit = src >> 63;

   } else if ( irType == Ity_F128 ) {
      /* only care if the frac part is zero or non-zero */
      frac_part = (V128_FRACTION_MASK & src_hi) | src;
      exp_mask = V128_EXP_MASK;
      exp_part = exp_mask & src_hi;
      sign_bit = src_hi >> 63;
   } else {
      vassert(0);  // Unknown value of irType
   }

   /* NaN: exponent is all ones, fractional part not zero */
   if ((exp_part == exp_mask) && (frac_part != 0))
      NaN = 1;
   else
      NaN = 0;

   /* inf: exponent all 1's, fraction part is zero */
   if ((exp_part == exp_mask) && (frac_part == 0))
      inf = 1;
   else
      inf = 0;

   /* zero: exponent is 0, fraction part is zero */
   if ((exp_part == 0) && (frac_part == 0))
      zero = 1;
   else
      zero = 0;

   /* norm: exponent is not 0, exponent is not all 1's */
   if ((exp_part != 0) && (exp_part != exp_mask))
      norm = 1;
   else
      norm = 0;

   /* dnorm: exponent is all 0's, fraction is not 0 */
   if ((exp_part == 0) && (frac_part != 0))
      dnorm = 1;
   else
      dnorm = 0;

   /* pos: sign bit (MSB) is 0 */
   if (sign_bit == 0)
      pos = 1;
   else
      pos = 0;

   /* calculate FPCC */
   /* If the result is NaN then must force bits 1, 2 and 3 to zero
    * to get the correct result.
    */
   bit0 = NaN | inf;

   bit1 = (!NaN) & zero;
   bit2 = (!NaN) & ((pos & dnorm) | (pos & norm) | (pos & inf))
      & ((!zero) & (!NaN));
   bit3 = (!NaN) & (((!pos) & dnorm) | ((!pos) & norm) | ((!pos) & inf))
      & ((!zero) & (!NaN));

   fpcc = (bit3 << 3) | (bit2 << 2) | (bit1 << 1) | bit0;

   /* calculate C */
   c = NaN | ((!pos) & dnorm) | ((!pos) & zero) | (pos & dnorm);

   /* return C in the upper 32-bits and FPCC in the lower 32 bits */
   return (c << 32) | fpcc;
}
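
/* Worked example (illustrative): for irType == Ity_I64 and
   src == 0x3FF0000000000000 (the double 1.0), exp_part is nonzero
   and not all-ones, so norm = 1 and pos = 1.  That gives
   bit0..bit3 = 0,0,1,0, i.e. FPCC = 0b0100, and C = 0, so the
   helper returns 0x0000000000000004. */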
UInt generate_DFP_FPRF_value_helper( UInt gfield,
                                     ULong exponent,
                                     UInt exponent_bias,
                                     Int min_norm_exp,
                                     UInt sign,
                                     UInt T_value_is_zero )
{
   UInt gfield_5_bit_mask = 0xF8000000;
   UInt gfield_upper_5_bits = (gfield & gfield_5_bit_mask) >> (32 - 5);
   UInt gfield_6_bit_mask = 0xFC000000;
   UInt gfield_upper_6_bits = (gfield & gfield_6_bit_mask) >> (32 - 6);
   UInt fprf_value = 0;
   Int  unbiased_exponent = exponent - exponent_bias;

   /* The assumption is the gfield bits are left justified.  Mask off
      the most significant 5 (or 6) bits in the 32-bit wide field. */
   if ( T_value_is_zero == 1) {
      if (sign == 0)
         fprf_value = 0b00010;   // positive zero
      else
         fprf_value = 0b10010;   // negative zero
   } else if ( unbiased_exponent < min_norm_exp ) {
      if (sign == 0)
         fprf_value = 0b10100;   // positive subnormal
      else
         fprf_value = 0b11000;   // negative subnormal

   } else if ( gfield_upper_5_bits == 0b11110 ) {   // infinity
      if (sign == 0)
         fprf_value = 0b00101;   // positive infinity
      else
         fprf_value = 0b01001;   // negative infinity

   } else if ( gfield_upper_6_bits == 0b111110 ) {
      fprf_value = 0b10001;      // Quiet NaN

   } else if ( gfield_upper_6_bits == 0b111111 ) {
      fprf_value = 0b10001;      // Signaling NaN

   } else {
      if (sign == 0)
         fprf_value = 0b00100;   // positive normal
      else
         fprf_value = 0b01000;   // negative normal
   }

   return fprf_value;
}
/*---------------------------------------------------------------*/
/*--- Misc BCD clean helpers.                                 ---*/
/*---------------------------------------------------------------*/

/* NOTE, the clean and dirty helpers need to be called using the
 * fnptr_to_fnentry() function wrapper to handle the Big Endian
 * pointer-to-function ABI and the Little Endian ABI.
 */

/* This C-helper takes a 128-bit BCD value as two 64-bit pieces.
 * It checks the string to see if it is a valid 128-bit BCD value.
 * A valid BCD value has a sign value in bits [3:0] between 0xA
 * and 0xF inclusive.  Each of the BCD digits, represented as a 4-bit
 * hex number in bits BCD value[128:4], must be between 0 and 9
 * inclusive.  Returns an unsigned 64-bit value if valid.
 */
ULong is_BCDstring128_helper( ULong Signed, ULong bcd_string_hi,
                              ULong bcd_string_low ) {
   Int i;
   ULong valid_bcd, sign_valid = False;
   ULong digit;
   UInt  sign;

   if ( Signed == True ) {
      sign = bcd_string_low & 0xF;
      if( ( sign >= 0xA ) && ( sign <= 0xF ) )
         sign_valid = True;

      /* Change the sign digit to a zero
       * so the for loop below works the same
       * for signed and unsigned BCD strings
       */
      bcd_string_low &= 0xFFFFFFFFFFFFFFF0ULL;

   } else {
      sign_valid = True;   /* set sign to True so result is only
                              based on the validity of the digits */
   }

   valid_bcd = True;   // Assume true to start
   for( i = 0; i < 32; i++ ) {
      /* check high and low 64-bit strings in parallel */
      digit = bcd_string_low & 0xF;
      if ( digit > 0x9 )
         valid_bcd = False;
      bcd_string_low = bcd_string_low >> 4;

      digit = bcd_string_hi & 0xF;
      if ( digit > 0x9 )
         valid_bcd = False;
      bcd_string_hi = bcd_string_hi >> 4;
   }

   return valid_bcd & sign_valid;
}
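
/* Example (illustrative): with Signed == True, bcd_string_hi == 0x0
   and bcd_string_low == 0x123D is valid (sign nibble 0xD, digits
   1,2,3), so the helper returns 1.  Changing the low word to 0x12AD
   makes digit 0xA invalid and the result 0. */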
/* This clean helper takes a signed 32-bit BCD value and a carry in
 * and adds 1 to the value of the BCD value.  The BCD value is passed
 * in as a single 64-bit value.  The incremented value is returned in
 * the lower 32 bits of the result.  If the input was signed, the sign of
 * the result is the same as the input.  The carry out is returned in
 * bits [35:32] of the result.
 */
ULong increment_BCDstring32_helper( ULong Signed,
                                    ULong bcd_string, ULong carry_in ) {
   UInt i, num_digits = 8;
   ULong bcd_value, result = 0;
   ULong carry, digit, new_digit;

   carry = carry_in;

   if ( Signed == True ) {
      bcd_value = bcd_string >> 4;   /* remove sign */
      num_digits = num_digits - 1;
   } else {
      bcd_value = bcd_string;
   }

   for( i = 0; i < num_digits; i++ ) {
      digit = bcd_value & 0xF;
      bcd_value = bcd_value >> 4;
      new_digit = digit + carry;

      if ( new_digit >= 10 ) {
         carry = 1;
         new_digit = new_digit - 10;

      } else {
         carry = 0;
      }
      result = result | ( new_digit << (i*4) );
   }

   if ( Signed == True ) {
      result = ( carry << 32 ) | ( result << 4 ) | ( bcd_string & 0xF );
   } else {
      result = ( carry << 32 ) | result;
   }

   return result;
}
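
/* Example (illustrative): Signed == True, bcd_string == 0x19C
   (digits "19", positive sign nibble 0xC), carry_in == 1.  Digit
   9+1 wraps to 0 with a carry, digit 1 becomes 2, giving digits
   "20"; the sign nibble is re-attached, so the helper returns
   0x20C with a zero carry-out in bits [35:32]. */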
/*---------------------------------------------------------------*/
/*--- Misc packed decimal clean helpers.                      ---*/
/*---------------------------------------------------------------*/

/* This C-helper takes a 64-bit packed decimal value stored in a
 * 64-bit value.  It converts it to the zoned decimal format.  The lower
 * byte may contain a sign value; set it to zero.  If return_upper
 * is zero, return the lower 64 bits of the result, otherwise return
 * the upper 64 bits of the result.
 */
ULong convert_to_zoned_helper( ULong src_hi, ULong src_low,
                               ULong upper_byte, ULong return_upper ) {
   UInt i, sh;
   ULong tmp = 0, new_value;

   /* Remove the sign from the source.  Put in the upper byte of result.
    * Sign inserted later.
    */
   if ( return_upper == 0 ) {   /* return lower 64-bit result */
      for( i = 0; i < 7; i++ ) {
         sh = ( 8 - i ) * 4;
         new_value = ( ( src_low >> sh ) & 0xf ) | upper_byte;
         tmp = tmp | ( new_value << ( ( 7 - i ) * 8 ) );
      }

   } else {
      /* Byte for i=0 is in upper 64-bit of the source, do it separately */
      new_value = ( src_hi & 0xf ) | upper_byte;
      tmp = tmp | new_value << 56;

      for( i = 1; i < 8; i++ ) {
         sh = ( 16 - i ) * 4;
         new_value = ( ( src_low >> sh ) & 0xf ) | upper_byte;
         tmp = tmp | ( new_value << ( ( 7 - i ) * 8 ) );
      }
   }
   return tmp;
}
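
/* Example (illustrative): with upper_byte == 0x30 (an ASCII-style
   zone), each 4-bit packed digit d becomes the zoned byte 0x30|d,
   so packed digits 1,2,3,4 come out as the bytes '1' '2' '3' '4'
   of the zoned representation; the sign byte is inserted by the
   caller afterwards. */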
/* This C-helper takes the lower 64-bits of the 128-bit packed decimal
 * src value.  It converts the src value to a 128-bit national format.
 * If return_upper is zero, the helper returns the lower 64 bits of the
 * result, otherwise it returns the upper 64-bits of the result.
 */
ULong convert_to_national_helper( ULong src, ULong return_upper ) {

   UInt i;
   UInt sh = 3, max = 4, min = 0;   /* initialize max, min for return upper */
   ULong tmp = 0, new_value;

   if ( return_upper == 0 ) {   /* return lower 64-bit result */
      min = 4;
      max = 7;
      sh  = 7;
   }

   for( i = min; i < max; i++ ) {
      new_value = ( ( src >> ( ( 7 - i ) * 4 ) ) & 0xf ) | 0x0030;
      tmp = tmp | ( new_value << ( ( sh - i ) * 16 ) );
   }
   return tmp;
}
/* This C-helper takes a 128-bit zoned value stored in a 128-bit
 * value.  It converts it to the packed 64-bit decimal format without
 * a sign value.  The sign is supposed to be in bits [3:0] and the packed
 * value in bits [67:4].  This helper leaves it to the caller to put the
 * result into a V128, shift the returned value over and put the sign
 * in.
 */
ULong convert_from_zoned_helper( ULong src_hi, ULong src_low ) {
   UInt i;
   ULong tmp = 0, nibble;

   /* Unroll the i = 0 iteration so the sizes of the loop for the upper
    * and lower extraction match.  Skip sign in least significant byte.
    */
   nibble = ( src_hi >> 56 ) & 0xF;
   tmp = tmp | ( nibble << 60 );

   for( i = 1; i < 8; i++ ) {
      /* get the high nibbles, put into result */
      nibble = ( src_hi >> ( ( 7 - i ) * 8 ) ) & 0xF;
      tmp = tmp | ( nibble << ( ( 15 - i ) * 4 ) );

      /* get the low nibbles, put into result */
      nibble = ( src_low >> ( ( 8 - i ) * 8 ) ) & 0xF;
      tmp = tmp | ( nibble << ( ( 8 - i ) * 4 ) );
   }
   return tmp;
}
/* This C-helper takes a 128-bit national value stored in a 128-bit
 * value.  It converts it to a signless packed 64-bit decimal format.
 */
ULong convert_from_national_helper( ULong src_hi, ULong src_low ) {
   UInt i;
   ULong tmp = 0, hword;

   src_low = src_low & 0xFFFFFFFFFFFFFFF0ULL;   /* remove the sign */

   for( i = 0; i < 4; i++ ) {
      /* get the high half-word, put into result */
      hword = ( src_hi >> ( ( 3 - i ) * 16 ) ) & 0xF;
      tmp = tmp | ( hword << ( ( 7 - i ) * 4 ) );

      /* get the low half-word, put into result */
      hword = ( src_low >> ( ( 3 - i ) * 16 ) ) & 0xF;
      tmp = tmp | ( hword << ( ( 3 - i ) * 4 ) );
   }
   return tmp;
}
/*------------------------------------------------*/
/*--- Population count  --------------------------*/
/*------------------------------------------------*/
ULong population_count64_helper( ULong src ) {
   /* Fast population count based on the algorithm in "Hacker's Delight"
      by Henry S. Warren. */
   src = (src & 0x5555555555555555) + ((src >> 1) & 0x5555555555555555);
   src = (src & 0x3333333333333333) + ((src >> 2) & 0x3333333333333333);
   src = (src & 0x0F0F0F0F0F0F0F0F) + ((src >> 4) & 0x0F0F0F0F0F0F0F0F);
   src = (src & 0x00FF00FF00FF00FF) + ((src >> 8) & 0x00FF00FF00FF00FF);
   src = (src & 0x0000FFFF0000FFFF) + ((src >> 16) & 0x0000FFFF0000FFFF);
   src = (src & 0x00000000FFFFFFFF) + ((src >> 32) & 0x00000000FFFFFFFF);
   return src & 0x7F;   /* count can be up to 64, which needs 7 bits */
}
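
/* Illustrative trace of the first fold on the byte 0b11010110: the
   bit pairs (11)(01)(01)(10) become the 2-bit sums 10,01,01,01,
   i.e. each pair now holds its own popcount.  Five more folds
   double the field width each time until the whole 64-bit count
   (at most 64, hence the 7-bit final mask) sits in the low byte. */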
/*------------------------------------------------*/
/*---- Extract/Deposit bits under mask helpers ---*/
/*------------------------------------------------*/
ULong extract_bits_under_mask_helper( ULong src, ULong mask, UInt flag ) {

   UInt i;
   ULong ones, zeros, mask_bit, bit_src;

   zeros = 0;
   ones = 0;

   for (i = 0; i < 64; i++) {
      mask_bit = 0x1 & (mask >> (63-i));
      bit_src  = 0x1 & (src >> (63-i));

      ones = ones << mask_bit;
      ones = ones | (mask_bit & bit_src);

      zeros = zeros << (1^mask_bit);
      zeros = zeros | ((1^mask_bit) & bit_src);
   }

   if (flag == 1)
      return ones;
   else
      return zeros;
}
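
/* Example (illustrative): src == 0xA5, mask == 0xF0.  The four src
   bits under the mask (0xA) are gathered, right-justified, into
   "ones", so flag == 1 returns 0xA.  The remaining src bits (the
   low nibble 0x5 plus the zero bits above) are gathered into
   "zeros", so flag == 0 returns 0x5.  This is the pextd-style
   extract behaviour the helper models. */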
UInt count_bits_under_mask_helper( ULong src, ULong mask, UInt flag ) {

   UInt i, count_extracted_1, count_extracted_0;
   ULong mask_bit;

   count_extracted_1 = 0;
   count_extracted_0 = 0;

   for (i = 0; i < 64; i++) {
      mask_bit = 0x1 & (mask >> (63-i));

      if (mask_bit == 1)
         count_extracted_1++;

      if ((1^mask_bit) == 1)
         count_extracted_0++;
   }

   if (flag == 1)
      return count_extracted_1;
   else
      return count_extracted_0;
}
ULong deposit_bits_under_mask_helper( ULong src, ULong mask ) {

   UInt i, src_bit_pos;
   ULong result, mask_bit, bit_src;

   result = 0;
   src_bit_pos = 0;

   for (i = 0; i < 64; i++) {
      mask_bit = 0x1 & (mask >> i);

      if (mask_bit == 1) {
         bit_src = 0x1 & (src >> src_bit_pos);
         result = result | (bit_src << i);
         src_bit_pos++;
      }
   }
   return result;
}
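
/* Example (illustrative): src == 0xA (low bits 1010), mask == 0xF0.
   The four low src bits are scattered into the four mask positions,
   LSB first, giving result == 0xA0 -- the pdepd-style inverse of
   the extract helper above. */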
/*----------------------------------------------*/
/*--- Vector Evaluate Inst helper --------------*/
/*----------------------------------------------*/
/* This is a 64-bit version of the VSX Vector Evaluate
   instruction xxeval. */

ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC,
                                ULong IMM ) {
#define MAX_BITS 64
#define MAX_IMM_BITS 8
   UInt i, select;
   ULong bitA, bitB, bitC, result;
   ULong bitIMM;

   result = 0;

   for (i = 0; i < MAX_BITS; i++) {
      bitA = 0x1 & (srcA >> i);
      bitB = 0x1 & (srcB >> i);
      bitC = 0x1 & (srcC >> i);

      /* The value of select is IBM numbering based, i.e. MSB is bit 0 */
      select = (bitA << 2) | (bitB << 1) | bitC;
      bitIMM = (IMM >> (MAX_IMM_BITS - 1 - select)) & 0x1;
      result = result | (bitIMM << i);
   }
   return result;
#undef MAX_BITS
#undef MAX_IMM_BITS
}
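
/* Example (illustrative): IMM is an 8-entry truth table indexed by
   (bitA,bitB,bitC) in IBM bit order.  With IMM == 0x01 only the
   select == 7 entry (all three bits set) reads a 1, so the helper
   computes srcA & srcB & srcC; IMM == 0x7F (every entry except
   select == 0) similarly yields srcA | srcB | srcC. */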
/*---------------------------------------------------------------*/
/*--- Clean helper for vbpermq instruction                    ---*/
/*---------------------------------------------------------------*/
UInt vbpermq_clean_helper( ULong vA_high, ULong vA_low, ULong vB ) {
   ULong bit, result = 0x0;
   UInt i, index;

   /* IBM numbering: bit 0 is the MSB, bit 63 is the LSB */
   for ( i = 0; i < 8; i++) {
      index = 0xFFULL & (vB >> (56 - 8*i) );

      if (index < 64) {
         bit = 0x1 & (vA_high >> (63 - index));

      } else if (index < 128) {
         bit = 0x1 & (vA_low >> (127 - index));

      } else
         bit = 0;

      result |= bit << (7 - i);
   }
   return result;
}
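
/* Example (illustrative): if the top byte of vB is 0, the selected
   bit is vA bit 0 in IBM numbering (the MSB of vA_high) and it lands
   in result bit 7; any index of 128 or more contributes a 0 bit. */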
/*--------------------------------------------------*/
/*---- VSX Vector Generate PCV from Mask helpers ---*/
/*--------------------------------------------------*/
static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset,
                             ULong *vsx_entry)
{
   U128* pU128_dst;
   pU128_dst = (U128*) (((UChar*) gst) + reg_offset);

   /* The U128 type is defined as an array of unsigned integers. */
   /* Writing in LE order */
   (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF);
   (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32);
   (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF);
   (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32);
   return;
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,
                                            ULong src_hi, ULong src_lo,
                                            UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register. */

   UInt  i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering, zero on the
      left and N-1 on the right.  The loop index is converted to "i" to
      match the algorithm, for clarity of matching the C code to the
      algorithm in the ISA. */

   if (imm == 0b00) {    // big endian expansion
      for( index = 0; index < 16; index++) {
         i = 15 - index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= j << shift_by;
            j++;
         } else {
            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse byte elements in a source vector
         specified by the byte-element mask in VSR[VRB+32] into the
         leftmost byte elements of a result vector. */
      for( index = 0; index < 16; index++) {
         i = 15 - index;
         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 8)
               result[1] |= (index) << (15 - j)*8;
            else
               result[0] |= (index) << (7 - j)*8;
            j++;
         }
      }
      /* The algorithm says set to undefined, leave as 0
      for( index = 3 - j; index < 4; index++) {
         result |= (0 << (index*8));
      }
      */

   } else if (imm == 0b10) {    //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost byte elements of a source vector into
         the byte elements of a result vector specified by the byte-element
         mask in VSR[VRB+32]. */
      for( index = 0; index < 16; index++) {
         i = index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         /* mod shift amount by 8 since src is either the upper or lower
            64-bits. */
         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= j << shift_by;
            j++;
         } else {
            result[half_sel] |= (index + (unsigned long long)0x10) << shift_by;
         }
      }

   } else if (imm == 0b11) {    //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse byte elements in a source vector
         specified by the byte-element mask in VSR[VRB+32] into the
         rightmost byte elements of a result vector. */
      for( index = 0; index < 16; index++) {
         i = index;

         shift_by = i*8;

         if ( i >= 8) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 7;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 8)
               result[0] |= (index) << (j-8)*8;
            else
               result[1] |= (index) << j*8;
            j++;
         }
      }
      /* The algorithm says set to undefined, leave as 0
      for( index = 3 - j; index < 4; index++) {
         result |= (0 << (index*8));
      }
      */

   } else {
      vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
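
/* Example (illustrative): for imm == 0b00 (big-endian expansion)
   with only byte element 0 selected (MSB of src_hi set), the pcv
   comes out as the bytes 0x00,0x11,0x12,...,0x1F: element 0 takes
   source byte 0 and every unselected element indexes byte
   0x10+index, i.e. a byte of the second permute source. */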
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,
                                             ULong src_hi, ULong src_lo,
                                             UInt reg_offset,
                                             UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register. */
   UInt  i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering, zero on the
      left and N-1 on the right.  The loop index is converted to "i" to
      match the algorithm, for clarity of matching the C code to the
      algorithm in the ISA. */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost halfword elements of a source vector
         into the halfword elements of a result vector specified by the
         halfword-element mask in VSR[VRB+32]. */
      for( index = 0; index < 8; index++) {
         i = 7 - index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            // half-word i, byte 0
            result[half_sel] |= (2*j + 0x0) << (shift_by+8);
            // half-word i, byte 1
            result[half_sel] |= (2*j + 0x1) << shift_by;
            j++;
         } else {
            result[half_sel] |= (2*index + 0x10) << (shift_by+8);
            result[half_sel] |= (2*index + 0x11) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse halfword elements in a source vector
         specified by the halfword-element mask in VSR[VRB+32] into the
         leftmost halfword elements of a result vector. */
      for( index = 0; index < 8; index++) {
         i = 7 - index;

         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 4) {
               // half-word i, byte 0
               result[1] |= (2*index + 0x0) << ((7 - j)*16 + 8);
               // half-word i, byte 1
               result[1] |= (2*index + 0x1) << ((7 - j)*16);
            } else {
               // half-word i, byte 0
               result[0] |= (2*index + 0x0) << ((3 - j)*16 + 8);
               // half-word i, byte 1
               result[0] |= (2*index + 0x1) << ((3 - j)*16);
            }
            j++;
         }
      }

   } else if (imm == 0b10) {    //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost halfword elements of a source vector
         into the halfword elements of a result vector specified by the
         halfword-element mask in VSR[VRB+32]. */
      for( index = 0; index < 8; index++) {
         i = index;
         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            // half-word i, byte 0
            result[half_sel] |= (2*j + 0x00) << shift_by;
            // half-word i, byte 1
            result[half_sel] |= (2*j + 0x01) << (shift_by+8);
            j++;

         } else {
            // half-word i, byte 0
            result[half_sel] |= (2*index + 0x10) << shift_by;
            // half-word i, byte 1
            result[half_sel] |= (2*index + 0x11) << (shift_by+8);
         }
      }

   } else if (imm == 0b11) {    //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse halfword elements in a source vector
         specified by the halfword-element mask in VSR[VRB+32] into the
         rightmost halfword elements of a result vector. */
      for( index = 0; index < 8; index++) {
         i = index;
         shift_by = i*16;

         if ( i >= 4) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 15;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 4) {
               // half-word j, byte 0
               result[0] |= (2*index + 0x0) << ((j-4)*16);
               // half-word j, byte 1
               result[0] |= (2*index + 0x1) << ((j-4)*16+8);
            } else {
               // half-word j, byte 0
               result[1] |= (2*index + 0x0) << (j*16);
               // half-word j, byte 1
               result[1] |= (2*index + 0x1) << ((j*16)+8);
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_hword_dirty_mask_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,
                                            ULong src_hi, ULong src_lo,
                                            UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register. */
   UInt  i, shift_by, sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering, zero on the
      left and N-1 on the right.  The loop index is converted to "i" to
      match the algorithm, for clarity of matching the C code to the
      algorithm in the ISA. */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost word elements of a source vector into the
         word elements of a result vector specified by the word-element mask
         in VSR[VRB+32]. */
      for( index = 0; index < 4; index++) {
         i = 3 - index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (4*j+0) << (shift_by+24);   // word i, byte 0
            result[half_sel] |= (4*j+1) << (shift_by+16);   // word i, byte 1
            result[half_sel] |= (4*j+2) << (shift_by+8);    // word i, byte 2
            result[half_sel] |= (4*j+3) << shift_by;        // word i, byte 3
            j++;
         } else {
            result[half_sel] |= (4*index + 0x10) << (shift_by+24);
            result[half_sel] |= (4*index + 0x11) << (shift_by+16);
            result[half_sel] |= (4*index + 0x12) << (shift_by+8);
            result[half_sel] |= (4*index + 0x13) << shift_by;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse word elements in a source vector
         specified by the word-element mask in VSR[VRB+32] into the leftmost
         word elements of a result vector. */
      for( index = 0; index < 4; index++) {
         i = 3 - index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 2) {
               // word j, byte 0
               result[1] |= (4*index+0) << ((3 - j)*32 + 24);
               // word j, byte 1
               result[1] |= (4*index+1) << ((3 - j)*32 + 16);
               // word j, byte 2
               result[1] |= (4*index+2) << ((3 - j)*32 + 8);
               // word j, byte 3
               result[1] |= (4*index+3) << ((3 - j)*32 + 0);
            } else {
               result[0] |= (4*index+0) << ((1 - j)*32 + 24);
               result[0] |= (4*index+1) << ((1 - j)*32 + 16);
               result[0] |= (4*index+2) << ((1 - j)*32 + 8);
               result[0] |= (4*index+3) << ((1 - j)*32 + 0);
            }
            j++;
         }
      }

   } else if (imm == 0b10) {    //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost word elements of a source vector into the
         word elements of a result vector specified by the word-element mask
         in VSR[VRB+32]. */
      for( index = 0; index < 4; index++) {
         i = index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (4*j+0) << (shift_by + 0);    // word j, byte 0
            result[half_sel] |= (4*j+1) << (shift_by + 8);    // word j, byte 1
            result[half_sel] |= (4*j+2) << (shift_by + 16);   // word j, byte 2
            result[half_sel] |= (4*j+3) << (shift_by + 24);   // word j, byte 3
            j++;
         } else {
            result[half_sel] |= (4*index + 0x10) << (shift_by + 0);
            result[half_sel] |= (4*index + 0x11) << (shift_by + 8);
            result[half_sel] |= (4*index + 0x12) << (shift_by + 16);
            result[half_sel] |= (4*index + 0x13) << (shift_by + 24);
         }
      }

   } else if (imm == 0b11) {    //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse word elements in a source vector
         specified by the word-element mask in VSR[VRB+32] into the rightmost
         word elements of a result vector. */
      for( index = 0; index < 4; index++) {
         i = index;

         shift_by = i*32;

         if ( i >= 2) {
            src = src_hi;
            shift_by = shift_by - 64;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = shift_by + 31;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j >= 2) {
               // word j, byte 0
               result[0] |= (4*index + 0x0) << ((j-2)*32+0);
               // word j, byte 1
               result[0] |= (4*index + 0x1) << ((j-2)*32+8);
               // word j, byte 2
               result[0] |= (4*index + 0x2) << ((j-2)*32+16);
               // word j, byte 3
               result[0] |= (4*index + 0x3) << ((j-2)*32+24);
            } else {
               result[1] |= (4*index + 0x0) << (j*32+0);
               result[1] |= (4*index + 0x1) << (j*32+8);
               result[1] |= (4*index + 0x2) << (j*32+16);
               result[1] |= (4*index + 0x3) << (j*32+24);
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/* CALLED FROM GENERATED CODE */
void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,
                                             ULong src_hi, ULong src_lo,
                                             UInt reg_offset, UInt imm ) {
   /* The function computes the 128-bit result then writes it directly
      into the guest state VSX register. */
   UInt  sel_shift_by, half_sel;
   ULong index, src, result[2];
   ULong j, i;

   result[0] = 0;
   result[1] = 0;
   j = 0;

   /* The algorithm in the ISA is written with IBM numbering, zero on the
      left and N-1 on the right.  The loop index is converted to "i" to
      match the algorithm, for clarity of matching the C code to the
      algorithm in the ISA. */

   if (imm == 0b00) {    // big endian expansion
      /* If IMM=0b00000, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement an
         expansion of the leftmost doubleword elements of a source vector
         into the doubleword elements of a result vector specified by the
         doubleword-element mask in VSR[VRB+32]. */
      for( index = 0; index < 2; index++) {
         i = 1 - index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (8*j + 0x0) << 56;   // dword i, byte 0
            result[half_sel] |= (8*j + 0x1) << 48;   // dword i, byte 1
            result[half_sel] |= (8*j + 0x2) << 40;   // dword i, byte 2
            result[half_sel] |= (8*j + 0x3) << 32;   // dword i, byte 3
            result[half_sel] |= (8*j + 0x4) << 24;   // dword i, byte 4
            result[half_sel] |= (8*j + 0x5) << 16;   // dword i, byte 5
            result[half_sel] |= (8*j + 0x6) << 8;    // dword i, byte 6
            result[half_sel] |= (8*j + 0x7) << 0;    // dword i, byte 7
            j++;
         } else {
            result[half_sel] |= (8*index + 0x10) << 56;
            result[half_sel] |= (8*index + 0x11) << 48;
            result[half_sel] |= (8*index + 0x12) << 40;
            result[half_sel] |= (8*index + 0x13) << 32;
            result[half_sel] |= (8*index + 0x14) << 24;
            result[half_sel] |= (8*index + 0x15) << 16;
            result[half_sel] |= (8*index + 0x16) << 8;
            result[half_sel] |= (8*index + 0x17) << 0;
         }
      }

   } else if (imm == 0b01) {    // big endian compression
      /* If IMM=0b00001, let pcv be the permute control vector required to
         enable a left-indexed permute (vperm or xxperm) to implement a
         compression of the sparse doubleword elements in a source vector
         specified by the doubleword-element mask in VSR[VRB+32] into the
         leftmost doubleword elements of a result vector. */
      for( index = 0; index < 2; index++) {
         i = 1 - index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            if (j == 1) {
               result[1] |= (8*index + 0x0) << 56;   // double-word j, byte 0
               result[1] |= (8*index + 0x1) << 48;   // double-word j, byte 1
               result[1] |= (8*index + 0x2) << 40;   // double-word j, byte 2
               result[1] |= (8*index + 0x3) << 32;   // double-word j, byte 3
               result[1] |= (8*index + 0x4) << 24;   // double-word j, byte 4
               result[1] |= (8*index + 0x5) << 16;   // double-word j, byte 5
               result[1] |= (8*index + 0x6) << 8;    // double-word j, byte 6
               result[1] |= (8*index + 0x7) << 0;    // double-word j, byte 7
            } else {
               result[0] |= (8*index + 0x0) << 56;   // double-word j, byte 0
               result[0] |= (8*index + 0x1) << 48;   // double-word j, byte 1
               result[0] |= (8*index + 0x2) << 40;   // double-word j, byte 2
               result[0] |= (8*index + 0x3) << 32;   // double-word j, byte 3
               result[0] |= (8*index + 0x4) << 24;   // double-word j, byte 4
               result[0] |= (8*index + 0x5) << 16;   // double-word j, byte 5
               result[0] |= (8*index + 0x6) << 8;    // double-word j, byte 6
               result[0] |= (8*index + 0x7) << 0;    // double-word j, byte 7
            }
            j++;
         }
      }

   } else if (imm == 0b10) {    //little-endian expansion
      /* If IMM=0b00010, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement an
         expansion of the rightmost doubleword elements of a source vector
         into the doubleword elements of a result vector specified by the
         doubleword-element mask in VSR[VRB+32]. */
      for( index = 0; index < 2; index++) {
         i = index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if ( ((src >> sel_shift_by) & 0x1) == 1) {
            result[half_sel] |= (8*j+0) << 0;    // double-word i, byte 0
            result[half_sel] |= (8*j+1) << 8;    // double-word i, byte 1
            result[half_sel] |= (8*j+2) << 16;   // double-word i, byte 2
            result[half_sel] |= (8*j+3) << 24;   // double-word i, byte 3
            result[half_sel] |= (8*j+4) << 32;   // double-word i, byte 4
            result[half_sel] |= (8*j+5) << 40;   // double-word i, byte 5
            result[half_sel] |= (8*j+6) << 48;   // double-word i, byte 6
            result[half_sel] |= (8*j+7) << 56;   // double-word i, byte 7
            j++;
         } else {
            result[half_sel] |= (8*index + 0x10) << 0;
            result[half_sel] |= (8*index + 0x11) << 8;
            result[half_sel] |= (8*index + 0x12) << 16;
            result[half_sel] |= (8*index + 0x13) << 24;
            result[half_sel] |= (8*index + 0x14) << 32;
            result[half_sel] |= (8*index + 0x15) << 40;
            result[half_sel] |= (8*index + 0x16) << 48;
            result[half_sel] |= (8*index + 0x17) << 56;
         }
      }

   } else if (imm == 0b11) {    //little-endian compression
      /* If IMM=0b00011, let pcv be the permute control vector required to
         enable a right-indexed permute (vpermr or xxpermr) to implement a
         compression of the sparse doubleword elements in a source vector
         specified by the doubleword-element mask in VSR[VRB+32] into the
         rightmost doubleword elements of a result vector. */
      for( index = 0; index < 2; index++) {
         i = index;

         if ( i == 1) {
            src = src_hi;
            half_sel = 0;
         } else {
            src = src_lo;
            half_sel = 1;
         }

         sel_shift_by = 63;

         if (((src >> sel_shift_by) & 0x1) == 1) {
            if (j == 1) {
               result[0] |= (8*index + 0x0) << 0;    // double-word j, byte 0
               result[0] |= (8*index + 0x1) << 8;    // double-word j, byte 1
               result[0] |= (8*index + 0x2) << 16;   // double-word j, byte 2
               result[0] |= (8*index + 0x3) << 24;   // double-word j, byte 3
               result[0] |= (8*index + 0x4) << 32;   // double-word j, byte 4
               result[0] |= (8*index + 0x5) << 40;   // double-word j, byte 5
               result[0] |= (8*index + 0x6) << 48;   // double-word j, byte 6
               result[0] |= (8*index + 0x7) << 56;   // double-word j, byte 7
            } else {
               result[1] |= (8*index + 0x0) << 0;
               result[1] |= (8*index + 0x1) << 8;
               result[1] |= (8*index + 0x2) << 16;
               result[1] |= (8*index + 0x3) << 24;
               result[1] |= (8*index + 0x4) << 32;
               result[1] |= (8*index + 0x5) << 40;
               result[1] |= (8*index + 0x6) << 48;
               result[1] |= (8*index + 0x7) << 56;
            }
            j++;
         }
      }

   } else {
      vex_printf("ERROR, vector_gen_pvc_dword_mask_helper, imm value %u not supported.\n",
                 imm);
      vassert(0);
   }
   write_VSX_entry( gst, reg_offset, result);
}
/*------------------------------------------------*/
/*---- VSX Matrix signed integer GER functions ---*/
/*------------------------------------------------*/
static UInt exts4( UInt src )
{
   /* Input is a 4-bit value.  Extend bit 3 to bits [31:4] */
   if (( src >> 3 ) & 0x1)
      return src | 0xFFFFFFF0;   /* sign bit is a 1, extend */
   else
      return src & 0xF;          /* make sure high order bits are zero */
}

static ULong exts8( UInt src )
{
   /* Input is an 8-bit value.  Extend bit 7 to bits [63:8] */
   if (( src >> 7 ) & 0x1)
      return src | 0xFFFFFFFFFFFFFF00ULL;   /* sign bit is a 1, extend */
   else
      return src & 0xFF;         /* make sure high order bits are zero */
}

static ULong extz8( UInt src )
{
   /* Input is an 8-bit value.  Extend src on the left with zeros. */
   return src & 0xFF;            /* make sure high order bits are zero */
}

static ULong exts16to64( UInt src )
{
   /* Input is a 16-bit value.  Extend bit 15 to bits [63:16] */
   if (( src >> 15 ) & 0x1)
      return ((ULong) src) | 0xFFFFFFFFFFFF0000ULL;   /* sign is 1, extend */
   else
      /* make sure high order bits are zero */
      return ((ULong) src) & 0xFFFFULL;
}

static UInt chop64to32( Long src ) {
   /* Take a 64-bit input, return the lower 32-bits */
   return (UInt)(0xFFFFFFFF & src);
}

static UInt clampS64toS32( Long src ) {
   /* Take a 64-bit signed input; clamp positive values to 2^31-1 and
      negative values to -(2^31-1).  Return the result in an
      unsigned 32-bit value. */
   Long max_val = 2147483647;   // 2^31-1
   if ( src > max_val)
      return (UInt)max_val;

   if (src < -max_val)
      return (UInt)-max_val;

   return (UInt)src;
}
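
/* Example (illustrative): clampS64toS32(0x100000000LL) saturates to
   0x7FFFFFFF, while clampS64toS32(-5) falls inside the clamp range
   and passes through as the 32-bit two's complement pattern
   0xFFFFFFFB. */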
void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, UInt reg,
                      UInt *acc_word)
{
   U128* pU128_dst;

   vassert(acc < 8);
   vassert(reg < 4);

   pU128_dst = (U128*) (((UChar*)gst) + offset + acc*4*sizeof(U128)
                        + reg*sizeof(U128));

   /* The U128 type is defined as an array of unsigned integers. */
   (*pU128_dst)[0] = acc_word[0];
   (*pU128_dst)[1] = acc_word[1];
   (*pU128_dst)[2] = acc_word[2];
   (*pU128_dst)[3] = acc_word[3];
   return;
}

void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc, UInt reg,
                    UInt *acc_word)
{
   U128* pU128_src;

   acc_word[3] = 0xDEAD;
   acc_word[2] = 0xBEEF;
   acc_word[1] = 0xBAD;
   acc_word[0] = 0xBEEF;

   vassert(acc < 8);
   vassert(reg < 4);

   pU128_src = (U128*) (((UChar*)gst) + offset + acc*4*sizeof(U128)
                        + reg*sizeof(U128));

   /* The U128 type is defined as an array of unsigned integers. */
   acc_word[0] = (*pU128_src)[0];
   acc_word[1] = (*pU128_src)[1];
   acc_word[2] = (*pU128_src)[2];
   acc_word[3] = (*pU128_src)[3];
   return;
}
void vsx_matrix_4bit_ger_dirty_helper ( VexGuestPPC64State* gst,
                                        UInt offset_ACC,
                                        ULong srcA_hi, ULong srcA_lo,
                                        ULong srcB_hi, ULong srcB_lo,
                                        UInt masks_inst )
{
   /* This helper calculates the result for one of the four ACC entries.
      It is called twice, to get the hi and then the low 64-bit of the
      128-bit result. */
   UInt i, j, mask, sum, inst, acc_entry, prefix_inst;

   UInt srcA_nibbles[4][8];   /* word, nibble */
   UInt srcB_nibbles[4][8];   /* word, nibble */
   UInt acc_word[4];
   UInt prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
   UInt result[4];
   UInt pmsk = 0;
   UInt xmsk = 0;
   UInt ymsk = 0;

   mask = 0xF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      pmsk = 0b11111111;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 22) & 0xFF;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address nibbles using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      // input is in double words
      for( j = 0; j < 8; j++) {
         srcA_nibbles[3][j] = (srcA_hi >> (60-4*j)) & mask;   // hi bits [63:32]
         srcA_nibbles[2][j] = (srcA_hi >> (28-4*j)) & mask;   // hi bits [31:0]
         srcA_nibbles[1][j] = (srcA_lo >> (60-4*j)) & mask;   // lo bits [63:32]
         srcA_nibbles[0][j] = (srcA_lo >> (28-4*j)) & mask;   // lo bits [31:0]

         srcB_nibbles[3][j] = (srcB_hi >> (60-4*j)) & mask;
         srcB_nibbles[2][j] = (srcB_hi >> (28-4*j)) & mask;
         srcB_nibbles[1][j] = (srcB_lo >> (60-4*j)) & mask;
         srcB_nibbles[0][j] = (srcB_lo >> (28-4*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 7) & 0x1) == 0)
               prod0 = 0;
            else
               prod0 = exts4( srcA_nibbles[i][0] )
                  * exts4( srcB_nibbles[j][0] );

            if (((pmsk >> 6) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 = exts4( srcA_nibbles[i][1] )
                  * exts4( srcB_nibbles[j][1] );

            if (((pmsk >> 5) & 0x1) == 0)
               prod2 = 0;
            else
               prod2 = exts4( srcA_nibbles[i][2] )
                  * exts4( srcB_nibbles[j][2] );

            if (((pmsk >> 4) & 0x1) == 0)
               prod3 = 0;
            else
               prod3 = exts4( srcA_nibbles[i][3] )
                  * exts4( srcB_nibbles[j][3] );

            if (((pmsk >> 3) & 0x1) == 0)
               prod4 = 0;
            else
               prod4 = exts4( srcA_nibbles[i][4] )
                  * exts4( srcB_nibbles[j][4] );

            if (((pmsk >> 2) & 0x1) == 0)
               prod5 = 0;
            else
               prod5 = exts4( srcA_nibbles[i][5] )
                  * exts4( srcB_nibbles[j][5] );

            if (((pmsk >> 1) & 0x1) == 0)
               prod6 = 0;
            else
               prod6 = exts4( srcA_nibbles[i][6] )
                  * exts4( srcB_nibbles[j][6] );

            if ((pmsk & 0x1) == 0)
               prod7 = 0;
            else
               prod7 = exts4( srcA_nibbles[i][7] )
                  * exts4( srcB_nibbles[j][7] );

            /* sum is UInt so the result is chopped to 32-bits */
            sum = prod0 + prod1 + prod2 + prod3 + prod4
               + prod5 + prod6 + prod7;

            if ( inst == XVI4GER8 )
               result[j] = sum;

            else if ( inst == XVI4GER8PP )
               result[j] = sum + acc_word[j];

         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,
                                       UInt offset_ACC,
                                       ULong srcA_hi, ULong srcA_lo,
                                       ULong srcB_hi, ULong srcB_lo,
                                       UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   UInt srcA_bytes[4][4];   /* word, byte */
   UInt srcB_bytes[4][4];   /* word, byte */
   UInt acc_word[4];
   ULong prod0, prod1, prod2, prod3, sum;
   UInt result[4];
   UInt pmsk = 0;
   UInt xmsk = 0;
   UInt ymsk = 0;

   mask = 0xFF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks */
      pmsk = 0b1111;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 26) & 0xF;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address bytes using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      for( j = 0; j < 4; j++) {
         srcA_bytes[3][j] = (srcA_hi >> (56-8*j)) & mask;
         srcA_bytes[2][j] = (srcA_hi >> (24-8*j)) & mask;
         srcA_bytes[1][j] = (srcA_lo >> (56-8*j)) & mask;
         srcA_bytes[0][j] = (srcA_lo >> (24-8*j)) & mask;

         srcB_bytes[3][j] = (srcB_hi >> (56-8*j)) & mask;
         srcB_bytes[2][j] = (srcB_hi >> (24-8*j)) & mask;
         srcB_bytes[1][j] = (srcB_lo >> (56-8*j)) & mask;
         srcB_bytes[0][j] = (srcB_lo >> (24-8*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 3) & 0x1) == 0)
               prod0 = 0;
            else
               prod0 =
                  exts8( srcA_bytes[i][0] )
                  * extz8( srcB_bytes[j][0] );

            if (((pmsk >> 2) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 =
                  exts8( srcA_bytes[i][1] )
                  * extz8( srcB_bytes[j][1] );

            if (((pmsk >> 1) & 0x1) == 0)
               prod2 = 0;
            else
               prod2 =
                  exts8( srcA_bytes[i][2] )
                  * extz8( srcB_bytes[j][2] );

            if (((pmsk >> 0) & 0x1) == 0)
               prod3 = 0;
            else
               prod3 =
                  exts8( srcA_bytes[i][3] )
                  * extz8( srcB_bytes[j][3] );

            /* sum is ULong; the result is reduced to 32-bits by
               chop64to32/clampS64toS32 below */
            sum = prod0 + prod1 + prod2 + prod3;

            if ( inst == XVI8GER4 )
               result[j] = chop64to32( sum );

            else if ( inst == XVI8GER4PP )
               result[j] = chop64to32( sum + acc_word[j] );

            else if ( inst == XVI8GER4SPP )
               result[j] = clampS64toS32( sum + acc_word[j] );

            // @todo PJF Coverity complains that if none of the above ifs
            // are true then result gets used uninitialized
         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,
                                        UInt offset_ACC,
                                        ULong srcA_hi, ULong srcA_lo,
                                        ULong srcB_hi, ULong srcB_lo,
                                        UInt masks_inst )
{
   UInt i, j, mask, inst, acc_entry, prefix_inst;

   ULong sum;
   UInt srcA_word[4][2];   /* word, hword */
   UInt srcB_word[4][2];   /* word, hword */
   UInt acc_word[4];
   ULong prod0, prod1;
   UInt result[4];
   UInt pmsk = 0;
   UInt xmsk = 0;
   UInt ymsk = 0;

   mask = 0xFFFF;
   inst = (masks_inst >> 5) & 0xFF;
   prefix_inst = (masks_inst >> 13) & 0x1;
   acc_entry = masks_inst & 0xF;

   /* LE word numbering */
   if ( prefix_inst == 0 ) {
      /* Set the masks for non-prefix instructions */
      pmsk = 0b11;
      xmsk = 0b1111;
      ymsk = 0b1111;

   } else {
      pmsk = (masks_inst >> 28) & 0x3;
      xmsk = (masks_inst >> 18) & 0xF;
      ymsk = (masks_inst >> 14) & 0xF;
   }

   /* Address half-words using IBM numbering */
   for( i = 0; i < 4; i++) {
      /* Get the ACC contents directly from the PPC64 state */
      get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word);

      for( j = 0; j < 2; j++) {
         srcA_word[3][j] = (srcA_hi >> (48-16*j)) & mask;
         srcA_word[2][j] = (srcA_hi >> (16-16*j)) & mask;
         srcA_word[1][j] = (srcA_lo >> (48-16*j)) & mask;
         srcA_word[0][j] = (srcA_lo >> (16-16*j)) & mask;

         srcB_word[3][j] = (srcB_hi >> (48-16*j)) & mask;
         srcB_word[2][j] = (srcB_hi >> (16-16*j)) & mask;
         srcB_word[1][j] = (srcB_lo >> (48-16*j)) & mask;
         srcB_word[0][j] = (srcB_lo >> (16-16*j)) & mask;
      }

      for( j = 0; j < 4; j++) {
         if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
            if (((pmsk >> 1) & 0x1) == 0)
               prod0 = 0;

            else
               prod0 = exts16to64( srcA_word[i][0] )
                  * exts16to64( srcB_word[j][0] );

            if (((pmsk >> 0) & 0x1) == 0)
               prod1 = 0;
            else
               prod1 = exts16to64( srcA_word[i][1] )
                  * exts16to64( srcB_word[j][1] );

            sum = prod0 + prod1;

            if ( inst == XVI16GER2 )
               result[j] = chop64to32( sum );

            else if ( inst == XVI16GER2S )
               result[j] = clampS64toS32( sum );

            else if ( inst == XVI16GER2PP )
               result[j] = chop64to32( sum + acc_word[j] );

            else if ( inst == XVI16GER2SPP )
               result[j] = clampS64toS32( sum + acc_word[j] );

         } else {
            result[j] = 0;
         }
      }
      write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
   }
}
//matrix 16 float stuff
union convert_t {
   UInt   u32;
   ULong  u64;
   Float  f;
   Double d;
};

static Float reinterpret_int_as_float( UInt input )
{
   /* Reinterpret the bit pattern of an int as a float. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.u32 = input;
   return conv.f;
}

static UInt reinterpret_float_as_int( Float input )
{
   /* Reinterpret the bit pattern of a float as an int. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.f = input;
   return conv.u32;
}

static Double reinterpret_long_as_double( ULong input )
{
   /* Reinterpret the bit pattern of a long as a double. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.u64 = input;
   return conv.d;
}

static ULong reinterpret_double_as_long( Double input )
{
   /* Reinterpret the bit pattern of a double as a long. */
   __attribute__ ((aligned (128))) union convert_t conv;

   conv.d = input;
   return conv.u64;
}
static Double conv_f16_to_double( ULong input )
{
#  if defined (HAS_XSCVHPDP)
   // This all seems to be very alignment sensitive??
   __attribute__ ((aligned (64))) ULong src;
   __attribute__ ((aligned (64))) Double result;
   src = input;
   __asm__ __volatile__ (".machine push;\n" ".machine power9;\n" \
                         "xscvhpdp %x0,%x1 ;\n .machine pop" \
                         : "=wa" (result) : "wa" (src) );
   return result;
#  else
   return 0.0;
#  endif
}
#define BF16_SIGN_MASK        0x8000
#define BF16_EXP_MASK         0x7F80
#define BF16_FRAC_MASK        0x007F
#define BF16_BIAS             127
#define BF16_MAX_UNBIASED_EXP 127
#define BF16_MIN_UNBIASED_EXP -126
#define FLOAT_SIGN_MASK       0x80000000
#define FLOAT_EXP_MASK        0x7F800000
#define FLOAT_FRAC_MASK       0x007FFFFF
#define FLOAT_FRAC_BIT8       0x00008000
#define FLOAT_BIAS            127
1951 static Float conv_bf16_to_float( UInt input )
1953 /* input is 16-bit bfloat.
1954 bias +127, exponent 8-bits, fraction 7-bits
1956 output is 32-bit float.
1957 bias +127, exponent 8-bits, fraction 23-bits
1960 UInt input_exp, input_fraction, unbiased_exp;
1961 UInt output_exp, output_fraction;
1962 UInt sign;
1963 union convert_t conv;
1965 sign = (UInt)(input & BF16_SIGN_MASK);
1966 input_exp = input & BF16_EXP_MASK;
1967 unbiased_exp = (input_exp >> 7) - (UInt)BF16_BIAS;
1968 input_fraction = input & BF16_FRAC_MASK;
1970 if (((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
1971 (input_fraction != 0)) {
1972 /* input is NaN or SNaN, exp all 1's, fraction != 0 */
1973 output_exp = FLOAT_EXP_MASK;
1974 output_fraction = input_fraction;
1976 } else if(((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&
1977 ( input_fraction == 0)) {
1978 /* input is infinity, exp all 1's, fraction = 0 */
1979 output_exp = FLOAT_EXP_MASK;
1980 output_fraction = 0;
1982 } else if((input_exp == 0) && (input_fraction == 0)) {
1983 /* input is zero */
1984 output_exp = 0;
1985 output_fraction = 0;
1987 } else if((input_exp == 0) && (input_fraction != 0)) {
1988 /* input is denormal */
1989 output_fraction = input_fraction;
1990 output_exp = (-(Int)BF16_BIAS + (Int)FLOAT_BIAS ) << 23;
1992 } else {
1993 /* result is normal */
1994 output_exp = (unbiased_exp + FLOAT_BIAS) << 23;
1995 output_fraction = input_fraction;
1998 conv.u32 = sign << (31 - 15) | output_exp | (output_fraction << (23-7));
1999 return conv.f;
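/* Illustrative sketch, not part of the build: bfloat16 is exactly the
   top half of an IEEE binary32, so for representable inputs the
   conversion above produces the same bits as shifting the 16-bit
   value left by 16. A minimal self-check: */
#if 0
static void example_bf16_expand ( void )
{
   UInt bf16 = 0x3F80;   /* bfloat16 encoding of 1.0 */
   Float f = conv_bf16_to_float( bf16 );
   vassert( reinterpret_float_as_int( f ) == (bf16 << 16) );
   vassert( f == 1.0f );
}
#endif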
2002 static UInt conv_float_to_bf16( UInt input )
2004 /* input is 32-bit float stored as unsigned 32-bit.
2005 bias +127, exponent 8-bits, fraction 23-bits
2007 output is 16-bit bfloat.
2008 bias +127, exponent 8-bits, fraction 7-bits
2010 If the unbiased exponent of the input is greater than the maximum
2011 unbiased exponent representable in the 16-bit format, the result
2012 is infinity.
2015 UInt input_exp, input_fraction;
2016 UInt output_exp, output_fraction;
2017 UInt result, sign;
2019 sign = input & FLOAT_SIGN_MASK;
2020 input_exp = input & FLOAT_EXP_MASK;
2021 input_fraction = input & FLOAT_FRAC_MASK;
2023 if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
2024 (input_fraction != 0)) {
2025 /* input is NaN or SNaN, exp all 1's, fraction != 0 */
2026 output_exp = BF16_EXP_MASK;
2027 output_fraction = (ULong)input_fraction >> (23 - 7);
2028 } else if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&
2029 ( input_fraction == 0)) {
2030 /* input is infinity, exp all 1's, fraction = 0 */
2031 output_exp = BF16_EXP_MASK;
2032 output_fraction = 0;
2033 } else if ((input_exp == 0) && (input_fraction == 0)) {
2034 /* input is zero */
2035 output_exp = 0;
2036 output_fraction = 0;
2037 } else if ((input_exp == 0) && (input_fraction != 0)) {
2038 /* input is denormal */
2039 output_exp = 0;
2040 output_fraction = (ULong)input_fraction >> (23 - 7);
2041 } else {
2042 /* result is normal */
2043 output_exp = (input_exp - BF16_BIAS + FLOAT_BIAS) >> (23 - 7);
2044 output_fraction = (ULong)input_fraction >> (23 - 7);
2046 /* Round the result. Look at the 8th bit of the 32-bit floating
2047 point fraction. The bfloat16 fraction is only 7 bits wide, so if
2048 the 8th bit of the F32 fraction is a 1 we need to round up by
2049 adding 1 to the output fraction. */
2050 if ((input_fraction & FLOAT_FRAC_BIT8) == FLOAT_FRAC_BIT8)
2051 /* Round the F16 fraction up by 1 */
2052 output_fraction = output_fraction + 1;
2055 result = sign >> (31 - 15) | output_exp | output_fraction;
2056 return result;
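/* Illustrative sketch, not part of the build: the rounding step above
   rounds half-way cases up rather than to nearest-even. Two assumed
   example encodings: */
#if 0
static void example_float_to_bf16_rounding ( void )
{
   /* 0x3F808000 is 1.00390625; fraction bit 8 (0x8000) is set, so the
      7-bit bfloat16 fraction is rounded up: 0x3F80 -> 0x3F81. */
   vassert( conv_float_to_bf16( 0x3F808000 ) == 0x3F81 );
   /* 0x3F807FFF is just below the rounding threshold; the low fraction
      bits are simply dropped, giving the bfloat16 encoding of 1.0. */
   vassert( conv_float_to_bf16( 0x3F807FFF ) == 0x3F80 );
}
#endif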
2059 static Float conv_double_to_float( Double src )
2061 return (float) src ;
2065 static Double negate_double( Double input )
2067 /* Don't negate a NaN value. A NaN has an exponent
2068 of all 1's and a non-zero fraction. */
2069 __attribute__ ((aligned (128))) union convert_t conv;
2071 conv.d = input;
2073 if ( ( ( conv.u64 & I64_EXP_MASK) == I64_EXP_MASK )
2074 && ( ( conv.u64 & I64_FRACTION_MASK ) != 0 ) )
2075 return input;
2076 else
2077 return -input;
2080 static Float negate_float( Float input )
2083 /* Don't negate a NaN value. A NaN has an exponent
2084 of all 1's and a non-zero fraction. */
2084 __attribute__ ((aligned (128))) union convert_t conv;
2086 conv.f = input;
2088 if ( ( ( conv.u32 & I32_EXP_MASK) == I32_EXP_MASK )
2089 && ( ( conv.u32 & I32_FRACTION_MASK ) != 0 ) )
2090 return input;
2091 else
2092 return -input;
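/* Illustrative sketch, not part of the build: negate_float and
   negate_double flip the sign of ordinary values but pass NaNs
   through unchanged. */
#if 0
static void example_negate_float_nan ( void )
{
   Float nan = reinterpret_int_as_float( 0x7FC00000 );  /* quiet NaN */
   vassert( reinterpret_float_as_int( negate_float( nan ) ) == 0x7FC00000 );
   vassert( negate_float( 2.0f ) == -2.0f );
}
#endif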
2095 /* This C-helper takes a vector of two 32-bit floating point values
2096 * and returns a vector containing two 16-bit bfloats.
2097 input: word0 word1
2098 output: 0x0 hword1 0x0 hword3
2099 Called from generated code.
2101 ULong convert_from_floattobf16_helper( ULong src ) {
2102 ULong resultHi, resultLo;
2104 resultHi = (ULong)conv_float_to_bf16( (UInt)(src >> 32));
2105 resultLo = (ULong)conv_float_to_bf16( (UInt)(src & 0xFFFFFFFF));
2106 return (resultHi << 32) | resultLo;
2110 /* This C-helper takes a vector of two 16-bit bfloat values
2111 * and returns a vector containing two 32-bit floats.
2112 input: 0x0 hword1 0x0 hword3
2113 output: word0 word1
2115 ULong convert_from_bf16tofloat_helper( ULong src ) {
2116 ULong result;
2117 union convert_t conv;
2118 conv.f = conv_bf16_to_float( (UInt)(src >> 32) );
2119 result = (ULong) conv.u32;
2120 conv.f = conv_bf16_to_float( (UInt)(src & 0xFFFFFFFF));
2121 result = (result << 32) | (ULong) conv.u32;
2122 return result;
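/* Illustrative sketch, not part of the build: a round trip through the
   two packed-lane helpers above, using values that are exactly
   representable in bfloat16. */
#if 0
static void example_bf16_pack_round_trip ( void )
{
   /* Pack 1.0f (0x3F800000) and 2.0f (0x40000000) into one doubleword,
      narrow both lanes to bfloat16, then widen them again. */
   ULong two_floats = ((ULong)0x3F800000 << 32) | 0x40000000;
   ULong two_bf16 = convert_from_floattobf16_helper( two_floats );
   vassert( two_bf16 == (((ULong)0x3F80 << 32) | 0x4000) );
   vassert( convert_from_bf16tofloat_helper( two_bf16 ) == two_floats );
}
#endif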
2125 void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
2126 UInt offset_ACC,
2127 ULong srcA_hi, ULong srcA_lo,
2128 ULong srcB_hi, ULong srcB_lo,
2129 UInt masks_inst )
2131 UInt i, j, mask, inst, acc_entry, prefix_inst;
2133 UInt srcA_word[4][2]; /* word, hword */
2134 UInt srcB_word[4][2]; /* word, hword */
2135 Double src10, src11, src20, src21;
2136 UInt acc_word_input[4];
2137 Float acc_word[4];
2138 Double prod;
2139 Double msum;
2140 UInt result[4];
2141 UInt pmsk = 0;
2142 UInt xmsk = 0;
2143 UInt ymsk = 0;
2145 mask = 0xFFFF;
2146 inst = (masks_inst >> 5) & 0xFF;
2147 prefix_inst = (masks_inst >> 13) & 0x1;
2148 acc_entry = masks_inst & 0xF;
2150 if ( prefix_inst == 0 ) {
2151 /* Set the masks for non-prefix instructions */
2152 pmsk = 0b11;
2153 xmsk = 0b1111;
2154 ymsk = 0b1111;
2156 } else {
2157 /* Use mask supplied with prefix inst */
2158 pmsk = (masks_inst >> 28) & 0x3;
2159 xmsk = (masks_inst >> 18) & 0xF;
2160 ymsk = (masks_inst >> 14) & 0xF;
2163 /* Address half-words using IBM numbering */
2164 for( i = 0; i < 4; i++) {
2165 /* Get the ACC contents directly from the PPC64 state */
2166 get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word_input);
2168 acc_word[3] = reinterpret_int_as_float( acc_word_input[3] );
2169 acc_word[2] = reinterpret_int_as_float( acc_word_input[2] );
2170 acc_word[1] = reinterpret_int_as_float( acc_word_input[1] );
2171 acc_word[0] = reinterpret_int_as_float( acc_word_input[0] );
2173 for( j = 0; j < 2; j++) { // input is in double words
2174 srcA_word[3][j] = (UInt)((srcA_hi >> (48-16*j)) & mask);
2175 srcA_word[2][j] = (UInt)((srcA_hi >> (16-16*j)) & mask);
2176 srcA_word[1][j] = (UInt)((srcA_lo >> (48-16*j)) & mask);
2177 srcA_word[0][j] = (UInt)((srcA_lo >> (16-16*j)) & mask);
2179 srcB_word[3][j] = (UInt)((srcB_hi >> (48-16*j)) & mask);
2180 srcB_word[2][j] = (UInt)((srcB_hi >> (16-16*j)) & mask);
2181 srcB_word[1][j] = (UInt)((srcB_lo >> (48-16*j)) & mask);
2182 srcB_word[0][j] = (UInt)((srcB_lo >> (16-16*j)) & mask);
2185 /* Note: the ISA is not consistent in its source naming. We use the
2186 names src10, src11, src20, src21 from the xvf16ger2 descriptions.
2188 for( j = 0; j < 4; j++) {
2189 if (((pmsk >> 1) & 0x1) == 0) {
2190 src10 = 0;
2191 src20 = 0;
2192 } else {
2193 if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
2194 || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
2195 || ( inst == XVF16GER2NN )) {
2196 src10 = conv_f16_to_double((ULong)srcA_word[i][0]);
2197 src20 = conv_f16_to_double((ULong)srcB_word[j][0]);
2198 } else {
2199 /* Input is in bfloat format, result is stored in the
2200 "traditional" 64-bit float format. */
2201 src10 = (double)conv_bf16_to_float((ULong)srcA_word[i][0]);
2202 src20 = (double)conv_bf16_to_float((ULong)srcB_word[j][0]);
2206 if ((pmsk & 0x1) == 0) {
2207 src11 = 0;
2208 src21 = 0;
2209 } else {
2210 if (( inst == XVF16GER2 ) || ( inst == XVF16GER2PP )
2211 || ( inst == XVF16GER2PN ) || ( inst == XVF16GER2NP )
2212 || ( inst == XVF16GER2NN )) {
2213 src11 = conv_f16_to_double((ULong)srcA_word[i][1]);
2214 src21 = conv_f16_to_double((ULong)srcB_word[j][1]);
2215 } else {
2216 /* Input is in bfloat format, result is stored in the
2217 "traditional" 64-bit float format. */
2218 src11 = (double)conv_bf16_to_float((ULong)srcA_word[i][1]);
2219 src21 = (double)conv_bf16_to_float((ULong)srcB_word[j][1]);
2223 prod = src10 * src20;
2224 msum = prod + src11 * src21;
2226 if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
2227 /* Note, we do not track the exception handling bits
2228 ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */
2230 if (( inst == XVF16GER2 ) || ( inst == XVBF16GER2 ) )
2231 result[j] = reinterpret_float_as_int(
2232 conv_double_to_float(msum) );
2234 else if (( inst == XVF16GER2PP ) || (inst == XVBF16GER2PP ))
2235 result[j] = reinterpret_float_as_int(
2236 conv_double_to_float(msum)
2237 + acc_word[j] );
2239 else if (( inst == XVF16GER2PN ) || ( inst == XVBF16GER2PN ))
2240 result[j] = reinterpret_float_as_int(
2241 conv_double_to_float(msum)
2242 + negate_float( acc_word[j] ) );
2244 else if (( inst == XVF16GER2NP ) || ( inst == XVBF16GER2NP ))
2245 result[j] = reinterpret_float_as_int(
2246 conv_double_to_float( negate_double( msum ) )
2247 + acc_word[j] );
2249 else if (( inst == XVF16GER2NN ) || ( inst == XVBF16GER2NN ))
2250 result[j] = reinterpret_float_as_int(
2251 conv_double_to_float( negate_double( msum ) )
2252 + negate_float( acc_word[j] ) );
2253 } else {
2254 result[j] = 0;
2257 write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
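/* Illustrative sketch, not part of the build: the masks_inst argument
   of the 16-bit and 32-bit GER helpers packs several fields, recovered
   by the shift/mask decode at the top of each helper. (The 64-bit
   helper below uses different offsets.) The layout, and a hypothetical
   encoder for the non-prefixed case:

      masks_inst[29:28]  pmsk        (prefixed 16-bit forms only)
      masks_inst[21:18]  xmsk
      masks_inst[17:14]  ymsk
      masks_inst[13]     prefix_inst (1 = prefixed form; masks valid)
      masks_inst[12:5]   inst        (e.g. XVF16GER2)
      masks_inst[3:0]    acc_entry   (which of the 8 ACC entries)     */
#if 0
static UInt example_pack_masks_inst ( UInt inst, UInt acc_entry )
{
   return ((inst & 0xFF) << 5) | (acc_entry & 0xF);
}
#endif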
2261 void vsx_matrix_32bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
2262 UInt offset_ACC,
2263 ULong srcA_hi, ULong srcA_lo,
2264 ULong srcB_hi, ULong srcB_lo,
2265 UInt masks_inst )
2267 UInt i, j, mask, inst, acc_entry, prefix_inst;
2269 Float srcA_word[4];
2270 Float srcB_word[4];
2271 UInt acc_word_input[4];
2272 Float acc_word[4];
2273 UInt result[4];
2274 UInt xmsk = 0;
2275 UInt ymsk = 0;
2276 Float src1, src2, acc;
2278 mask = 0xFFFFFFFF;
2279 inst = (masks_inst >> 5) & 0xFF;
2280 prefix_inst = (masks_inst >> 13) & 0x1;
2281 acc_entry = masks_inst & 0xF;
2283 if ( prefix_inst == 0 ) {
2284 /* Set the masks for non-prefix instructions */
2285 xmsk = 0b1111;
2286 ymsk = 0b1111;
2288 } else {
2289 xmsk = (masks_inst >> 18) & 0xF;
2290 ymsk = (masks_inst >> 14) & 0xF;
2293 srcA_word[3] = reinterpret_int_as_float( (srcA_hi >> 32) & mask );
2294 srcA_word[2] = reinterpret_int_as_float( srcA_hi & mask );
2295 srcA_word[1] = reinterpret_int_as_float( (srcA_lo >> 32) & mask );
2296 srcA_word[0] = reinterpret_int_as_float( srcA_lo & mask );
2298 srcB_word[3] = reinterpret_int_as_float( (srcB_hi >> 32) & mask );
2299 srcB_word[2] = reinterpret_int_as_float( srcB_hi & mask );
2300 srcB_word[1] = reinterpret_int_as_float( (srcB_lo >> 32) & mask );
2301 srcB_word[0] = reinterpret_int_as_float( srcB_lo & mask );
2303 /* Address words using IBM numbering */
2304 for( i = 0; i < 4; i++) {
2305 /* Get the ACC contents directly from the PPC64 state */
2306 get_ACC_entry (gst, offset_ACC, acc_entry, 3-i, acc_word_input);
2308 acc_word[3] = reinterpret_int_as_float( acc_word_input[3] );
2309 acc_word[2] = reinterpret_int_as_float( acc_word_input[2] );
2310 acc_word[1] = reinterpret_int_as_float( acc_word_input[1] );
2311 acc_word[0] = reinterpret_int_as_float( acc_word_input[0] );
2313 for( j = 0; j < 4; j++) {
2315 if ((((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) == 0x1) {
2316 /* Note, we do not track the exception handling bits
2317 ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */
2319 src1 = srcA_word[i];
2320 src2 = srcB_word[j];
2321 acc = acc_word[j];
2323 if ( inst == XVF32GER )
2324 result[j] = reinterpret_float_as_int( src1 * src2 );
2326 else if ( inst == XVF32GERPP )
2327 result[j] = reinterpret_float_as_int( ( src1 * src2 ) + acc );
2329 else if ( inst == XVF32GERPN )
2330 result[j] = reinterpret_float_as_int( ( src1 * src2 )
2331 + negate_float( acc ) );
2333 else if ( inst == XVF32GERNP )
2334 result[j] = reinterpret_float_as_int(
2335 negate_float( src1 * src2 ) + acc );
2337 else if ( inst == XVF32GERNN )
2338 result[j] = reinterpret_float_as_int(
2339 negate_float( src1 * src2 ) + negate_float( acc ) );
2340 } else {
2341 result[j] = 0;
2344 write_ACC_entry (gst, offset_ACC, acc_entry, 3-i, result);
2348 void vsx_matrix_64bit_float_ger_dirty_helper( VexGuestPPC64State* gst,
2349 UInt offset_ACC,
2350 ULong srcX_hi, ULong srcX_lo,
2351 ULong srcY_hi, ULong srcY_lo,
2352 UInt masks_inst )
2354 /* Computes two of the four rows of one ACC entry; start_i selects the pair. */
2355 UInt i, j, inst, acc_entry, prefix_inst;
2357 Double srcX_dword[4];
2358 Double srcY_dword[2];
2359 Double result[2];
2360 UInt result_uint[4];
2361 ULong result_ulong[2];
2362 Double acc_dword[4];
2363 ULong acc_word_ulong[2];
2364 UInt acc_word_input[4];
2365 UInt xmsk = 0;
2366 UInt ymsk = 0;
2367 UInt start_i;
2368 Double src1, src2, acc;
2370 inst = (masks_inst >> 8) & 0xFF;
2371 prefix_inst = (masks_inst >> 16) & 0x1;
2372 start_i = (masks_inst >> 4) & 0xF;
2373 acc_entry = masks_inst & 0xF;
2375 if ( prefix_inst == 0 ) {
2376 /* Set the masks for non-prefix instructions */
2377 xmsk = 0b1111;
2378 ymsk = 0b11;
2380 } else {
2381 xmsk = (masks_inst >> 21) & 0xF;
2382 ymsk = (masks_inst >> 19) & 0x3;
2385 /* Need to store the srcX_dword in the correct index for the following
2386 for loop. */
2387 srcX_dword[1+start_i] = reinterpret_long_as_double( srcX_lo);
2388 srcX_dword[0+start_i] = reinterpret_long_as_double( srcX_hi );
2389 srcY_dword[1] = reinterpret_long_as_double( srcY_lo );
2390 srcY_dword[0] = reinterpret_long_as_double( srcY_hi );
2392 for( i = start_i; i < start_i+2; i++) {
2393 /* Get the ACC contents directly from the PPC64 state */
2394 get_ACC_entry (gst, offset_ACC, acc_entry, 3 - i,
2395 acc_word_input);
2397 acc_word_ulong[1] = acc_word_input[3];
2398 acc_word_ulong[1] = (acc_word_ulong[1] << 32) | acc_word_input[2];
2399 acc_word_ulong[0] = acc_word_input[1];
2400 acc_word_ulong[0] = (acc_word_ulong[0] << 32) | acc_word_input[0];
2401 acc_dword[0] = reinterpret_long_as_double( acc_word_ulong[0] );
2402 acc_dword[1] = reinterpret_long_as_double( acc_word_ulong[1]);
2404 for( j = 0; j < 2; j++) {
2406 if (((xmsk >> i) & 0x1) & ((ymsk >> j) & 0x1)) {
2407 /* Note, we do not track the exception handling bits
2408 ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */
2410 src1 = srcX_dword[i];
2411 src2 = srcY_dword[j];
2412 acc = acc_dword[j];
2414 if ( inst == XVF64GER )
2415 result[j] = src1 * src2;
2417 else if ( inst == XVF64GERPP )
2418 result[j] = ( src1 * src2 ) + acc;
2420 else if ( inst == XVF64GERPN )
2421 result[j] = ( src1 * src2 ) + negate_double( acc );
2423 else if ( inst == XVF64GERNP )
2424 result[j] = negate_double( src1 * src2 ) + acc;
2426 else if ( inst == XVF64GERNN )
2427 result[j] = negate_double( src1 * src2 ) + negate_double( acc );
2429 } else {
2430 result[j] = 0;
2434 /* Split the two double results into four 32-bit unsigned ints in
2435 order to store them to the ACC. */
2436 result_ulong[0] = reinterpret_double_as_long ( result[0] );
2437 result_ulong[1] = reinterpret_double_as_long ( result[1] );
2439 result_uint[0] = result_ulong[0] & 0xFFFFFFFF;
2440 result_uint[1] = (result_ulong[0] >> 32) & 0xFFFFFFFF;
2441 result_uint[2] = result_ulong[1] & 0xFFFFFFFF;
2442 result_uint[3] = (result_ulong[1] >> 32) & 0xFFFFFFFF;
2444 write_ACC_entry (gst, offset_ACC, acc_entry, 3 - i,
2445 result_uint);
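/* Illustrative sketch, not part of the build: a double is stored in
   the ACC as two 32-bit words, low word first, matching the split
   performed at the end of the helper above. */
#if 0
static void example_split_double_for_ACC ( void )
{
   ULong bits = reinterpret_double_as_long( 1.0 ); /* 0x3FF0000000000000 */
   UInt lo = bits & 0xFFFFFFFF;                    /* 0x00000000 */
   UInt hi = (bits >> 32) & 0xFFFFFFFF;            /* 0x3FF00000 */
   vassert( reinterpret_long_as_double( ((ULong)hi << 32) | lo ) == 1.0 );
}
#endif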
2449 /* CALLED FROM GENERATED CODE */
2450 /* DIRTY HELPER: uses inline assembly to execute the darn (deliver a
2451 random number) instruction on the host machine and returns the value
2452 the host produced. The helper does not change the guest state or
2453 guest memory. */
2454 ULong darn_dirty_helper ( UInt L )
2456 ULong val = 0xFFFFFFFFFFFFFFFFULL; /* error */
2458 # if defined (HAS_DARN)
2459 if ( L == 0)
2460 __asm__ __volatile__(".machine push; .machine power9;" \
2461 "darn %0,0; .machine pop;" : "=r"(val));
2462 else if (L == 1)
2463 __asm__ __volatile__(".machine push; .machine power9;" \
2464 "darn %0,1; .machine pop;" : "=r"(val));
2465 else if (L == 2)
2466 __asm__ __volatile__(".machine push; .machine power9;"
2467 "darn %0,2; .machine pop;" : "=r"(val));
2468 # endif
2470 return val;
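/* Usage sketch, not part of the build: per ISA 3.0 the L argument
   selects the darn format: L=0 a conditioned 32-bit value, L=1 a
   conditioned 64-bit value, L=2 a raw 64-bit value. An all-ones
   result signals that the hardware could not deliver a random
   number. */
#if 0
static Bool example_darn_ok ( ULong* out )
{
   *out = darn_dirty_helper( 1 );
   return *out != 0xFFFFFFFFFFFFFFFFULL;
}
#endif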
2473 /*----------------------------------------------*/
2474 /*--- The exported fns .. ---*/
2475 /*----------------------------------------------*/
2477 /* VISIBLE TO LIBVEX CLIENT */
2478 UInt LibVEX_GuestPPC32_get_CR ( /*IN*/const VexGuestPPC32State* vex_state )
2480 # define FIELD(_n) \
2481 ( ( (UInt) \
2482 ( (vex_state->guest_CR##_n##_321 & (7<<1)) \
2483 | (vex_state->guest_CR##_n##_0 & 1) \
2486 << (4 * (7-(_n))) \
2489 return
2490 FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
2491 | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);
2493 # undef FIELD
2497 /* VISIBLE TO LIBVEX CLIENT */
2498 /* Note: %CR is 32 bits even for ppc64 */
2499 UInt LibVEX_GuestPPC64_get_CR ( /*IN*/const VexGuestPPC64State* vex_state )
2501 # define FIELD(_n) \
2502 ( ( (UInt) \
2503 ( (vex_state->guest_CR##_n##_321 & (7<<1)) \
2504 | (vex_state->guest_CR##_n##_0 & 1) \
2507 << (4 * (7-(_n))) \
2510 return
2511 FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
2512 | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);
2514 # undef FIELD
2518 /* VISIBLE TO LIBVEX CLIENT */
2519 void LibVEX_GuestPPC32_put_CR ( UInt cr_native,
2520 /*OUT*/VexGuestPPC32State* vex_state )
2522 UInt t;
2524 # define FIELD(_n) \
2525 do { \
2526 t = cr_native >> (4*(7-(_n))); \
2527 vex_state->guest_CR##_n##_0 = toUChar(t & 1); \
2528 vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
2529 } while (0)
2531 FIELD(0);
2532 FIELD(1);
2533 FIELD(2);
2534 FIELD(3);
2535 FIELD(4);
2536 FIELD(5);
2537 FIELD(6);
2538 FIELD(7);
2540 # undef FIELD
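/* Illustrative sketch, not part of the build: the eight 4-bit CR
   fields pack into the 32-bit CR with CR0 in the top nibble and CR7
   in the bottom nibble, so a put followed by a get is exact. */
#if 0
static void example_CR_round_trip ( VexGuestPPC32State* st )
{
   LibVEX_GuestPPC32_put_CR( 0x80000002, st );
   vassert( st->guest_CR0_0 == 0 );          /* CR0 = 0b1000 (LT set) */
   vassert( st->guest_CR0_321 == (4 << 1) ); /* CR0 bits 3:1 */
   vassert( LibVEX_GuestPPC32_get_CR( st ) == 0x80000002 );
}
#endif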
2544 /* VISIBLE TO LIBVEX CLIENT */
2545 /* Note: %CR is 32 bits even for ppc64 */
2546 void LibVEX_GuestPPC64_put_CR ( UInt cr_native,
2547 /*OUT*/VexGuestPPC64State* vex_state )
2549 UInt t;
2551 # define FIELD(_n) \
2552 do { \
2553 t = cr_native >> (4*(7-(_n))); \
2554 vex_state->guest_CR##_n##_0 = toUChar(t & 1); \
2555 vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
2556 } while (0)
2558 FIELD(0);
2559 FIELD(1);
2560 FIELD(2);
2561 FIELD(3);
2562 FIELD(4);
2563 FIELD(5);
2564 FIELD(6);
2565 FIELD(7);
2567 # undef FIELD
2571 /* VISIBLE TO LIBVEX CLIENT */
2572 UInt LibVEX_GuestPPC32_get_XER ( /*IN*/const VexGuestPPC32State* vex_state )
2574 UInt w = 0;
2575 w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
2576 w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
2577 w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
2578 w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
2579 w |= ( (((UInt)vex_state->guest_XER_OV32) & 0x1) << 19 );
2580 w |= ( (((UInt)vex_state->guest_XER_CA32) & 0x1) << 18 );
2581 return w;
2585 /* VISIBLE TO LIBVEX CLIENT */
2586 /* Note: %XER is 32 bits even for ppc64 */
2587 UInt LibVEX_GuestPPC64_get_XER ( /*IN*/const VexGuestPPC64State* vex_state )
2589 UInt w = 0;
2590 w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
2591 w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
2592 w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
2593 w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
2594 w |= ( (((UInt)vex_state->guest_XER_OV32) & 0x1) << 19 );
2595 w |= ( (((UInt)vex_state->guest_XER_CA32) & 0x1) << 18 );
2596 return w;
2600 /* VISIBLE TO LIBVEX CLIENT */
2601 void LibVEX_GuestPPC32_put_XER ( UInt xer_native,
2602 /*OUT*/VexGuestPPC32State* vex_state )
2604 vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
2605 vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
2606 vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
2607 vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
2608 vex_state->guest_XER_OV32 = toUChar((xer_native >> 19) & 0x1);
2609 vex_state->guest_XER_CA32 = toUChar((xer_native >> 18) & 0x1);
2612 /* VISIBLE TO LIBVEX CLIENT */
2613 /* Note: %XER is 32 bits even for ppc64 */
2614 void LibVEX_GuestPPC64_put_XER ( UInt xer_native,
2615 /*OUT*/VexGuestPPC64State* vex_state )
2617 vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
2618 vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
2619 vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
2620 vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
2621 vex_state->guest_XER_OV32 = toUChar((xer_native >> 19) & 0x1);
2622 vex_state->guest_XER_CA32 = toUChar((xer_native >> 18) & 0x1);
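/* Illustrative sketch, not part of the build: the packed 32-bit XER
   layout used by the get/put pairs above is SO=bit 31, OV=bit 30,
   CA=bit 29, OV32=bit 19, CA32=bit 18 and the byte count in bits
   7:0. */
#if 0
static void example_XER_round_trip ( VexGuestPPC64State* st )
{
   UInt xer = (1u << 31) | (1u << 29) | 0x10;  /* SO, CA, BC=16 */
   LibVEX_GuestPPC64_put_XER( xer, st );
   vassert( st->guest_XER_SO == 1 );
   vassert( st->guest_XER_CA == 1 );
   vassert( st->guest_XER_BC == 0x10 );
   vassert( LibVEX_GuestPPC64_get_XER( st ) == xer );
}
#endif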
2625 /* VISIBLE TO LIBVEX CLIENT */
2626 void LibVEX_GuestPPC32_initialise ( /*OUT*/VexGuestPPC32State* vex_state )
2628 Int i;
2629 vex_state->host_EvC_FAILADDR = 0;
2630 vex_state->host_EvC_COUNTER = 0;
2631 vex_state->pad3 = 0;
2632 vex_state->pad4 = 0;
2634 vex_state->guest_GPR0 = 0;
2635 vex_state->guest_GPR1 = 0;
2636 vex_state->guest_GPR2 = 0;
2637 vex_state->guest_GPR3 = 0;
2638 vex_state->guest_GPR4 = 0;
2639 vex_state->guest_GPR5 = 0;
2640 vex_state->guest_GPR6 = 0;
2641 vex_state->guest_GPR7 = 0;
2642 vex_state->guest_GPR8 = 0;
2643 vex_state->guest_GPR9 = 0;
2644 vex_state->guest_GPR10 = 0;
2645 vex_state->guest_GPR11 = 0;
2646 vex_state->guest_GPR12 = 0;
2647 vex_state->guest_GPR13 = 0;
2648 vex_state->guest_GPR14 = 0;
2649 vex_state->guest_GPR15 = 0;
2650 vex_state->guest_GPR16 = 0;
2651 vex_state->guest_GPR17 = 0;
2652 vex_state->guest_GPR18 = 0;
2653 vex_state->guest_GPR19 = 0;
2654 vex_state->guest_GPR20 = 0;
2655 vex_state->guest_GPR21 = 0;
2656 vex_state->guest_GPR22 = 0;
2657 vex_state->guest_GPR23 = 0;
2658 vex_state->guest_GPR24 = 0;
2659 vex_state->guest_GPR25 = 0;
2660 vex_state->guest_GPR26 = 0;
2661 vex_state->guest_GPR27 = 0;
2662 vex_state->guest_GPR28 = 0;
2663 vex_state->guest_GPR29 = 0;
2664 vex_state->guest_GPR30 = 0;
2665 vex_state->guest_GPR31 = 0;
2667 /* Initialise the vector state. */
2668 # define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;
2670 VECZERO(vex_state->guest_VSR0 );
2671 VECZERO(vex_state->guest_VSR1 );
2672 VECZERO(vex_state->guest_VSR2 );
2673 VECZERO(vex_state->guest_VSR3 );
2674 VECZERO(vex_state->guest_VSR4 );
2675 VECZERO(vex_state->guest_VSR5 );
2676 VECZERO(vex_state->guest_VSR6 );
2677 VECZERO(vex_state->guest_VSR7 );
2678 VECZERO(vex_state->guest_VSR8 );
2679 VECZERO(vex_state->guest_VSR9 );
2680 VECZERO(vex_state->guest_VSR10);
2681 VECZERO(vex_state->guest_VSR11);
2682 VECZERO(vex_state->guest_VSR12);
2683 VECZERO(vex_state->guest_VSR13);
2684 VECZERO(vex_state->guest_VSR14);
2685 VECZERO(vex_state->guest_VSR15);
2686 VECZERO(vex_state->guest_VSR16);
2687 VECZERO(vex_state->guest_VSR17);
2688 VECZERO(vex_state->guest_VSR18);
2689 VECZERO(vex_state->guest_VSR19);
2690 VECZERO(vex_state->guest_VSR20);
2691 VECZERO(vex_state->guest_VSR21);
2692 VECZERO(vex_state->guest_VSR22);
2693 VECZERO(vex_state->guest_VSR23);
2694 VECZERO(vex_state->guest_VSR24);
2695 VECZERO(vex_state->guest_VSR25);
2696 VECZERO(vex_state->guest_VSR26);
2697 VECZERO(vex_state->guest_VSR27);
2698 VECZERO(vex_state->guest_VSR28);
2699 VECZERO(vex_state->guest_VSR29);
2700 VECZERO(vex_state->guest_VSR30);
2701 VECZERO(vex_state->guest_VSR31);
2702 VECZERO(vex_state->guest_VSR32);
2703 VECZERO(vex_state->guest_VSR33);
2704 VECZERO(vex_state->guest_VSR34);
2705 VECZERO(vex_state->guest_VSR35);
2706 VECZERO(vex_state->guest_VSR36);
2707 VECZERO(vex_state->guest_VSR37);
2708 VECZERO(vex_state->guest_VSR38);
2709 VECZERO(vex_state->guest_VSR39);
2710 VECZERO(vex_state->guest_VSR40);
2711 VECZERO(vex_state->guest_VSR41);
2712 VECZERO(vex_state->guest_VSR42);
2713 VECZERO(vex_state->guest_VSR43);
2714 VECZERO(vex_state->guest_VSR44);
2715 VECZERO(vex_state->guest_VSR45);
2716 VECZERO(vex_state->guest_VSR46);
2717 VECZERO(vex_state->guest_VSR47);
2718 VECZERO(vex_state->guest_VSR48);
2719 VECZERO(vex_state->guest_VSR49);
2720 VECZERO(vex_state->guest_VSR50);
2721 VECZERO(vex_state->guest_VSR51);
2722 VECZERO(vex_state->guest_VSR52);
2723 VECZERO(vex_state->guest_VSR53);
2724 VECZERO(vex_state->guest_VSR54);
2725 VECZERO(vex_state->guest_VSR55);
2726 VECZERO(vex_state->guest_VSR56);
2727 VECZERO(vex_state->guest_VSR57);
2728 VECZERO(vex_state->guest_VSR58);
2729 VECZERO(vex_state->guest_VSR59);
2730 VECZERO(vex_state->guest_VSR60);
2731 VECZERO(vex_state->guest_VSR61);
2732 VECZERO(vex_state->guest_VSR62);
2733 VECZERO(vex_state->guest_VSR63);
2735 VECZERO( vex_state->guest_ACC_0_r0 );
2736 VECZERO( vex_state->guest_ACC_0_r1 );
2737 VECZERO( vex_state->guest_ACC_0_r2 );
2738 VECZERO( vex_state->guest_ACC_0_r3 );
2739 VECZERO( vex_state->guest_ACC_1_r0 );
2740 VECZERO( vex_state->guest_ACC_1_r1 );
2741 VECZERO( vex_state->guest_ACC_1_r2 );
2742 VECZERO( vex_state->guest_ACC_1_r3 );
2743 VECZERO( vex_state->guest_ACC_2_r0 );
2744 VECZERO( vex_state->guest_ACC_2_r1 );
2745 VECZERO( vex_state->guest_ACC_2_r2 );
2746 VECZERO( vex_state->guest_ACC_2_r3 );
2747 VECZERO( vex_state->guest_ACC_3_r0 );
2748 VECZERO( vex_state->guest_ACC_3_r1 );
2749 VECZERO( vex_state->guest_ACC_3_r2 );
2750 VECZERO( vex_state->guest_ACC_3_r3 );
2751 VECZERO( vex_state->guest_ACC_4_r0 );
2752 VECZERO( vex_state->guest_ACC_4_r1 );
2753 VECZERO( vex_state->guest_ACC_4_r2 );
2754 VECZERO( vex_state->guest_ACC_4_r3 );
2755 VECZERO( vex_state->guest_ACC_5_r0 );
2756 VECZERO( vex_state->guest_ACC_5_r1 );
2757 VECZERO( vex_state->guest_ACC_5_r2 );
2758 VECZERO( vex_state->guest_ACC_5_r3 );
2759 VECZERO( vex_state->guest_ACC_6_r0 );
2760 VECZERO( vex_state->guest_ACC_6_r1 );
2761 VECZERO( vex_state->guest_ACC_6_r2 );
2762 VECZERO( vex_state->guest_ACC_6_r3 );
2763 VECZERO( vex_state->guest_ACC_7_r0 );
2764 VECZERO( vex_state->guest_ACC_7_r1 );
2765 VECZERO( vex_state->guest_ACC_7_r2 );
2766 VECZERO( vex_state->guest_ACC_7_r3 );
2768 # undef VECZERO
2770 vex_state->guest_CIA = 0;
2771 vex_state->guest_LR = 0;
2772 vex_state->guest_CTR = 0;
2774 vex_state->guest_XER_SO = 0;
2775 vex_state->guest_XER_OV = 0;
2776 vex_state->guest_XER_CA = 0;
2777 vex_state->guest_XER_BC = 0;
2779 vex_state->guest_XER_OV32 = 0;
2780 vex_state->guest_XER_CA32 = 0;
2782 vex_state->guest_CR0_321 = 0;
2783 vex_state->guest_CR0_0 = 0;
2784 vex_state->guest_CR1_321 = 0;
2785 vex_state->guest_CR1_0 = 0;
2786 vex_state->guest_CR2_321 = 0;
2787 vex_state->guest_CR2_0 = 0;
2788 vex_state->guest_CR3_321 = 0;
2789 vex_state->guest_CR3_0 = 0;
2790 vex_state->guest_CR4_321 = 0;
2791 vex_state->guest_CR4_0 = 0;
2792 vex_state->guest_CR5_321 = 0;
2793 vex_state->guest_CR5_0 = 0;
2794 vex_state->guest_CR6_321 = 0;
2795 vex_state->guest_CR6_0 = 0;
2796 vex_state->guest_CR7_321 = 0;
2797 vex_state->guest_CR7_0 = 0;
2799 vex_state->guest_FPROUND = PPCrm_NEAREST;
2800 vex_state->guest_DFPROUND = PPCrm_NEAREST;
2801 vex_state->guest_C_FPCC = 0;
2802 vex_state->pad2 = 0;
2804 vex_state->guest_VRSAVE = 0;
2806 # if defined(VGP_ppc64be_linux)
2807 /* By default, the HW for BE sets the VSCR[NJ] bit to 1.
2808 VSR is a 128-bit register, NJ bit is bit 111 (IBM numbering).
2809 However, VSCR is modeled as a 64-bit register. */
2810 vex_state->guest_VSCR = 0x1 << (127 - 111);
2811 # else
2812 /* LE API requires NJ be set to 0. */
2813 vex_state->guest_VSCR = 0x0;
2814 #endif
2816 vex_state->guest_EMNOTE = EmNote_NONE;
2818 vex_state->guest_CMSTART = 0;
2819 vex_state->guest_CMLEN = 0;
2821 vex_state->guest_NRADDR = 0;
2822 vex_state->guest_NRADDR_GPR2 = 0;
2824 vex_state->guest_REDIR_SP = -1;
2825 for (i = 0; i < VEX_GUEST_PPC32_REDIR_STACK_SIZE; i++)
2826 vex_state->guest_REDIR_STACK[i] = 0;
2828 vex_state->guest_IP_AT_SYSCALL = 0;
2829 vex_state->guest_SPRG3_RO = 0;
2830 vex_state->guest_PPR = 0x4ULL << 50; // medium priority
2831 vex_state->guest_PSPB = 0x100; // an arbitrary non-zero value to start with
2833 vex_state->padding1 = 0;
2834 /* vex_state->padding2 = 0; currently not used */
2838 /* VISIBLE TO LIBVEX CLIENT */
2839 void LibVEX_GuestPPC64_initialise ( /*OUT*/VexGuestPPC64State* vex_state )
2841 Int i;
2842 vex_state->host_EvC_FAILADDR = 0;
2843 vex_state->host_EvC_COUNTER = 0;
2844 vex_state->pad0 = 0;
2845 vex_state->guest_GPR0 = 0;
2846 vex_state->guest_GPR1 = 0;
2847 vex_state->guest_GPR2 = 0;
2848 vex_state->guest_GPR3 = 0;
2849 vex_state->guest_GPR4 = 0;
2850 vex_state->guest_GPR5 = 0;
2851 vex_state->guest_GPR6 = 0;
2852 vex_state->guest_GPR7 = 0;
2853 vex_state->guest_GPR8 = 0;
2854 vex_state->guest_GPR9 = 0;
2855 vex_state->guest_GPR10 = 0;
2856 vex_state->guest_GPR11 = 0;
2857 vex_state->guest_GPR12 = 0;
2858 vex_state->guest_GPR13 = 0;
2859 vex_state->guest_GPR14 = 0;
2860 vex_state->guest_GPR15 = 0;
2861 vex_state->guest_GPR16 = 0;
2862 vex_state->guest_GPR17 = 0;
2863 vex_state->guest_GPR18 = 0;
2864 vex_state->guest_GPR19 = 0;
2865 vex_state->guest_GPR20 = 0;
2866 vex_state->guest_GPR21 = 0;
2867 vex_state->guest_GPR22 = 0;
2868 vex_state->guest_GPR23 = 0;
2869 vex_state->guest_GPR24 = 0;
2870 vex_state->guest_GPR25 = 0;
2871 vex_state->guest_GPR26 = 0;
2872 vex_state->guest_GPR27 = 0;
2873 vex_state->guest_GPR28 = 0;
2874 vex_state->guest_GPR29 = 0;
2875 vex_state->guest_GPR30 = 0;
2876 vex_state->guest_GPR31 = 0;
2878 /* Initialise the vector state. */
2879 # define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;
2881 VECZERO(vex_state->guest_VSR0 );
2882 VECZERO(vex_state->guest_VSR1 );
2883 VECZERO(vex_state->guest_VSR2 );
2884 VECZERO(vex_state->guest_VSR3 );
2885 VECZERO(vex_state->guest_VSR4 );
2886 VECZERO(vex_state->guest_VSR5 );
2887 VECZERO(vex_state->guest_VSR6 );
2888 VECZERO(vex_state->guest_VSR7 );
2889 VECZERO(vex_state->guest_VSR8 );
2890 VECZERO(vex_state->guest_VSR9 );
2891 VECZERO(vex_state->guest_VSR10);
2892 VECZERO(vex_state->guest_VSR11);
2893 VECZERO(vex_state->guest_VSR12);
2894 VECZERO(vex_state->guest_VSR13);
2895 VECZERO(vex_state->guest_VSR14);
2896 VECZERO(vex_state->guest_VSR15);
2897 VECZERO(vex_state->guest_VSR16);
2898 VECZERO(vex_state->guest_VSR17);
2899 VECZERO(vex_state->guest_VSR18);
2900 VECZERO(vex_state->guest_VSR19);
2901 VECZERO(vex_state->guest_VSR20);
2902 VECZERO(vex_state->guest_VSR21);
2903 VECZERO(vex_state->guest_VSR22);
2904 VECZERO(vex_state->guest_VSR23);
2905 VECZERO(vex_state->guest_VSR24);
2906 VECZERO(vex_state->guest_VSR25);
2907 VECZERO(vex_state->guest_VSR26);
2908 VECZERO(vex_state->guest_VSR27);
2909 VECZERO(vex_state->guest_VSR28);
2910 VECZERO(vex_state->guest_VSR29);
2911 VECZERO(vex_state->guest_VSR30);
2912 VECZERO(vex_state->guest_VSR31);
2913 VECZERO(vex_state->guest_VSR32);
2914 VECZERO(vex_state->guest_VSR33);
2915 VECZERO(vex_state->guest_VSR34);
2916 VECZERO(vex_state->guest_VSR35);
2917 VECZERO(vex_state->guest_VSR36);
2918 VECZERO(vex_state->guest_VSR37);
2919 VECZERO(vex_state->guest_VSR38);
2920 VECZERO(vex_state->guest_VSR39);
2921 VECZERO(vex_state->guest_VSR40);
2922 VECZERO(vex_state->guest_VSR41);
2923 VECZERO(vex_state->guest_VSR42);
2924 VECZERO(vex_state->guest_VSR43);
2925 VECZERO(vex_state->guest_VSR44);
2926 VECZERO(vex_state->guest_VSR45);
2927 VECZERO(vex_state->guest_VSR46);
2928 VECZERO(vex_state->guest_VSR47);
2929 VECZERO(vex_state->guest_VSR48);
2930 VECZERO(vex_state->guest_VSR49);
2931 VECZERO(vex_state->guest_VSR50);
2932 VECZERO(vex_state->guest_VSR51);
2933 VECZERO(vex_state->guest_VSR52);
2934 VECZERO(vex_state->guest_VSR53);
2935 VECZERO(vex_state->guest_VSR54);
2936 VECZERO(vex_state->guest_VSR55);
2937 VECZERO(vex_state->guest_VSR56);
2938 VECZERO(vex_state->guest_VSR57);
2939 VECZERO(vex_state->guest_VSR58);
2940 VECZERO(vex_state->guest_VSR59);
2941 VECZERO(vex_state->guest_VSR60);
2942 VECZERO(vex_state->guest_VSR61);
2943 VECZERO(vex_state->guest_VSR62);
2944 VECZERO(vex_state->guest_VSR63);
2946 # undef VECZERO
2948 vex_state->guest_CIA = 0;
2949 vex_state->guest_LR = 0;
2950 vex_state->guest_CTR = 0;
2952 vex_state->guest_XER_SO = 0;
2953 vex_state->guest_XER_OV = 0;
2954 vex_state->guest_XER_CA = 0;
2955 vex_state->guest_XER_BC = 0;
2957 vex_state->guest_CR0_321 = 0;
2958 vex_state->guest_CR0_0 = 0;
2959 vex_state->guest_CR1_321 = 0;
2960 vex_state->guest_CR1_0 = 0;
2961 vex_state->guest_CR2_321 = 0;
2962 vex_state->guest_CR2_0 = 0;
2963 vex_state->guest_CR3_321 = 0;
2964 vex_state->guest_CR3_0 = 0;
2965 vex_state->guest_CR4_321 = 0;
2966 vex_state->guest_CR4_0 = 0;
2967 vex_state->guest_CR5_321 = 0;
2968 vex_state->guest_CR5_0 = 0;
2969 vex_state->guest_CR6_321 = 0;
2970 vex_state->guest_CR6_0 = 0;
2971 vex_state->guest_CR7_321 = 0;
2972 vex_state->guest_CR7_0 = 0;
2974 vex_state->guest_FPROUND = PPCrm_NEAREST;
2975 vex_state->guest_DFPROUND = PPCrm_NEAREST;
2976 vex_state->guest_C_FPCC = 0;
2977 vex_state->pad2 = 0;
2979 vex_state->guest_VRSAVE = 0;
2981 # if defined(VGP_ppc64be_linux)
2982 /* By default, the HW for BE sets the VSCR[NJ] bit to 1.
2983 VSR is a 128-bit register, NJ bit is bit 111 (IBM numbering).
2984 However, VSCR is modeled as a 64-bit register. */
2985 vex_state->guest_VSCR = 0x1 << (127 - 111);
2986 # else
2987 /* LE API requires NJ be set to 0. */
2988 vex_state->guest_VSCR = 0x0;
2989 #endif
2991 vex_state->guest_EMNOTE = EmNote_NONE;
2993 vex_state->padding = 0;
2995 vex_state->guest_CMSTART = 0;
2996 vex_state->guest_CMLEN = 0;
2998 vex_state->guest_NRADDR = 0;
2999 vex_state->guest_NRADDR_GPR2 = 0;
3001 vex_state->guest_REDIR_SP = -1;
3002 for (i = 0; i < VEX_GUEST_PPC64_REDIR_STACK_SIZE; i++)
3003 vex_state->guest_REDIR_STACK[i] = 0;
3005 vex_state->guest_IP_AT_SYSCALL = 0;
3006 vex_state->guest_SPRG3_RO = 0;
3007 vex_state->guest_TFHAR = 0;
3008 vex_state->guest_TFIAR = 0;
3009 vex_state->guest_TEXASR = 0;
3010 vex_state->guest_PPR = 0x4ULL << 50; // medium priority
3011 vex_state->guest_PSPB = 0x100; // an arbitrary non-zero value to start with
3012 vex_state->guest_DSCR = 0;
3017 /*-----------------------------------------------------------*/
3018 /*--- Describing the ppc guest state, for the benefit ---*/
3019 /*--- of iropt and instrumenters. ---*/
3020 /*-----------------------------------------------------------*/
3022 /* Figure out if any part of the guest state contained in minoff
3023 .. maxoff requires precise memory exceptions. If in doubt return
3024 True (but this generates significantly slower code).
3026 By default we enforce precise exns for guest R1 (stack pointer),
3027 CIA (current insn address) and LR (link register). These are the
3028 minimum needed to extract correct stack backtraces from ppc
3029 code. [[NB: not sure if keeping LR up to date is actually
3030 necessary.]]
3032 Only R1 is needed in mode VexRegUpdSpAtMemAccess.
3034 Bool guest_ppc32_state_requires_precise_mem_exns (
3035 Int minoff, Int maxoff, VexRegisterUpdates pxControl
3038 Int lr_min = offsetof(VexGuestPPC32State, guest_LR);
3039 Int lr_max = lr_min + 4 - 1;
3040 Int r1_min = offsetof(VexGuestPPC32State, guest_GPR1);
3041 Int r1_max = r1_min + 4 - 1;
3042 Int cia_min = offsetof(VexGuestPPC32State, guest_CIA);
3043 Int cia_max = cia_min + 4 - 1;
3045 if (maxoff < r1_min || minoff > r1_max) {
3046 /* no overlap with R1 */
3047 if (pxControl == VexRegUpdSpAtMemAccess)
3048 return False; // We only need to check stack pointer.
3049 } else {
3050 return True;
3053 if (maxoff < lr_min || minoff > lr_max) {
3054 /* no overlap with LR */
3055 } else {
3056 return True;
3059 if (maxoff < cia_min || minoff > cia_max) {
3060 /* no overlap with CIA */
3061 } else {
3062 return True;
3065 return False;
3068 Bool guest_ppc64_state_requires_precise_mem_exns (
3069 Int minoff, Int maxoff, VexRegisterUpdates pxControl
3072 /* Given that R2 is a Big Deal in the ELF ppc64 ABI, it seems
3073 prudent to be conservative with it, even though thus far there
3074 is no evidence to suggest that it actually needs to be kept up
3075 to date wrt possible exceptions. */
3076 Int lr_min = offsetof(VexGuestPPC64State, guest_LR);
3077 Int lr_max = lr_min + 8 - 1;
3078 Int r1_min = offsetof(VexGuestPPC64State, guest_GPR1);
3079 Int r1_max = r1_min + 8 - 1;
3080 Int r2_min = offsetof(VexGuestPPC64State, guest_GPR2);
3081 Int r2_max = r2_min + 8 - 1;
3082 Int cia_min = offsetof(VexGuestPPC64State, guest_CIA);
3083 Int cia_max = cia_min + 8 - 1;
3085 if (maxoff < r1_min || minoff > r1_max) {
3086 /* no overlap with R1 */
3087 if (pxControl == VexRegUpdSpAtMemAccess)
3088 return False; // We only need to check stack pointer.
3089 } else {
3090 return True;
3093 if (maxoff < lr_min || minoff > lr_max) {
3094 /* no overlap with LR */
3095 } else {
3096 return True;
3099 if (maxoff < r2_min || minoff > r2_max) {
3100 /* no overlap with R2 */
3101 } else {
3102 return True;
3105 if (maxoff < cia_min || minoff > cia_max) {
3106 /* no overlap with CIA */
3107 } else {
3108 return True;
3111 return False;
3115 #define ALWAYSDEFD32(field) \
3116 { offsetof(VexGuestPPC32State, field), \
3117 (sizeof ((VexGuestPPC32State*)0)->field) }
3119 VexGuestLayout
3120 ppc32Guest_layout
3121 = {
3122 /* Total size of the guest state, in bytes. */
3123 .total_sizeB = sizeof(VexGuestPPC32State),
3125 /* Describe the stack pointer. */
3126 .offset_SP = offsetof(VexGuestPPC32State,guest_GPR1),
3127 .sizeof_SP = 4,
3129 /* Describe the frame pointer. */
3130 .offset_FP = offsetof(VexGuestPPC32State,guest_GPR1),
3131 .sizeof_FP = 4,
3133 /* Describe the instruction pointer. */
3134 .offset_IP = offsetof(VexGuestPPC32State,guest_CIA),
3135 .sizeof_IP = 4,
3137 /* Describe any sections to be regarded by Memcheck as
3138 'always-defined'. */
3139 .n_alwaysDefd = 12,
3141 .alwaysDefd
3142 = { /* 0 */ ALWAYSDEFD32(guest_CIA),
3143 /* 1 */ ALWAYSDEFD32(guest_EMNOTE),
3144 /* 2 */ ALWAYSDEFD32(guest_CMSTART),
3145 /* 3 */ ALWAYSDEFD32(guest_CMLEN),
3146 /* 4 */ ALWAYSDEFD32(guest_VSCR),
3147 /* 5 */ ALWAYSDEFD32(guest_FPROUND),
3148 /* 6 */ ALWAYSDEFD32(guest_NRADDR),
3149 /* 7 */ ALWAYSDEFD32(guest_NRADDR_GPR2),
3150 /* 8 */ ALWAYSDEFD32(guest_REDIR_SP),
3151 /* 9 */ ALWAYSDEFD32(guest_REDIR_STACK),
3152 /* 10 */ ALWAYSDEFD32(guest_IP_AT_SYSCALL),
3153 /* 11 */ ALWAYSDEFD32(guest_C_FPCC)
3157 #define ALWAYSDEFD64(field) \
3158 { offsetof(VexGuestPPC64State, field), \
3159 (sizeof ((VexGuestPPC64State*)0)->field) }
3161 VexGuestLayout
3162 ppc64Guest_layout
3163 = {
3164 /* Total size of the guest state, in bytes. */
3165 .total_sizeB = sizeof(VexGuestPPC64State),
3167 /* Describe the stack pointer. */
3168 .offset_SP = offsetof(VexGuestPPC64State,guest_GPR1),
3169 .sizeof_SP = 8,
3171 /* Describe the frame pointer. */
3172 .offset_FP = offsetof(VexGuestPPC64State,guest_GPR1),
3173 .sizeof_FP = 8,
3175 /* Describe the instruction pointer. */
3176 .offset_IP = offsetof(VexGuestPPC64State,guest_CIA),
3177 .sizeof_IP = 8,
3179 /* Describe any sections to be regarded by Memcheck as
3180 'always-defined'. */
3181 .n_alwaysDefd = 12,
3183 .alwaysDefd
3184 = { /* 0 */ ALWAYSDEFD64(guest_CIA),
3185 /* 1 */ ALWAYSDEFD64(guest_EMNOTE),
3186 /* 2 */ ALWAYSDEFD64(guest_CMSTART),
3187 /* 3 */ ALWAYSDEFD64(guest_CMLEN),
3188 /* 4 */ ALWAYSDEFD64(guest_VSCR),
3189 /* 5 */ ALWAYSDEFD64(guest_FPROUND),
3190 /* 6 */ ALWAYSDEFD64(guest_NRADDR),
3191 /* 7 */ ALWAYSDEFD64(guest_NRADDR_GPR2),
3192 /* 8 */ ALWAYSDEFD64(guest_REDIR_SP),
3193 /* 9 */ ALWAYSDEFD64(guest_REDIR_STACK),
3194 /* 10 */ ALWAYSDEFD64(guest_IP_AT_SYSCALL),
3195 /* 11 */ ALWAYSDEFD64(guest_C_FPCC)
3199 UInt copy_paste_abort_dirty_helper(UInt addr, UInt op) {
3200 # if defined(__powerpc__) && defined(HAS_ISA_3_00)
3201 /* The copy, paste. and cpabort instructions were introduced in ISA 3.0. */
3202 ULong ret;
3203 UInt cr;
3205 if (op == COPY_INST)
3206 __asm__ __volatile__ (".machine push;\n"
3207 ".machine power9;\n"
3208 "copy 0,%0;\n"
3209 ".machine pop" :: "r" (addr));
3211 else if (op == PASTE_INST)
3212 __asm__ __volatile__ (".machine push;\n"
3213 ".machine power9;\n"
3214 "paste. 0,%0\n"
3215 ".machine pop" :: "r" (addr));
3217 else if (op == CPABORT_INST)
3218 __asm__ __volatile__ (".machine push;\n"
3219 ".machine power9;\n"
3220 "cpabort\n"
3221 ".machine pop");
3223 else
3224 /* Unknown operation */
3225 vassert(0);
3227 /* Return the CR0 value. Contains status for the paste instruction. */
3228 __asm__ __volatile__ ("mfocrf %0,128" : "=r" (cr));
3229 __asm__ __volatile__ ("srawi %0,%1,28" : "=r" (ret) : "r" (cr));
3230 /* Make sure the upper bits of the return value are zero per the hack
3231 described in function dis_copy_paste(). */
3232 return 0xFF & ret;
3233 # else
3234 return 0;
3235 # endif
3238 /*---------------------------------------------------------------*/
3239 /*--- end guest_ppc_helpers.c ---*/
3240 /*---------------------------------------------------------------*/