/*---------------------------------------------------------------*/
/*--- begin                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"

/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
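
/* Illustrative sketch (not in the original source): mk32x2 and the
   sel32x2_1/sel32x2_0 selectors are exact inverses of each other, e.g.
   mk32x2(0xDEADBEEF, 0x01234567) == 0xDEADBEEF01234567ULL, with
   sel32x2_1 and sel32x2_0 recovering 0xDEADBEEF and 0x01234567
   respectively.  The 16x4 and 8x8 helpers below follow the same
   round-trip pattern. */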

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
   Char zeroingMask = (Char)ix;
   zeroingMask ^= 0x80;
   zeroingMask >>= 7;
   ix &= 7;
   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
}

/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
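
/* Saturation sketch (illustrative, derived from the helpers above):
   each qadd helper widens, adds, then clamps to the lane's range
   rather than wrapping.  For example qadd16S(30000, 10000) == 32767
   (the signed 16-bit maximum) and qadd8U(200, 100) == 255 (the
   unsigned 8-bit maximum). */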

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>= 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>= 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}
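
/* Usage sketch (illustrative): each of these wrappers splits both
   operands into lanes, applies the scalar operation per lane, and
   repacks the result.  For example
   h_generic_calc_Add32x2(0x0000000100000002ULL, 0x0000000300000004ULL)
   == 0x0000000400000006ULL; any carry out of a lane is discarded. */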

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8( sel8x8_7(aa), sel8x8_7(bb),
                 sel8x8_6(aa), sel8x8_6(bb),
                 sel8x8_5(aa), sel8x8_5(bb),
                 sel8x8_4(aa), sel8x8_4(bb) );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8( sel8x8_3(aa), sel8x8_3(bb),
                 sel8x8_2(aa), sel8x8_2(bb),
                 sel8x8_1(aa), sel8x8_1(bb),
                 sel8x8_0(aa), sel8x8_0(bb) );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_3(aa), sel16x4_3(bb),
                  sel16x4_2(aa), sel16x4_2(bb) );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_1(aa), sel16x4_1(bb),
                  sel16x4_0(aa), sel16x4_0(bb) );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2( sel32x2_1(aa), sel32x2_1(bb) );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2( sel32x2_0(aa), sel32x2_0(bb) );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_3(aa), sel16x4_1(aa),
                  sel16x4_3(bb), sel16x4_1(bb) );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_2(aa), sel16x4_0(aa),
                  sel16x4_2(bb), sel16x4_0(bb) );
}

/* ------------ Permutation ------------ */

ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             indexOrZero8x8(aa, sel8x8_7(bb)),
             indexOrZero8x8(aa, sel8x8_6(bb)),
             indexOrZero8x8(aa, sel8x8_5(bb)),
             indexOrZero8x8(aa, sel8x8_4(bb)),
             indexOrZero8x8(aa, sel8x8_3(bb)),
             indexOrZero8x8(aa, sel8x8_2(bb)),
             indexOrZero8x8(aa, sel8x8_1(bb)),
             indexOrZero8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
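
/* Illustrative sketch of the masking described above: a call such as
   h_generic_calc_ShlN16x4(0x0001000100010001ULL, 4) shifts every
   16-bit lane left by 4, giving 0x0010001000100010ULL, and bits
   shifted out of a lane are dropped rather than carried into the
   neighbouring lane. */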

ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
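
/* Worked example (illustrative): GetMSBs8x8 collects the top bit of
   each byte, most significant byte first.  For instance
   h_generic_calc_GetMSBs8x8(0x80008000FF000001ULL) == 0xA8, since
   only bytes 7, 5 and 3 (0x80, 0x80, 0xFF) have their MSB set. */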

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}

/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
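
/* Worked example (illustrative): Sad8Ux4 is a 4-lane sum of absolute
   differences.  For xx = 0x10203040 and yy = 0x0F213F41 the per-byte
   differences are 1, 1, 15 and 1, so the result is 18. */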

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}

/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )
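
/* Illustrative sketch of the macros: GET(x, y) extracts bit y of x as
   0 or 1, e.g. GET(0x2, 1) == 1; PUT(v, y) moves a 0/1 value back to
   bit position y, e.g. PUT(1, 3) == 0x8; NOT is a logical (not
   bitwise) complement of a 0/1 value. */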

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = w;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = x;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
      | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
      | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
      | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
      | PUT(k, 1) | PUT(m, 0);

   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations to calculate each of the DPD
      bits are given in Appendix B of Book 1: Power ISA User Instruction
      Set.  The bits for the DPD number are [abcdefghijkm].  The bits
      for the BCD value are [pqrstuvwxy].  The boolean logic equations
      in pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
      | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
      | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
      | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
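
/* Usage sketch (illustrative): both converters walk five 3-digit
   groups, so a 60-bit BCD value (15 digits, 12 bits per group) packs
   into 50 DPD bits and vice versa.  Each group is translated
   independently by bcd_to_dpb or dpb_to_bcd, and a trivial round trip
   such as h_calc_DPBtoBCD(h_calc_BCDtoDPB(0x0ULL)) yields 0x0ULL. */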

/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */
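
/* Edge-case sketch (illustrative of the semantics implemented below):
   h_calc_udiv32_w_arm_semantics(7, 0) == 0 rather than trapping;
   h_calc_sdiv32_w_arm_semantics(0x80000000, -1) == 0x80000000, since
   the mathematically correct +2^31 is unrepresentable in 32 bits; and
   all other cases round towards zero, e.g.
   h_calc_sdiv32_w_arm_semantics(-7, 2) == -3. */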

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL)
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

/*---------------------------------------------------------------*/
/*--- end                                 host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/