VEX/priv/host_generic_simd64.c

   1
   2 /*---------------------------------------------------------------*/
   3 /*--- begin                             host_generic_simd64.c ---*/
   4 /*---------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Valgrind, a dynamic binary instrumentation
   8    framework.
   9
  10    Copyright (C) 2004-2017 OpenWorks LLP
  11       info@open-works.net
  12
  13    This program is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU General Public License as
  15    published by the Free Software Foundation; either version 2 of the
  16    License, or (at your option) any later version.
  17
  18    This program is distributed in the hope that it will be useful, but
  19    WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    General Public License for more details.
  22
  23    You should have received a copy of the GNU General Public License
  24    along with this program; if not, write to the Free Software
  25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26    02110-1301, USA.
  27
  28    The GNU General Public License is contained in the file COPYING.
  29
  30    Neither the names of the U.S. Department of Energy nor the
  31    University of California nor the names of its contributors may be
  32    used to endorse or promote products derived from this software
  33    without prior written permission.
  34 */
  35
  36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
  37    where the instruction selectors cannot generate code in-line.
  38    These are purely back-end entities and cannot be seen/referenced
  39    from IR.  There are also helpers for 32-bit arithmetic in here. */
  40
  41 #include "libvex_basictypes.h"
  42 #include "main_util.h"              // LIKELY, UNLIKELY
  43 #include "host_generic_simd64.h"
  44
  45
  46
  47 /* Tuple/select functions for 32x2 vectors. */
  48
  49 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
  50    return (((ULong)w1) << 32) | ((ULong)w0);
  51 }
  52
  53 static inline UInt sel32x2_1 ( ULong w64 ) {
  54    return 0xFFFFFFFF & toUInt(w64 >> 32);
  55 }
  56 static inline UInt sel32x2_0 ( ULong w64 ) {
  57    return 0xFFFFFFFF & toUInt(w64);
  58 }
  59
  60
  61 /* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
  62    with 64-bit shifts so we give it a hand. */
  63
  64 static inline ULong mk16x4 ( UShort w3, UShort w2,
  65                              UShort w1, UShort w0 ) {
  66    UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
  67    UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
  68    return mk32x2(hi32, lo32);
  69 }
  70
  71 static inline UShort sel16x4_3 ( ULong w64 ) {
  72    UInt hi32 = toUInt(w64 >> 32);
  73    return toUShort(0xFFFF & (hi32 >> 16));
  74 }
  75 static inline UShort sel16x4_2 ( ULong w64 ) {
  76    UInt hi32 = toUInt(w64 >> 32);
  77    return toUShort(0xFFFF & hi32);
  78 }
  79 static inline UShort sel16x4_1 ( ULong w64 ) {
  80    UInt lo32 = (UInt)w64;
  81    return toUShort(0xFFFF & (lo32 >> 16));
  82 }
  83 static inline UShort sel16x4_0 ( ULong w64 ) {
  84    UInt lo32 = (UInt)w64;
  85    return toUShort(0xFFFF & lo32);
  86 }
  87
  88
  89 /* Tuple/select functions for 8x8 vectors. */
  90
  91 static inline ULong mk8x8 ( UChar w7, UChar w6,
  92                             UChar w5, UChar w4,
  93                             UChar w3, UChar w2,
  94                             UChar w1, UChar w0 ) {
  95    UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
  96                | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
  97    UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
  98                | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
  99    return mk32x2(hi32, lo32);
 100 }
 101
 102 static inline UChar sel8x8_7 ( ULong w64 ) {
 103    UInt hi32 = toUInt(w64 >> 32);
 104    return toUChar(0xFF & (hi32 >> 24));
 105 }
 106 static inline UChar sel8x8_6 ( ULong w64 ) {
 107    UInt hi32 = toUInt(w64 >> 32);
 108    return toUChar(0xFF & (hi32 >> 16));
 109 }
 110 static inline UChar sel8x8_5 ( ULong w64 ) {
 111    UInt hi32 = toUInt(w64 >> 32);
 112    return toUChar(0xFF & (hi32 >> 8));
 113 }
 114 static inline UChar sel8x8_4 ( ULong w64 ) {
 115    UInt hi32 = toUInt(w64 >> 32);
 116    return toUChar(0xFF & (hi32 >> 0));
 117 }
 118 static inline UChar sel8x8_3 ( ULong w64 ) {
 119    UInt lo32 = (UInt)w64;
 120    return toUChar(0xFF & (lo32 >> 24));
 121 }
 122 static inline UChar sel8x8_2 ( ULong w64 ) {
 123    UInt lo32 = (UInt)w64;
 124    return toUChar(0xFF & (lo32 >> 16));
 125 }
 126 static inline UChar sel8x8_1 ( ULong w64 ) {
 127    UInt lo32 = (UInt)w64;
 128    return toUChar(0xFF & (lo32 >> 8));
 129 }
 130 static inline UChar sel8x8_0 ( ULong w64 ) {
 131    UInt lo32 = (UInt)w64;
 132    return toUChar(0xFF & (lo32 >> 0));
 133 }
 134
 135 static inline UChar index8x8 ( ULong w64, UChar ix ) {
 136    ix &= 7;
 137    return toUChar((w64 >> (8*ix)) & 0xFF);
 138 }
 139
 140
 141 /* Scalar helpers. */
 142
 143 static inline Int qadd32S ( Int xx, Int yy )
 144 {
 145    Long t = ((Long)xx) + ((Long)yy);
 146    const Long loLim = -0x80000000LL;
 147    const Long hiLim =  0x7FFFFFFFLL;
 148    if (t < loLim) t = loLim;
 149    if (t > hiLim) t = hiLim;
 150    return (Int)t;
 151 }
 152
 153 static inline Short qadd16S ( Short xx, Short yy )
 154 {
 155    Int t = ((Int)xx) + ((Int)yy);
 156    if (t < -32768) t = -32768;
 157    if (t > 32767)  t = 32767;
 158    return (Short)t;
 159 }
 160
 161 static inline Char qadd8S ( Char xx, Char yy )
 162 {
 163    Int t = ((Int)xx) + ((Int)yy);
 164    if (t < -128) t = -128;
 165    if (t > 127)  t = 127;
 166    return (Char)t;
 167 }
 168
 169 static inline UShort qadd16U ( UShort xx, UShort yy )
 170 {
 171    UInt t = ((UInt)xx) + ((UInt)yy);
 172    if (t > 0xFFFF) t = 0xFFFF;
 173    return (UShort)t;
 174 }
 175
 176 static inline UChar qadd8U ( UChar xx, UChar yy )
 177 {
 178    UInt t = ((UInt)xx) + ((UInt)yy);
 179    if (t > 0xFF) t = 0xFF;
 180    return (UChar)t;
 181 }
 182
 183 static inline Int qsub32S ( Int xx, Int yy )
 184 {
 185    Long t = ((Long)xx) - ((Long)yy);
 186    const Long loLim = -0x80000000LL;
 187    const Long hiLim =  0x7FFFFFFFLL;
 188    if (t < loLim) t = loLim;
 189    if (t > hiLim) t = hiLim;
 190    return (Int)t;
 191 }
 192
 193 static inline Short qsub16S ( Short xx, Short yy )
 194 {
 195    Int t = ((Int)xx) - ((Int)yy);
 196    if (t < -32768) t = -32768;
 197    if (t > 32767)  t = 32767;
 198    return (Short)t;
 199 }
 200
 201 static inline Char qsub8S ( Char xx, Char yy )
 202 {
 203    Int t = ((Int)xx) - ((Int)yy);
 204    if (t < -128) t = -128;
 205    if (t > 127)  t = 127;
 206    return (Char)t;
 207 }
 208
 209 static inline UShort qsub16U ( UShort xx, UShort yy )
 210 {
 211    Int t = ((Int)xx) - ((Int)yy);
 212    if (t < 0)      t = 0;
 213    if (t > 0xFFFF) t = 0xFFFF;
 214    return (UShort)t;
 215 }
 216
 217 static inline UChar qsub8U ( UChar xx, UChar yy )
 218 {
 219    Int t = ((Int)xx) - ((Int)yy);
 220    if (t < 0)    t = 0;
 221    if (t > 0xFF) t = 0xFF;
 222    return (UChar)t;
 223 }
 224
 225 static inline Short mul16 ( Short xx, Short yy )
 226 {
 227    Int t = ((Int)xx) * ((Int)yy);
 228    return (Short)t;
 229 }
 230
 231 static inline Int mul32 ( Int xx, Int yy )
 232 {
 233    Int t = ((Int)xx) * ((Int)yy);
 234    return (Int)t;
 235 }
 236
 237 static inline Short mulhi16S ( Short xx, Short yy )
 238 {
 239    Int t = ((Int)xx) * ((Int)yy);
 240    t >>=/*s*/ 16;
 241    return (Short)t;
 242 }
 243
 244 static inline UShort mulhi16U ( UShort xx, UShort yy )
 245 {
 246    UInt t = ((UInt)xx) * ((UInt)yy);
 247    t >>=/*u*/ 16;
 248    return (UShort)t;
 249 }
 250
 251 static inline UInt cmpeq32 ( UInt xx, UInt yy )
 252 {
 253    return xx==yy ? 0xFFFFFFFF : 0;
 254 }
 255
 256 static inline UShort cmpeq16 ( UShort xx, UShort yy )
 257 {
 258    return toUShort(xx==yy ? 0xFFFF : 0);
 259 }
 260
 261 static inline UChar cmpeq8 ( UChar xx, UChar yy )
 262 {
 263    return toUChar(xx==yy ? 0xFF : 0);
 264 }
 265
 266 static inline UInt cmpgt32S ( Int xx, Int yy )
 267 {
 268    return xx>yy ? 0xFFFFFFFF : 0;
 269 }
 270
 271 static inline UShort cmpgt16S ( Short xx, Short yy )
 272 {
 273    return toUShort(xx>yy ? 0xFFFF : 0);
 274 }
 275
 276 static inline UChar cmpgt8S ( Char xx, Char yy )
 277 {
 278    return toUChar(xx>yy ? 0xFF : 0);
 279 }
 280
 281 static inline UInt cmpnez32 ( UInt xx )
 282 {
 283    return xx==0 ? 0 : 0xFFFFFFFF;
 284 }
 285
 286 static inline UShort cmpnez16 ( UShort xx )
 287 {
 288    return toUShort(xx==0 ? 0 : 0xFFFF);
 289 }
 290
 291 static inline UChar cmpnez8 ( UChar xx )
 292 {
 293    return toUChar(xx==0 ? 0 : 0xFF);
 294 }
 295
 296 static inline Short qnarrow32Sto16S ( UInt xx0 )
 297 {
 298    Int xx = (Int)xx0;
 299    if (xx < -32768) xx = -32768;
 300    if (xx > 32767)  xx = 32767;
 301    return (Short)xx;
 302 }
 303
 304 static inline Char qnarrow16Sto8S ( UShort xx0 )
 305 {
 306    Short xx = (Short)xx0;
 307    if (xx < -128) xx = -128;
 308    if (xx > 127)  xx = 127;
 309    return (Char)xx;
 310 }
 311
 312 static inline UChar qnarrow16Sto8U ( UShort xx0 )
 313 {
 314    Short xx = (Short)xx0;
 315    if (xx < 0)   xx = 0;
 316    if (xx > 255) xx = 255;
 317    return (UChar)xx;
 318 }
 319
 320 static inline UShort narrow32to16 ( UInt xx )
 321 {
 322    return (UShort)xx;
 323 }
 324
 325 static inline UChar narrow16to8 ( UShort xx )
 326 {
 327    return (UChar)xx;
 328 }
 329
 330 /* shifts: we don't care about out-of-range ones, since
 331    that is dealt with at a higher level. */
 332
 333 static inline UChar shl8 ( UChar v, UInt n )
 334 {
 335    return toUChar(v << n);
 336 }
 337
 338 static inline UChar sar8 ( UChar v, UInt n )
 339 {
 340    return toUChar(((Char)v) >> n);
 341 }
 342
 343 static inline UShort shl16 ( UShort v, UInt n )
 344 {
 345    return toUShort(v << n);
 346 }
 347
 348 static inline UShort shr16 ( UShort v, UInt n )
 349 {
 350    return toUShort((((UShort)v) >> n));
 351 }
 352
 353 static inline UShort sar16 ( UShort v, UInt n )
 354 {
 355    return toUShort(((Short)v) >> n);
 356 }
 357
 358 static inline UInt shl32 ( UInt v, UInt n )
 359 {
 360    return v << n;
 361 }
 362
 363 static inline UInt shr32 ( UInt v, UInt n )
 364 {
 365    return (((UInt)v) >> n);
 366 }
 367
 368 static inline UInt sar32 ( UInt v, UInt n )
 369 {
 370    return ((Int)v) >> n;
 371 }
 372
 373 static inline UChar avg8U ( UChar xx, UChar yy )
 374 {
 375    UInt xxi = (UInt)xx;
 376    UInt yyi = (UInt)yy;
 377    UInt r   = (xxi + yyi + 1) >> 1;
 378    return (UChar)r;
 379 }
 380
 381 static inline UShort avg16U ( UShort xx, UShort yy )
 382 {
 383    UInt xxi = (UInt)xx;
 384    UInt yyi = (UInt)yy;
 385    UInt r   = (xxi + yyi + 1) >> 1;
 386    return (UShort)r;
 387 }
 388
 389 static inline Short max16S ( Short xx, Short yy )
 390 {
 391    return toUShort((xx > yy) ? xx : yy);
 392 }
 393
 394 static inline UChar max8U ( UChar xx, UChar yy )
 395 {
 396    return toUChar((xx > yy) ? xx : yy);
 397 }
 398
 399 static inline Short min16S ( Short xx, Short yy )
 400 {
 401    return toUShort((xx < yy) ? xx : yy);
 402 }
 403
 404 static inline UChar min8U ( UChar xx, UChar yy )
 405 {
 406    return toUChar((xx < yy) ? xx : yy);
 407 }
 408
 409 static inline UShort hadd16U ( UShort xx, UShort yy )
 410 {
 411    UInt xxi = (UInt)xx;
 412    UInt yyi = (UInt)yy;
 413    UInt r   = (xxi + yyi) >> 1;
 414    return (UShort)r;
 415 }
 416
 417 static inline Short hadd16S ( Short xx, Short yy )
 418 {
 419    Int xxi = (Int)xx;
 420    Int yyi = (Int)yy;
 421    Int r   = (xxi + yyi) >> 1;
 422    return (Short)r;
 423 }
 424
 425 static inline UShort hsub16U ( UShort xx, UShort yy )
 426 {
 427    UInt xxi = (UInt)xx;
 428    UInt yyi = (UInt)yy;
 429    UInt r   = (xxi - yyi) >> 1;
 430    return (UShort)r;
 431 }
 432
 433 static inline Short hsub16S ( Short xx, Short yy )
 434 {
 435    Int xxi = (Int)xx;
 436    Int yyi = (Int)yy;
 437    Int r   = (xxi - yyi) >> 1;
 438    return (Short)r;
 439 }
 440
 441 static inline UChar hadd8U ( UChar xx, UChar yy )
 442 {
 443    UInt xxi = (UInt)xx;
 444    UInt yyi = (UInt)yy;
 445    UInt r   = (xxi + yyi) >> 1;
 446    return (UChar)r;
 447 }
 448
 449 static inline Char hadd8S ( Char xx, Char yy )
 450 {
 451    Int xxi = (Int)xx;
 452    Int yyi = (Int)yy;
 453    Int r   = (xxi + yyi) >> 1;
 454    return (Char)r;
 455 }
 456
 457 static inline UChar hsub8U ( UChar xx, UChar yy )
 458 {
 459    UInt xxi = (UInt)xx;
 460    UInt yyi = (UInt)yy;
 461    UInt r   = (xxi - yyi) >> 1;
 462    return (UChar)r;
 463 }
 464
 465 static inline Char hsub8S ( Char xx, Char yy )
 466 {
 467    Int xxi = (Int)xx;
 468    Int yyi = (Int)yy;
 469    Int r   = (xxi - yyi) >> 1;
 470    return (Char)r;
 471 }
 472
 473 static inline UInt absdiff8U ( UChar xx, UChar yy )
 474 {
 475    UInt xxu = (UChar)xx;
 476    UInt yyu = (UChar)yy;
 477    return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
 478 }
 479
 480 /* ----------------------------------------------------- */
 481 /* Start of the externally visible functions.  These simply
 482    implement the corresponding IR primops. */
 483 /* ----------------------------------------------------- */
 484
 485 /* ------------ Normal addition ------------ */
 486
 487 ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
 488 {
 489    return mk32x2(
 490              sel32x2_1(xx) + sel32x2_1(yy),
 491              sel32x2_0(xx) + sel32x2_0(yy)
 492           );
 493 }
 494
 495 ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
 496 {
 497    return mk16x4(
 498              toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
 499              toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
 500              toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
 501              toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
 502           );
 503 }
 504
 505 ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
 506 {
 507    return mk8x8(
 508              toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
 509              toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
 510              toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
 511              toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
 512              toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
 513              toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
 514              toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
 515              toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
 516           );
 517 }
 518
 519 /* ------------ Saturating addition ------------ */
 520
 521 ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
 522 {
 523    return mk16x4(
 524              qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
 525              qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
 526              qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
 527              qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
 528           );
 529 }
 530
 531 ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
 532 {
 533    return mk8x8(
 534              qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
 535              qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
 536              qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
 537              qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
 538              qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
 539              qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
 540              qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
 541              qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
 542           );
 543 }
 544
 545 ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
 546 {
 547    return mk16x4(
 548              qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
 549              qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
 550              qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
 551              qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
 552           );
 553 }
 554
 555 ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
 556 {
 557    return mk8x8(
 558              qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
 559              qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
 560              qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
 561              qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
 562              qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
 563              qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
 564              qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
 565              qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
 566           );
 567 }
 568
 569 /* ------------ Normal subtraction ------------ */
 570
 571 ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
 572 {
 573    return mk32x2(
 574              sel32x2_1(xx) - sel32x2_1(yy),
 575              sel32x2_0(xx) - sel32x2_0(yy)
 576           );
 577 }
 578
 579 ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
 580 {
 581    return mk16x4(
 582              toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
 583              toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
 584              toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
 585              toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
 586           );
 587 }
 588
 589 ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
 590 {
 591    return mk8x8(
 592              toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
 593              toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
 594              toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
 595              toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
 596              toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
 597              toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
 598              toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
 599              toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
 600           );
 601 }
 602
 603 /* ------------ Saturating subtraction ------------ */
 604
 605 ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
 606 {
 607    return mk16x4(
 608              qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
 609              qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
 610              qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
 611              qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
 612           );
 613 }
 614
 615 ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
 616 {
 617    return mk8x8(
 618              qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
 619              qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
 620              qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
 621              qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
 622              qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
 623              qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
 624              qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
 625              qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
 626           );
 627 }
 628
 629 ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
 630 {
 631    return mk16x4(
 632              qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
 633              qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
 634              qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
 635              qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
 636           );
 637 }
 638
 639 ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
 640 {
 641    return mk8x8(
 642              qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
 643              qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
 644              qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
 645              qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
 646              qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
 647              qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
 648              qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
 649              qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
 650           );
 651 }
 652
 653 /* ------------ Multiplication ------------ */
 654
 655 ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
 656 {
 657    return mk16x4(
 658              mul16( sel16x4_3(xx), sel16x4_3(yy) ),
 659              mul16( sel16x4_2(xx), sel16x4_2(yy) ),
 660              mul16( sel16x4_1(xx), sel16x4_1(yy) ),
 661              mul16( sel16x4_0(xx), sel16x4_0(yy) )
 662           );
 663 }
 664
 665 ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
 666 {
 667    return mk32x2(
 668              mul32( sel32x2_1(xx), sel32x2_1(yy) ),
 669              mul32( sel32x2_0(xx), sel32x2_0(yy) )
 670           );
 671 }
 672
 673 ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
 674 {
 675    return mk16x4(
 676              mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
 677              mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
 678              mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
 679              mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
 680           );
 681 }
 682
 683 ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
 684 {
 685    return mk16x4(
 686              mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
 687              mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
 688              mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
 689              mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
 690           );
 691 }
 692
 693 /* ------------ Comparison ------------ */
 694
 695 ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
 696 {
 697    return mk32x2(
 698              cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
 699              cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
 700           );
 701 }
 702
 703 ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
 704 {
 705    return mk16x4(
 706              cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
 707              cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
 708              cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
 709              cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
 710           );
 711 }
 712
 713 ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
 714 {
 715    return mk8x8(
 716              cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
 717              cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
 718              cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
 719              cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
 720              cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
 721              cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
 722              cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
 723              cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
 724           );
 725 }
 726
 727 ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
 728 {
 729    return mk32x2(
 730              cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
 731              cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
 732           );
 733 }
 734
 735 ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
 736 {
 737    return mk16x4(
 738              cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
 739              cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
 740              cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
 741              cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
 742           );
 743 }
 744
 745 ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
 746 {
 747    return mk8x8(
 748              cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
 749              cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
 750              cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
 751              cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
 752              cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
 753              cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
 754              cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
 755              cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
 756           );
 757 }
 758
 759 ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
 760 {
 761    return mk32x2(
 762              cmpnez32( sel32x2_1(xx) ),
 763              cmpnez32( sel32x2_0(xx) )
 764           );
 765 }
 766
 767 ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
 768 {
 769    return mk16x4(
 770              cmpnez16( sel16x4_3(xx) ),
 771              cmpnez16( sel16x4_2(xx) ),
 772              cmpnez16( sel16x4_1(xx) ),
 773              cmpnez16( sel16x4_0(xx) )
 774           );
 775 }
 776
 777 ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
 778 {
 779    return mk8x8(
 780              cmpnez8( sel8x8_7(xx) ),
 781              cmpnez8( sel8x8_6(xx) ),
 782              cmpnez8( sel8x8_5(xx) ),
 783              cmpnez8( sel8x8_4(xx) ),
 784              cmpnez8( sel8x8_3(xx) ),
 785              cmpnez8( sel8x8_2(xx) ),
 786              cmpnez8( sel8x8_1(xx) ),
 787              cmpnez8( sel8x8_0(xx) )
 788           );
 789 }
 790
 791 /* ------------ Saturating narrowing ------------ */
 792
 793 ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
 794 {
 795    UInt d = sel32x2_1(aa);
 796    UInt c = sel32x2_0(aa);
 797    UInt b = sel32x2_1(bb);
 798    UInt a = sel32x2_0(bb);
 799    return mk16x4(
 800              qnarrow32Sto16S(d),
 801              qnarrow32Sto16S(c),
 802              qnarrow32Sto16S(b),
 803              qnarrow32Sto16S(a)
 804           );
 805 }
 806
 807 ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
 808 {
 809    UShort h = sel16x4_3(aa);
 810    UShort g = sel16x4_2(aa);
 811    UShort f = sel16x4_1(aa);
 812    UShort e = sel16x4_0(aa);
 813    UShort d = sel16x4_3(bb);
 814    UShort c = sel16x4_2(bb);
 815    UShort b = sel16x4_1(bb);
 816    UShort a = sel16x4_0(bb);
 817    return mk8x8(
 818              qnarrow16Sto8S(h),
 819              qnarrow16Sto8S(g),
 820              qnarrow16Sto8S(f),
 821              qnarrow16Sto8S(e),
 822              qnarrow16Sto8S(d),
 823              qnarrow16Sto8S(c),
 824              qnarrow16Sto8S(b),
 825              qnarrow16Sto8S(a)
 826           );
 827 }
 828
 829 ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
 830 {
 831    UShort h = sel16x4_3(aa);
 832    UShort g = sel16x4_2(aa);
 833    UShort f = sel16x4_1(aa);
 834    UShort e = sel16x4_0(aa);
 835    UShort d = sel16x4_3(bb);
 836    UShort c = sel16x4_2(bb);
 837    UShort b = sel16x4_1(bb);
 838    UShort a = sel16x4_0(bb);
 839    return mk8x8(
 840              qnarrow16Sto8U(h),
 841              qnarrow16Sto8U(g),
 842              qnarrow16Sto8U(f),
 843              qnarrow16Sto8U(e),
 844              qnarrow16Sto8U(d),
 845              qnarrow16Sto8U(c),
 846              qnarrow16Sto8U(b),
 847              qnarrow16Sto8U(a)
 848           );
 849 }
 850
 851 /* ------------ Truncating narrowing ------------ */
 852
 853 ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
 854 {
 855    UInt d = sel32x2_1(aa);
 856    UInt c = sel32x2_0(aa);
 857    UInt b = sel32x2_1(bb);
 858    UInt a = sel32x2_0(bb);
 859    return mk16x4(
 860              narrow32to16(d),
 861              narrow32to16(c),
 862              narrow32to16(b),
 863              narrow32to16(a)
 864           );
 865 }
 866
 867 ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
 868 {
 869    UShort h = sel16x4_3(aa);
 870    UShort g = sel16x4_2(aa);
 871    UShort f = sel16x4_1(aa);
 872    UShort e = sel16x4_0(aa);
 873    UShort d = sel16x4_3(bb);
 874    UShort c = sel16x4_2(bb);
 875    UShort b = sel16x4_1(bb);
 876    UShort a = sel16x4_0(bb);
 877    return mk8x8(
 878              narrow16to8(h),
 879              narrow16to8(g),
 880              narrow16to8(f),
 881              narrow16to8(e),
 882              narrow16to8(d),
 883              narrow16to8(c),
 884              narrow16to8(b),
 885              narrow16to8(a)
 886           );
 887 }
 888
 889 /* ------------ Interleaving ------------ */
 890
 891 ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
 892 {
 893    return mk8x8(
 894              sel8x8_7(aa),
 895              sel8x8_7(bb),
 896              sel8x8_6(aa),
 897              sel8x8_6(bb),
 898              sel8x8_5(aa),
 899              sel8x8_5(bb),
 900              sel8x8_4(aa),
 901              sel8x8_4(bb)
 902           );
 903 }
 904
 905 ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
 906 {
 907    return mk8x8(
 908              sel8x8_3(aa),
 909              sel8x8_3(bb),
 910              sel8x8_2(aa),
 911              sel8x8_2(bb),
 912              sel8x8_1(aa),
 913              sel8x8_1(bb),
 914              sel8x8_0(aa),
 915              sel8x8_0(bb)
 916           );
 917 }
 918
 919 ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
 920 {
 921    return mk16x4(
 922              sel16x4_3(aa),
 923              sel16x4_3(bb),
 924              sel16x4_2(aa),
 925              sel16x4_2(bb)
 926           );
 927 }
 928
 929 ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
 930 {
 931    return mk16x4(
 932              sel16x4_1(aa),
 933              sel16x4_1(bb),
 934              sel16x4_0(aa),
 935              sel16x4_0(bb)
 936           );
 937 }
 938
 939 ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
 940 {
 941    return mk32x2(
 942              sel32x2_1(aa),
 943              sel32x2_1(bb)
 944           );
 945 }
 946
 947 ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
 948 {
 949    return mk32x2(
 950              sel32x2_0(aa),
 951              sel32x2_0(bb)
 952           );
 953 }
 954
 955 /* ------------ Concatenation ------------ */
 956
 957 ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
 958 {
 959    return mk16x4(
 960              sel16x4_3(aa),
 961              sel16x4_1(aa),
 962              sel16x4_3(bb),
 963              sel16x4_1(bb)
 964           );
 965 }
 966
 967 ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
 968 {
 969    return mk16x4(
 970              sel16x4_2(aa),
 971              sel16x4_0(aa),
 972              sel16x4_2(bb),
 973              sel16x4_0(bb)
 974           );
 975 }
 976
 977 /* misc hack looking for a proper home */
 978 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
 979 {
 980    return mk8x8(
 981              index8x8(aa, sel8x8_7(bb)),
 982              index8x8(aa, sel8x8_6(bb)),
 983              index8x8(aa, sel8x8_5(bb)),
 984              index8x8(aa, sel8x8_4(bb)),
 985              index8x8(aa, sel8x8_3(bb)),
 986              index8x8(aa, sel8x8_2(bb)),
 987              index8x8(aa, sel8x8_1(bb)),
 988              index8x8(aa, sel8x8_0(bb))
 989           );
 990 }
 991
 992 /* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc), it is an error if we are ever
   given an out-of-range shift amount.
*/
 999 ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
1000 {
1001    /* vassert(nn < 32); */
1002    nn &= 31;
1003    return mk32x2(
1004              shl32( sel32x2_1(xx), nn ),
1005              shl32( sel32x2_0(xx), nn )
1006           );
1007 }
1008
1009 ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
1010 {
1011    /* vassert(nn < 16); */
1012    nn &= 15;
1013    return mk16x4(
1014              shl16( sel16x4_3(xx), nn ),
1015              shl16( sel16x4_2(xx), nn ),
1016              shl16( sel16x4_1(xx), nn ),
1017              shl16( sel16x4_0(xx), nn )
1018           );
1019 }
1020
1021 ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
1022 {
1023    /* vassert(nn < 8); */
1024    nn &= 7;
1025    return mk8x8(
1026              shl8( sel8x8_7(xx), nn ),
1027              shl8( sel8x8_6(xx), nn ),
1028              shl8( sel8x8_5(xx), nn ),
1029              shl8( sel8x8_4(xx), nn ),
1030              shl8( sel8x8_3(xx), nn ),
1031              shl8( sel8x8_2(xx), nn ),
1032              shl8( sel8x8_1(xx), nn ),
1033              shl8( sel8x8_0(xx), nn )
1034           );
1035 }
1036
1037 ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
1038 {
1039    /* vassert(nn < 32); */
1040    nn &= 31;
1041    return mk32x2(
1042              shr32( sel32x2_1(xx), nn ),
1043              shr32( sel32x2_0(xx), nn )
1044           );
1045 }
1046
1047 ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
1048 {
1049    /* vassert(nn < 16); */
1050    nn &= 15;
1051    return mk16x4(
1052              shr16( sel16x4_3(xx), nn ),
1053              shr16( sel16x4_2(xx), nn ),
1054              shr16( sel16x4_1(xx), nn ),
1055              shr16( sel16x4_0(xx), nn )
1056           );
1057 }
1058
1059 ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
1060 {
1061    /* vassert(nn < 32); */
1062    nn &= 31;
1063    return mk32x2(
1064              sar32( sel32x2_1(xx), nn ),
1065              sar32( sel32x2_0(xx), nn )
1066           );
1067 }
1068
1069 ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
1070 {
1071    /* vassert(nn < 16); */
1072    nn &= 15;
1073    return mk16x4(
1074              sar16( sel16x4_3(xx), nn ),
1075              sar16( sel16x4_2(xx), nn ),
1076              sar16( sel16x4_1(xx), nn ),
1077              sar16( sel16x4_0(xx), nn )
1078           );
1079 }
1080
1081 ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
1082 {
1083    /* vassert(nn < 8); */
1084    nn &= 7;
1085    return mk8x8(
1086              sar8( sel8x8_7(xx), nn ),
1087              sar8( sel8x8_6(xx), nn ),
1088              sar8( sel8x8_5(xx), nn ),
1089              sar8( sel8x8_4(xx), nn ),
1090              sar8( sel8x8_3(xx), nn ),
1091              sar8( sel8x8_2(xx), nn ),
1092              sar8( sel8x8_1(xx), nn ),
1093              sar8( sel8x8_0(xx), nn )
1094           );
1095 }
1096
1097 /* ------------ Averaging ------------ */
1098
1099 ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
1100 {
1101    return mk8x8(
1102              avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
1103              avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
1104              avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
1105              avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
1106              avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
1107              avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
1108              avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
1109              avg8U( sel8x8_0(xx), sel8x8_0(yy) )
1110           );
1111 }
1112
1113 ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
1114 {
1115    return mk16x4(
1116              avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
1117              avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
1118              avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
1119              avg16U( sel16x4_0(xx), sel16x4_0(yy) )
1120           );
1121 }
1122
1123 /* ------------ max/min ------------ */
1124
1125 ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
1126 {
1127    return mk16x4(
1128              max16S( sel16x4_3(xx), sel16x4_3(yy) ),
1129              max16S( sel16x4_2(xx), sel16x4_2(yy) ),
1130              max16S( sel16x4_1(xx), sel16x4_1(yy) ),
1131              max16S( sel16x4_0(xx), sel16x4_0(yy) )
1132           );
1133 }
1134
1135 ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
1136 {
1137    return mk8x8(
1138              max8U( sel8x8_7(xx), sel8x8_7(yy) ),
1139              max8U( sel8x8_6(xx), sel8x8_6(yy) ),
1140              max8U( sel8x8_5(xx), sel8x8_5(yy) ),
1141              max8U( sel8x8_4(xx), sel8x8_4(yy) ),
1142              max8U( sel8x8_3(xx), sel8x8_3(yy) ),
1143              max8U( sel8x8_2(xx), sel8x8_2(yy) ),
1144              max8U( sel8x8_1(xx), sel8x8_1(yy) ),
1145              max8U( sel8x8_0(xx), sel8x8_0(yy) )
1146           );
1147 }
1148
1149 ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
1150 {
1151    return mk16x4(
1152              min16S( sel16x4_3(xx), sel16x4_3(yy) ),
1153              min16S( sel16x4_2(xx), sel16x4_2(yy) ),
1154              min16S( sel16x4_1(xx), sel16x4_1(yy) ),
1155              min16S( sel16x4_0(xx), sel16x4_0(yy) )
1156           );
1157 }
1158
1159 ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
1160 {
1161    return mk8x8(
1162              min8U( sel8x8_7(xx), sel8x8_7(yy) ),
1163              min8U( sel8x8_6(xx), sel8x8_6(yy) ),
1164              min8U( sel8x8_5(xx), sel8x8_5(yy) ),
1165              min8U( sel8x8_4(xx), sel8x8_4(yy) ),
1166              min8U( sel8x8_3(xx), sel8x8_3(yy) ),
1167              min8U( sel8x8_2(xx), sel8x8_2(yy) ),
1168              min8U( sel8x8_1(xx), sel8x8_1(yy) ),
1169              min8U( sel8x8_0(xx), sel8x8_0(yy) )
1170           );
1171 }
1172
1173 UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
1174 {
1175    UInt r = 0;
1176    if (xx & (1ULL << (64-1))) r |= (1<<7);
1177    if (xx & (1ULL << (56-1))) r |= (1<<6);
1178    if (xx & (1ULL << (48-1))) r |= (1<<5);
1179    if (xx & (1ULL << (40-1))) r |= (1<<4);
1180    if (xx & (1ULL << (32-1))) r |= (1<<3);
1181    if (xx & (1ULL << (24-1))) r |= (1<<2);
1182    if (xx & (1ULL << (16-1))) r |= (1<<1);
1183    if (xx & (1ULL << ( 8-1))) r |= (1<<0);
1184    return r;
1185 }
1186
1187 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1188
1189 /* Tuple/select functions for 16x2 vectors. */
1190 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1191    return (((UInt)w1) << 16) | ((UInt)w2);
1192 }
1193
1194 static inline UShort sel16x2_1 ( UInt w32 ) {
1195    return 0xFFFF & (UShort)(w32 >> 16);
1196 }
1197 static inline UShort sel16x2_0 ( UInt w32 ) {
1198    return 0xFFFF & (UShort)(w32);
1199 }
1200
1201 static inline UInt mk8x4 ( UChar w3, UChar w2,
1202                            UChar w1, UChar w0 ) {
1203    UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
1204               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
1205    return w32;
1206 }
1207
1208 static inline UChar sel8x4_3 ( UInt w32 ) {
1209    return toUChar(0xFF & (w32 >> 24));
1210 }
1211 static inline UChar sel8x4_2 ( UInt w32 ) {
1212    return toUChar(0xFF & (w32 >> 16));
1213 }
1214 static inline UChar sel8x4_1 ( UInt w32 ) {
1215    return toUChar(0xFF & (w32 >> 8));
1216 }
1217 static inline UChar sel8x4_0 ( UInt w32 ) {
1218    return toUChar(0xFF & (w32 >> 0));
1219 }
1220
1221
1222 /* ----------------------------------------------------- */
1223 /* More externally visible functions.  These simply
1224    implement the corresponding IR primops. */
1225 /* ----------------------------------------------------- */
1226
1227 /* ------ 16x2 ------ */
1228
1229 UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
1230 {
1231    return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
1232                   sel16x2_0(xx) + sel16x2_0(yy) );
1233 }
1234
1235 UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
1236 {
1237    return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
1238                   sel16x2_0(xx) - sel16x2_0(yy) );
1239 }
1240
1241 UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
1242 {
1243    return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
1244                   hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1245 }
1246
1247 UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
1248 {
1249    return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
1250                   hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1251 }
1252
1253 UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
1254 {
1255    return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
1256                   hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1257 }
1258
1259 UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
1260 {
1261    return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
1262                   hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1263 }
1264
1265 UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
1266 {
1267    return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
1268                   qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1269 }
1270
1271 UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
1272 {
1273    return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
1274                   qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1275 }
1276
1277 UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
1278 {
1279    return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
1280                   qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
1281 }
1282
1283 UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
1284 {
1285    return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
1286                   qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
1287 }
1288
1289 /* ------ 8x4 ------ */
1290
1291 UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
1292 {
1293    return mk8x4(
1294              sel8x4_3(xx) + sel8x4_3(yy),
1295              sel8x4_2(xx) + sel8x4_2(yy),
1296              sel8x4_1(xx) + sel8x4_1(yy),
1297              sel8x4_0(xx) + sel8x4_0(yy)
1298           );
1299 }
1300
1301 UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
1302 {
1303    return mk8x4(
1304              sel8x4_3(xx) - sel8x4_3(yy),
1305              sel8x4_2(xx) - sel8x4_2(yy),
1306              sel8x4_1(xx) - sel8x4_1(yy),
1307              sel8x4_0(xx) - sel8x4_0(yy)
1308           );
1309 }
1310
1311 UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
1312 {
1313    return mk8x4(
1314              hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
1315              hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
1316              hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
1317              hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
1318           );
1319 }
1320
1321 UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
1322 {
1323    return mk8x4(
1324              hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
1325              hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
1326              hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
1327              hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
1328           );
1329 }
1330
1331 UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
1332 {
1333    return mk8x4(
1334              hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
1335              hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
1336              hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
1337              hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
1338           );
1339 }
1340
1341 UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
1342 {
1343    return mk8x4(
1344              hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
1345              hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
1346              hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
1347              hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
1348           );
1349 }
1350
1351 UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
1352 {
1353    return mk8x4(
1354              qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
1355              qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
1356              qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
1357              qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
1358           );
1359 }
1360
1361 UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
1362 {
1363    return mk8x4(
1364              qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
1365              qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
1366              qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
1367              qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
1368           );
1369 }
1370
1371 UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
1372 {
1373    return mk8x4(
1374              qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
1375              qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
1376              qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
1377              qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
1378           );
1379 }
1380
1381 UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
1382 {
1383    return mk8x4(
1384              qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
1385              qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
1386              qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
1387              qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
1388           );
1389 }
1390
1391 UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
1392 {
1393    return mk16x2(
1394              cmpnez16( sel16x2_1(xx) ),
1395              cmpnez16( sel16x2_0(xx) )
1396           );
1397 }
1398
1399 UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
1400 {
1401    return mk8x4(
1402              cmpnez8( sel8x4_3(xx) ),
1403              cmpnez8( sel8x4_2(xx) ),
1404              cmpnez8( sel8x4_1(xx) ),
1405              cmpnez8( sel8x4_0(xx) )
1406           );
1407 }
1408
1409 UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
1410 {
1411    return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
1412           + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
1413           + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
1414           + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
1415 }
1416
1417 UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
1418 {
1419    return qadd32S( xx, yy );
1420 }
1421
1422 UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
1423 {
1424    return qsub32S( xx, yy );
1425 }
1426
1427
1428 /*------------------------------------------------------------------*/
1429 /* Decimal Floating Point (DFP) externally visible helper functions */
1430 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
1431 /*------------------------------------------------------------------*/
1432
/* Single-bit helpers for the DPB <-> BCD conversions.

   NOT(x)    : 1 if x is zero, else 0.
   GET(x, y) : bit number y of x, as 0 or 1.  The mask is built from
               an unsigned long long so that it is 64 bits wide on
               every host; an unsigned long mask is only 32 bits wide
               on hosts where long is 32 bits, which would make the
               shift undefined for y >= 32.
   PUT(x, y) : x shifted left by y bit positions. */
#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1ULL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x )<< ( y ) )
1436
/* Convert one 10-bit densely packed group to 12 bits of BCD.

   Only bits 9..0 of 'chunk' are read; they are named p..y, with p
   being bit 9 and y bit 0.  The twelve output bits are named a..m
   (the letter l is not used) and are packed with a at bit 11 and m
   at bit 0.  All other bits of the returned value are zero. */
static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   /* Split the input into single bits, each held as 0 or 1. */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations.*/
   /* d, h and m are straight copies of r, u and y respectively. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
            | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
            | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   /* Reassemble the 12 result bits, a being the most significant. */
   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
            | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
            | PUT(k, 1) | PUT(m, 0);
   return value;
}
1476
/* Convert 12 bits of BCD (three digits) to one 10-bit densely packed
   group.

   Only bits 11..0 of 'chunk' are read.  The result occupies bits
   9..0 of the returned value; all other bits are zero. */
static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal (DPD)
    value.  The boolean equations to calculate the value of each of the
    DPD bits are given in Appendix B of Book 1: Power ISA User Instruction
    set.  The bits for the BCD value are [abcdefghijkm], with a being bit
    11 of the input.  The bits for the DPD number are [pqrstuvwxy], with p
    being bit 9 of the output.  The boolean logic equations in pseudo C
    code are:
    */
   /* Split the input into single bits, each held as 0 or 1. */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   /* r, u and y are straight copies of d, h and m respectively. */
   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
            | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
            | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   /* Reassemble the 10 result bits, p being the most significant. */
   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
            | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
1519
1520 ULong h_calc_DPBtoBCD( ULong dpb )
1521 {
1522    ULong result, chunk;
1523    Int i;
1524
1525    result = 0;
1526
1527    for (i = 0; i < 5; i++) {
1528       chunk = dpb >> ( 4 - i ) * 10;
1529       result = result << 12;
1530       result |= dpb_to_bcd( chunk & 0x3FF );
1531    }
1532    return result;
1533 }
1534
1535 ULong h_calc_BCDtoDPB( ULong bcd )
1536 {
1537    ULong result, chunk;
1538    Int i;
1539
1540    result = 0;
1541
1542    for (i = 0; i < 5; i++) {
1543       chunk = bcd >> ( 4 - i ) * 12;
1544       result = result << 10;
1545       result |= bcd_to_dpb( chunk & 0xFFF );
1546    }
1547    return result;
1548 }
1549 #undef NOT
1550 #undef GET
1551 #undef PUT
1552
1553
1554 /* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
1561 /* ----------------------------------------------------- */
1562
1563 UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
1564 {
1565    // Division by zero --> zero
1566    if (UNLIKELY(y == 0)) return 0;
1567    // C requires rounding towards zero, which is also what we need.
1568    return x / y;
1569 }
1570
1571 ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
1572 {
1573    // Division by zero --> zero
1574    if (UNLIKELY(y == 0)) return 0;
1575    // C requires rounding towards zero, which is also what we need.
1576    return x / y;
1577 }
1578
1579 Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
1580 {
1581    // Division by zero --> zero
1582    if (UNLIKELY(y == 0)) return 0;
1583    // The single case that produces an unrepresentable result
1584    if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
1585                  && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
1586       return (Int)(UInt)0x80000000;
1587    // Else return the result rounded towards zero.  C89 says
1588    // this is implementation defined (in the signed case), but gcc
1589    // promises to round towards zero.  Nevertheless, at startup,
1590    // in main_main.c, do a check for that.
1591    return x / y;
1592 }
1593
1594 Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
1595 {
1596    // Division by zero --> zero
1597    if (UNLIKELY(y == 0)) return 0;
1598    // The single case that produces an unrepresentable result
1599    if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
1600                  && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
1601       return (Long)(ULong)0x8000000000000000ULL;
1602    // Else return the result rounded towards zero.  C89 says
1603    // this is implementation defined (in the signed case), but gcc
1604    // promises to round towards zero.  Nevertheless, at startup,
1605    // in main_main.c, do a check for that.
1606    return x / y;
1607 }
1608
1609
1610 /*---------------------------------------------------------------*/
1611 /*--- end                               host_generic_simd64.c ---*/
1612 /*---------------------------------------------------------------*/