/*---------------------------------------------------------------*/
/*--- begin                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"

/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}

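/* Worked example (illustrative, assuming the lane numbering above, with
   lane 3 in the most significant 16 bits and lane 0 in the least):

      mk16x4(0x0123, 0x4567, 0x89AB, 0xCDEF) == 0x0123456789ABCDEFULL
      sel16x4_2(0x0123456789ABCDEFULL)       == 0x4567
*/
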
/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
   Char zeroingMask = (Char)ix;
   zeroingMask ^= 0x80;
   zeroingMask >>= 7;
   ix &= 7;
   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
}

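/* Worked example (illustrative, assuming the code above behaves as
   written): indexOrZero8x8 returns byte (ix & 7) of w64, or zero when
   bit 7 of ix is set.  The zeroingMask trick relies on arithmetic right
   shift: (Char)ix ^ 0x80 is negative exactly when bit 7 of ix is clear,
   so ">> 7" yields 0xFF (select the byte) or 0x00 (force zero).  E.g.

      indexOrZero8x8(0x0011223344556677ULL, 2)    == 0x55
      indexOrZero8x8(0x0011223344556677ULL, 0x82) == 0x00
*/
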
/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

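/* Worked example (illustrative): the halving ops compute (x + y) >> 1 in
   a wider type, so the signed variants round towards minus infinity,
   whereas avg8U/avg16U add 1 before shifting and hence round upwards:

      hadd8S(-1, 0) == -1      // (-1 + 0) >> 1, arithmetic shift
      avg8U(1, 2)   == 2       // (1 + 2 + 1) >> 1
*/
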
/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

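/* Worked example (illustrative): each lane is added independently and
   wraps modulo the lane width, e.g.

      h_generic_calc_Add16x4(0x0001000200030004ULL, 0xFFFF000100010001ULL)
         == 0x0000000300040005ULL    // 0x0001 + 0xFFFF wraps to 0x0000
*/
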
/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

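/* Worked example (illustrative): unlike the plain Add ops above, the
   QAdd variants clamp at the lane limits instead of wrapping, e.g.

      qadd8U(0xFF, 0x01) == 0xFF    // saturates rather than wrapping to 0
      qadd8S(100, 100)   == 127     // clamps at the signed maximum
*/
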
/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

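/* Worked example (illustrative): in these binary narrowing ops the lanes
   of aa land in the high half of the result and the lanes of bb in the
   low half, each lane saturated to the narrower range, e.g.

      h_generic_calc_QNarrowBin32Sto16Sx4(0x00012345FFFF8000ULL,
                                          0x0000003080000000ULL)
         == 0x7FFF800000308000ULL
*/
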
/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* ------------ Permutation ------------ */

ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             indexOrZero8x8(aa, sel8x8_7(bb)),
             indexOrZero8x8(aa, sel8x8_6(bb)),
             indexOrZero8x8(aa, sel8x8_5(bb)),
             indexOrZero8x8(aa, sel8x8_4(bb)),
             indexOrZero8x8(aa, sel8x8_3(bb)),
             indexOrZero8x8(aa, sel8x8_2(bb)),
             indexOrZero8x8(aa, sel8x8_1(bb)),
             indexOrZero8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/

ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

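/* Worked example (illustrative): the SarN ops replicate each lane's sign
   bit, e.g.

      h_generic_calc_SarN16x4(0x8000000400F0FFFFULL, 4)
         == 0xF8000000000FFFFFULL
*/
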
/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}

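/* Worked example (illustrative): GetMSBs8x8 collects the most significant
   bit of each byte lane into an 8-bit mask, with byte 7's MSB in bit 7,
   e.g.

      h_generic_calc_GetMSBs8x8(0x80FF000000000080ULL) == 0xC1
*/
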
/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}

/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

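/* Worked example (illustrative): Sad8Ux4 sums the absolute differences of
   the four byte lanes, e.g.

      h_generic_calc_Sad8Ux4(0x10203040, 0x00302010)
         == 0x10 + 0x10 + 0x10 + 0x30 == 0x60
*/
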
UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}

/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal (DPD)
      value.  The boolean equations for each of the DPD bits are given in
      Appendix B of Book 1: Power ISA User Instruction Set.  The bits of
      the DPD number are [abcdefghijkm] and the bits of the BCD value are
      [pqrstuvwxy].  The boolean logic equations in pseudo C code are: */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}

#undef NOT
#undef GET
#undef PUT

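/* Worked example (illustrative): each 10-bit DPD declet encodes three BCD
   digits.  When all three digits are in 0..7 the declet is simply the low
   three bits of each digit with a 0 in bit 3, so

      bcd_to_dpb(0x123) == 0x0A3    and    dpb_to_bcd(0x0A3) == 0x123

   h_calc_BCDtoDPB / h_calc_DPBtoBCD apply this per-declet conversion to
   five 12-bit (respectively 10-bit) chunks of the operand.
*/
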
/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL)
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

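/* Worked examples (illustrative): the two edge cases pinned down by the
   ARM semantics described above are

      h_calc_udiv32_w_arm_semantics(7, 0)                == 0
      h_calc_sdiv32_w_arm_semantics((Int)0x80000000, -1) == (Int)0x80000000

   i.e. division by zero yields zero, and INT_MIN / -1 yields INT_MIN
   rather than trapping.
*/
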
/*---------------------------------------------------------------*/
/*--- end                                 host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/