/*---------------------------------------------------------------*/
/*--- begin                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"

/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
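
/* Illustrative sketch (not in the original source): mk32x2 and the
   sel32x2_1/sel32x2_0 selectors are exact inverses of each other, e.g.
   mk32x2(0xDEADBEEF, 0x01234567) == 0xDEADBEEF01234567ULL, with
   sel32x2_1 and sel32x2_0 recovering 0xDEADBEEF and 0x01234567
   respectively.  The 16x4 and 8x8 helpers below follow the same
   round-trip pattern. */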

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
   Char zeroingMask = (Char)ix;
   zeroingMask ^= 0x80;
   zeroingMask >>= 7;
   ix &= 7;
   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
}

/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
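
/* Saturation sketch (illustrative, derived from the helpers above):
   each qadd helper widens, adds, then clamps to the lane's range
   rather than wrapping.  For example qadd16S(30000, 10000) == 32767
   (the signed 16-bit maximum) and qadd8U(200, 100) == 255 (the
   unsigned 8-bit maximum). */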

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>= 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>= 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}
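
/* Usage sketch (illustrative): each of these wrappers splits both
   operands into lanes, applies the scalar operation per lane, and
   repacks the result.  For example
   h_generic_calc_Add32x2(0x0000000100000002ULL, 0x0000000300000004ULL)
   == 0x0000000400000006ULL; any carry out of a lane is discarded. */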

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8( sel8x8_7(aa), sel8x8_7(bb),
                 sel8x8_6(aa), sel8x8_6(bb),
                 sel8x8_5(aa), sel8x8_5(bb),
                 sel8x8_4(aa), sel8x8_4(bb) );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8( sel8x8_3(aa), sel8x8_3(bb),
                 sel8x8_2(aa), sel8x8_2(bb),
                 sel8x8_1(aa), sel8x8_1(bb),
                 sel8x8_0(aa), sel8x8_0(bb) );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_3(aa), sel16x4_3(bb),
                  sel16x4_2(aa), sel16x4_2(bb) );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_1(aa), sel16x4_1(bb),
                  sel16x4_0(aa), sel16x4_0(bb) );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2( sel32x2_1(aa), sel32x2_1(bb) );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2( sel32x2_0(aa), sel32x2_0(bb) );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_3(aa), sel16x4_1(aa),
                  sel16x4_3(bb), sel16x4_1(bb) );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4( sel16x4_2(aa), sel16x4_0(aa),
                  sel16x4_2(bb), sel16x4_0(bb) );
}

/* ------------ Permutation ------------ */

ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             indexOrZero8x8(aa, sel8x8_7(bb)),
             indexOrZero8x8(aa, sel8x8_6(bb)),
             indexOrZero8x8(aa, sel8x8_5(bb)),
             indexOrZero8x8(aa, sel8x8_4(bb)),
             indexOrZero8x8(aa, sel8x8_3(bb)),
             indexOrZero8x8(aa, sel8x8_2(bb)),
             indexOrZero8x8(aa, sel8x8_1(bb)),
             indexOrZero8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
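
/* Illustrative sketch of the masking described above: a call such as
   h_generic_calc_ShlN16x4(0x0001000100010001ULL, 4) shifts every
   16-bit lane left by 4, giving 0x0010001000100010ULL, and bits
   shifted out of a lane are dropped rather than carried into the
   neighbouring lane. */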

ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
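
/* Worked example (illustrative): GetMSBs8x8 collects the top bit of
   each byte, most significant byte first.  For instance
   h_generic_calc_GetMSBs8x8(0x80008000FF000001ULL) == 0xA8, since
   only bytes 7, 5 and 3 (0x80, 0x80, 0xFF) have their MSB set. */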

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}

/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
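
/* Worked example (illustrative): Sad8Ux4 is a 4-lane sum of absolute
   differences.  For xx = 0x10203040 and yy = 0x0F213F41 the per-byte
   differences are 1, 1, 15 and 1, so the result is 18. */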

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}

/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )
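
/* Illustrative sketch of the macros: GET(x, y) extracts bit y of x as
   0 or 1, e.g. GET(0x2, 1) == 1; PUT(v, y) moves a 0/1 value back to
   bit position y, e.g. PUT(1, 3) == 0x8; NOT is a logical (not
   bitwise) complement of a 0/1 value. */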

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = w;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = x;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
      | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
      | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
      | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
      | PUT(k, 1) | PUT(m, 0);

   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations to calculate each of the DPD
      bits are given in Appendix B of Book 1: Power ISA User Instruction
      Set.  The bits for the DPD number are [abcdefghijkm].  The bits
      for the BCD value are [pqrstuvwxy].  The boolean logic equations
      in pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
      | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
      | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
      | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
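
/* Usage sketch (illustrative): both converters walk five 3-digit
   groups, so a 60-bit BCD value (15 digits, 12 bits per group) packs
   into 50 DPD bits and vice versa.  Each group is translated
   independently by bcd_to_dpb or dpb_to_bcd, and a trivial round trip
   such as h_calc_DPBtoBCD(h_calc_BCDtoDPB(0x0ULL)) yields 0x0ULL. */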

/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */
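
/* Edge-case sketch (illustrative of the semantics implemented below):
   h_calc_udiv32_w_arm_semantics(7, 0) == 0 rather than trapping;
   h_calc_sdiv32_w_arm_semantics(0x80000000, -1) == 0x80000000, since
   the mathematically correct +2^31 is unrepresentable in 32 bits; and
   all other cases round towards zero, e.g.
   h_calc_sdiv32_w_arm_semantics(-7, 2) == -3. */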

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL)
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

/*---------------------------------------------------------------*/
/*--- end                                 host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/