/*---------------------------------------------------------------*/
/*--- begin                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"

/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

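/* A minimal illustrative sketch, not part of the original helpers:
   packing lanes with mk8x8 and reading them back with the sel8x8_N
   and index8x8 selectors.  Lane 0 is the least significant byte.
   The name example_8x8_roundtrip is hypothetical and exists only
   for illustration. */
static inline Bool example_8x8_roundtrip ( void )
{
   ULong v = mk8x8( 7, 6, 5, 4, 3, 2, 1, 0 );
   /* sel8x8_7 reads the most significant lane; index8x8 reads any
      lane by number. */
   return (sel8x8_7(v) == 7 && sel8x8_0(v) == 0 && index8x8(v, 3) == 3)
          ? True : False;
}
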
/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

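/* A contrast sketch (illustrative only): saturating vs truncating
   narrowing of the same value.  The name example_narrowing is
   hypothetical. */
static inline Bool example_narrowing ( void )
{
   /* 0x00012345 exceeds the signed 16-bit range, so the saturating
      variant clamps to 0x7FFF ... */
   Bool sat   = qnarrow32Sto16S(0x00012345) == 0x7FFF ? True : False;
   /* ... while the truncating variant just keeps the low 16 bits. */
   Bool trunc = narrow32to16(0x00012345) == 0x2345 ? True : False;
   return (sat && trunc) ? True : False;
}
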
/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return toUChar(r);
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return toUShort(r);
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return toUShort(r);
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return toUShort(r);
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return toUChar(r);
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return toUChar(r);
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

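/* A worked sketch (illustrative only): halving ops widen to 32 bits
   before adding, so even 0xFFFF + 0xFFFF cannot wrap before the
   shift: (0xFFFF + 0xFFFF) >> 1 == 0xFFFF.  The name example_hadd16U
   is hypothetical. */
static inline Bool example_hadd16U ( void )
{
   return hadd16U(0xFFFF, 0xFFFF) == 0xFFFF ? True : False;
}
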
static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

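/* An illustrative sketch (not part of the original file): with the
   identity selector 0x0706050403020100, result lane i picks source
   lane i, so Perm8x8 returns aa unchanged.  The name
   example_perm_identity is hypothetical. */
static inline Bool example_perm_identity ( ULong aa )
{
   return h_generic_calc_Perm8x8(aa, 0x0706050403020100ULL) == aa
          ? True : False;
}
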
/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if we
   are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

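/* A worked sketch (illustrative only): shifting each 16-bit lane of
   0x0001000100010001 left by 4 gives 0x0010001000100010.  The name
   example_shl16x4 is hypothetical. */
static inline Bool example_shl16x4 ( void )
{
   return h_generic_calc_ShlN16x4(0x0001000100010001ULL, 4)
          == 0x0010001000100010ULL ? True : False;
}
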
ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}

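/* A worked sketch (illustrative only): bytes 7, 5 and 2 of the
   argument have their MSBs set, so bits 7, 5 and 2 of the result are
   set, giving 0xA4.  The name example_getmsbs is hypothetical. */
static inline Bool example_getmsbs ( void )
{
   return h_generic_calc_GetMSBs8x8(0x8000800000800000ULL) == 0xA4
          ? True : False;
}
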
/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}

/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}

/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x )<< ( y ) )

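/* A worked sketch (illustrative only): GET extracts bit y of x as a
   0/1 value and PUT places a 0/1 value at bit position y.  The name
   example_get_put is hypothetical. */
static inline Bool example_get_put ( void )
{
   /* 0x5 is binary 101: bit 2 is set, bit 1 is clear; PUT(1,3) == 8. */
   return (GET(0x5, 2) == 1 && GET(0x5, 1) == 0 && PUT(1, 3) == 8)
          ? True : False;
}
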
static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);

   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Binary
      (DPD) value.  The boolean equations to calculate the value of
      each of the DPD bits are given in Appendix B of Book 1: Power
      ISA User Instruction Set.  The bits for the DPD number are
      [abcdefghijkm].  The bits for the BCD value are [pqrstuvwxy].
      The boolean logic equations in pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

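/* A worked sketch (illustrative only, and dependent on the equations
   above): the three BCD digits 1,2,3 pack into the 10-bit DPD value
   0x0A3, and the two conversions invert each other on that value:
   dpb_to_bcd(0x0A3) == 0x123 and bcd_to_dpb(0x123) == 0x0A3.  The
   name example_dpd_roundtrip is hypothetical. */
static inline Bool example_dpd_roundtrip ( void )
{
   return (dpb_to_bcd(0x0A3) == 0x123 && bcd_to_dpb(0x123) == 0x0A3)
          ? True : False;
}
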
ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}

/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

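/* A worked sketch (illustrative only): the one case a hardware-style
   signed divide cannot represent, INT_MIN / -1, yields INT_MIN here
   rather than trapping.  The name example_sdiv32_minint is
   hypothetical. */
static inline Bool example_sdiv32_minint ( void )
{
   Int minint = (Int)(UInt)0x80000000;
   return h_calc_sdiv32_w_arm_semantics(minint, -1) == minint
          ? True : False;
}
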
Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL)
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

/*---------------------------------------------------------------*/
/*--- end                                 host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/