//For all types EXCEPT long and ulong, which are implemented separately.
//
// BGENTYPE: type used to hold the full product (the instantiations in this
//           file pass a type twice as wide as GENTYPE).
// GENTYPE:  operand and result type of the generated mul_hi overload.
// GENSIZE:  number of bits to shift the product right by (the instantiations
//           in this file pass the bit width of GENTYPE).
//
// Both operands are converted to BGENTYPE before multiplying so the product
// is not truncated; the shift then moves its upper half down into the result.
#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
    _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
        return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> (GENSIZE)); \
    }
9 //FOIL-based long mul_hi
11 // Summary
: Treat mul_hi
(long x
, long y
) as
:
12 // (a+b
) * (c+d
) where a and c are the high-order parts of x and y respectively
13 // and b and d are the low-order parts of x and y.
14 // Thinking back to algebra
, we use FOIL to do the work.
16 _CLC_OVERLOAD _CLC_DEF long mul_hi
(long x
, long y
){
20 //Move the high
/low halves of x
/y into the lower
32-bits of variables so
21 //that we can multiply them without worrying about overflow.
23 long x_lo
= x
& UINT_MAX
;
25 long y_lo
= y
& UINT_MAX
;
27 //Multiply all of the components according to FOIL method
33 //Now add the components back together in the following steps
:
34 //F
: doesn
't need to be modified
35 //O
/I
: Need to be added together.
36 //L
: Shift right by
32-bits
, then add into the sum of O and I
37 //Once O
/I
/L are summed up
, then shift the sum by
32-bits and add to F.
39 //We use hadd to give us a bit of extra precision for the intermediate sums
40 //but as a result
, we shift by
31 bits instead of
32
41 return
(long)(f + (hadd(o, (i + (long)((ulong)l
>>32))) >> 31));
44 _CLC_OVERLOAD _CLC_DEF ulong mul_hi
(ulong x
, ulong y
){
48 //Move the high
/low halves of x
/y into the lower
32-bits of variables so
49 //that we can multiply them without worrying about overflow.
51 ulong x_lo
= x
& UINT_MAX
;
53 ulong y_lo
= y
& UINT_MAX
;
55 //Multiply all of the components according to FOIL method
61 //Now add the components back together
, taking care to respect the fact that
:
62 //F
: doesn
't need to be modified
63 //O
/I
: Need to be added together.
64 //L
: Shift right by
32-bits
, then add into the sum of O and I
65 //Once O
/I
/L are summed up
, then shift the sum by
32-bits and add to F.
67 //We use hadd to give us a bit of extra precision for the intermediate sums
68 //but as a result
, we shift by
31 bits instead of
32
69 return
(f + (hadd(o, (i + (l>>32))) >> 31));
// Generates the vector mul_hi overloads (widths 2, 3, 4, 8 and 16) for the
// element type GENTYPE.
//
// The 2- and 3-wide versions call the scalar mul_hi on each component.
// The 4-, 8- and 16-wide versions split the operands into .lo/.hi halves and
// call the mul_hi overload for the half-width vector on each half.
#define __CLC_MUL_HI_VEC(GENTYPE) \
    _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
        return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \
    } \
    _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
        return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \
    } \
    _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
        return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
    } \
    _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
        return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
    } \
    _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
        return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
    }
// Convenience wrapper: emits the scalar mul_hi for ELEM_T (via
// __CLC_MUL_HI_IMPL, using WIDE_T for the intermediate product and NBITS as
// the shift amount) followed by all of its vector overloads (via
// __CLC_MUL_HI_VEC).
#define __CLC_MUL_HI_DEC_IMPL(WIDE_T, ELEM_T, NBITS) \
    __CLC_MUL_HI_IMPL(WIDE_T, ELEM_T, NBITS) \
    __CLC_MUL_HI_VEC(ELEM_T)
// Expands to every mul_hi overload provided by this file that is generated
// by macro: scalar + vector versions for the 8-, 16- and 32-bit element
// types, and vector-only versions for long/ulong (their scalar versions are
// written out by hand).
#define __CLC_MUL_HI_TYPES() \
    /* 8-bit elements, product held in 16 bits */ \
    __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
    __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
    /* 16-bit elements, product held in 32 bits */ \
    __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
    __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
    /* 32-bit elements, product held in 64 bits */ \
    __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
    __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
    /* 64-bit elements: vector overloads only */ \
    __CLC_MUL_HI_VEC(long) \
    __CLC_MUL_HI_VEC(ulong)
// Drop the helper macros so they do not leak past this file.
// NOTE(review): the __CLC_MUL_HI_TYPES() expansion is not visible in this
// view; it has to appear before these #undef lines for any of the
// macro-generated overloads to be emitted - confirm against the full file.
#undef __CLC_MUL_HI_IMPL
#undef __CLC_MUL_HI_VEC
#undef __CLC_MUL_HI_DEC_IMPL
#undef __CLC_MUL_HI_TYPES