2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
25 #include
<clc
/integer
/clc_abs.h
>
26 #include
<clc
/relational
/clc_isinf.h
>
27 #include
<clc
/relational
/clc_isnan.h
>
28 #include
<clc
/shared
/clc_max.h
>
39 _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma
(float a
, float b
, float c
) {
41 if
(__clc_isnan(a) || __clc_isnan
(b) || __clc_isnan
(c) || __clc_isinf
(a) ||
45 /* If only c is inf
, and both a
,b are regular numbers
, the result is c
*/
49 a
= __clc_flush_denormal_if_not_supported
(a);
50 b
= __clc_flush_denormal_if_not_supported
(b);
51 c
= __clc_flush_denormal_if_not_supported
(c);
56 struct fp st_a
, st_b
, st_c
;
58 st_a.exponent
= a
== .0f ?
0 : ((as_uint(a) & 0x7f800000) >> 23) -
127;
59 st_b.exponent
= b
== .0f ?
0 : ((as_uint(b) & 0x7f800000) >> 23) -
127;
60 st_c.exponent
= c
== .0f ?
0 : ((as_uint(c) & 0x7f800000) >> 23) -
127;
62 st_a.mantissa
= a
== .0f ?
0 : (as_uint(a) & 0x7fffff) |
0x800000;
63 st_b.mantissa
= b
== .0f ?
0 : (as_uint(b) & 0x7fffff) |
0x800000;
64 st_c.mantissa
= c
== .0f ?
0 : (as_uint(c) & 0x7fffff) |
0x800000;
66 st_a.sign
= as_uint
(a) & 0x80000000;
67 st_b.sign
= as_uint
(b) & 0x80000000;
68 st_c.sign
= as_uint
(c) & 0x80000000;
71 // Move the product to the highest bits to maximize precision
72 // mantissa is
24 bits
=> product is
48 bits
, 2bits non-fraction.
73 // Add one bit for future addition overflow
,
74 // add another bit to detect subtraction underflow
76 st_mul.sign
= st_a.sign ^ st_b.sign
;
77 st_mul.mantissa
= (st_a.mantissa
* st_b.mantissa
) << 14ul;
78 st_mul.exponent
= st_mul.mantissa ? st_a.exponent
+ st_b.exponent
: 0;
80 // FIXME
: Detecting a
== 0 || b
== 0 above crashed GCN isel
81 if
(st_mul.exponent
== 0 && st_mul.mantissa
== 0)
84 // Mantissa is
23 fractional bits
, shift it the same way as product mantissa
87 // both exponents are bias adjusted
88 int exp_diff
= st_mul.exponent - st_c.exponent
;
90 st_c.mantissa
<<= C_ADJUST
;
91 ulong cutoff_bits
= 0;
92 ulong cutoff_mask
= (1ul << __clc_abs
(exp_diff)) -
1ul;
95 exp_diff
>= 64 ? st_c.mantissa
: (st_c.mantissa
& cutoff_mask
);
96 st_c.mantissa
= exp_diff
>= 64 ?
0 : (st_c.mantissa
>> exp_diff
);
99 -exp_diff
>= 64 ? st_mul.mantissa
: (st_mul.mantissa
& cutoff_mask
);
100 st_mul.mantissa
= -exp_diff
>= 64 ?
0 : (st_mul.mantissa
>> -exp_diff
);
104 st_fma.sign
= st_mul.sign
;
105 st_fma.exponent
= __clc_max
(st_mul.exponent
, st_c.exponent
);
106 if
(st_c.sign
== st_mul.sign
) {
107 st_fma.mantissa
= st_mul.mantissa
+ st_c.mantissa
;
109 // cutoff bits borrow one
111 st_mul.mantissa - st_c.mantissa -
112 (cutoff_bits && (st_mul.exponent
> st_c.exponent
) ?
1 : 0);
115 // underflow
: st_c.sign
!= st_mul.sign
, and magnitude switches the sign
116 if
(st_fma.mantissa
> LONG_MAX
) {
117 st_fma.mantissa
= 0 - st_fma.mantissa
;
118 st_fma.sign
= st_mul.sign ^
0x80000000;
121 // detect overflow
/underflow
122 int overflow_bits
= 3 - clz
(st_fma.mantissa
);
125 st_fma.exponent
+= overflow_bits
;
128 if
(overflow_bits < 0) {
129 st_fma.mantissa
<<= -overflow_bits
;
134 ulong trunc_mask
= (1ul << (C_ADJUST + overflow_bits
)) -
1;
135 ulong trunc_bits
= (st_fma.mantissa
& trunc_mask
) |
(cutoff_bits != 0);
136 ulong last_bit
= st_fma.mantissa
& (1ul << (C_ADJUST + overflow_bits
));
137 ulong grs_bits
= (0x4ul << (C_ADJUST -
3 + overflow_bits
));
139 // round to nearest even
140 if
((trunc_bits > grs_bits
) ||
(trunc_bits == grs_bits
&& last_bit
!= 0))
141 st_fma.mantissa
+= (1ul << (C_ADJUST + overflow_bits
));
143 // Shift mantissa back to bit
23
144 st_fma.mantissa
= (st_fma.mantissa
>> (C_ADJUST + overflow_bits
));
146 // Detect rounding overflow
147 if
(st_fma.mantissa
> 0xffffff) {
149 st_fma.mantissa
>>= 1;
152 if
(st_fma.mantissa
== 0)
155 // Flating point range limit
156 if
(st_fma.exponent
> 127)
157 return as_float
(as_uint(INFINITY) | st_fma.sign
);
160 if
(st_fma.exponent
<= -
127)
161 return as_float
(st_fma.sign
);
163 return as_float
(st_fma.sign |
((st_fma.exponent
+ 127) << 23) |
164 ((uint)st_fma.mantissa
& 0x7fffff));
166 _CLC_TERNARY_VECTORIZE
(_CLC_DEF _CLC_OVERLOAD
, float
, __clc_sw_fma
, float
,