2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
25 #include
<clc
/math
/clc_fabs.h
>
32 compute pow using log and exp
35 we take care not to lose precision in the intermediate steps
37 When computing log
, calculate it in splits
,
39 r
= f
* (p_invead + p_inv_tail
)
42 calculate log polynomial using r
, in end addition
, do
43 poly
= poly
+ ((rh-r) + rt
)
46 ltt
= ((xexp * log2_t
) - poly
) + logT
49 lh
= (xexp * log2_h
) + logH
52 Calculate final log answer as gh and gt
,
53 gh
= l
& higher-half bits
54 gt
= (((ltt -
(lt - lth
)) + ((lh - l
) + lt
)) + (l - gh
))
56 yh
= y
& higher-half bits
59 Before entering computation of exp
,
60 vs
= ((yt*gt
+ yt
*gh
) + yh
*gt
)
62 vt
= ((yh*gh - v
) + vs
)
64 In calculation of exp
, add vt to r that is used for poly
66 ((((expT * poly
) + expT
) + expH
*poly
) + expH
)
69 _CLC_DEF _CLC_OVERLOAD float __clc_pow
(float x
, float y
)
73 int ax
= ix
& EXSIGNBIT_SP32
;
77 int ay
= iy
& EXSIGNBIT_SP32
;
80 /* Extra precise log calculation
81 * First handle case that x is close to
1
83 float r
= 1.0f - as_float
(ax);
84 int near1
= __clc_fabs
(r) < 0x1.0p-4f
;
87 /* Coefficients are just
1/3, 1/4, 1/5 and
1/6 */
91 mad
(r, 0x1.24924ap-3f
, 0x1.555556p-3f
),
98 float lth_near1
= -r2
* 0.5f
;
99 float ltt_near1
= -poly
;
100 float lt_near1
= lth_near1
+ ltt_near1
;
102 float l_near1
= lh_near1
+ lt_near1
;
104 /* Computations for x not near
1 */
105 int m
= (int)(ax >> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
;
107 int ixs
= as_int
(as_float(ax |
0x3f800000) -
1.0f
);
108 float mfs
= (float)((ixs >> EXPSHIFTBITS_SP32
) -
253);
110 int ixn
= c ? ixs
: ax
;
111 float mfn
= c ? mfs
: mf
;
113 int indx
= (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
116 float f
= as_float
(0x3f000000 | indx
) - as_float
(0x3f000000 |
(ixn & MANTBITS_SP32
));
119 float2 tv
= USE_TABLE
(log_inv_tbl_ep, indx
);
120 float rh
= f
* tv.s0
;
121 float rt
= f
* tv.s1
;
124 poly
= mad
(r, mad
(r, 0x1.0p-2f
, 0x1.555556p-2f
), 0x1.0p-1f
) * (r*r
);
125 poly
+= (rh - r
) + rt
;
127 const float LOG2_HEAD
= 0x1.62e000p-1f
; /* 0.693115234 */
128 const float LOG2_TAIL
= 0x1.0bfbe8p-15f
; /* 0.0000319461833 */
129 tv
= USE_TABLE
(loge_tbl, indx
);
131 float ltt
= mad
(mfn, LOG2_TAIL
, -poly
) + tv.s1
;
132 float lt
= lth
+ ltt
;
133 float lh
= mad
(mfn, LOG2_HEAD
, tv.s0
);
136 /* Select near
1 or not
*/
137 lth
= near1 ? lth_near1
: lth
;
138 ltt
= near1 ? ltt_near1
: ltt
;
139 lt
= near1 ? lt_near1
: lt
;
140 lh
= near1 ? lh_near1
: lh
;
141 l
= near1 ? l_near1
: l
;
143 float gh
= as_float
(as_int(l) & 0xfffff000);
144 float gt
= ((ltt -
(lt - lth
)) + ((lh - l
) + lt
)) + (l - gh
);
146 float yh
= as_float
(iy & 0xfffff000);
150 float ylogx_s
= mad
(gt, yh
, mad
(gh, yt
, yt
*gt
));
151 float ylogx
= mad
(yh, gh
, ylogx_s
);
152 float ylogx_t
= mad
(yh, gh
, -ylogx
) + ylogx_s
;
154 /* Extra precise exp of ylogx
*/
155 const float R_64_BY_LOG2
= 0x1.715476p
+6f
; /* 64/log2 : 92.332482616893657 */
156 int n
= convert_int
(ylogx * R_64_BY_LOG2
);
157 float nf
= (float) n
;
161 int m2
= m
<< EXPSHIFTBITS_SP32
;
163 const float R_LOG2_BY_64_LD
= 0x1.620000p-7f
; /* log2/64 lead: 0.0108032227 */
164 const float R_LOG2_BY_64_TL
= 0x1.c85fdep-16f
; /* log2/64 tail: 0.0000272020388 */
165 r
= mad
(nf, -R_LOG2_BY_64_TL
, mad
(nf, -R_LOG2_BY_64_LD
, ylogx
)) + ylogx_t
;
167 /* Truncated Taylor series for e^r
*/
168 poly
= mad
(mad(mad(r, 0x1.555556p-5f
, 0x1.555556p-3f
), r
, 0x1.000000p-1f
), r
*r
, r
);
170 tv
= USE_TABLE
(exp_tbl_ep, j
);
172 float expylogx
= mad
(tv.s0
, poly
, mad
(tv.s1
, poly
, tv.s1
)) + tv.s0
;
173 float sexpylogx
= expylogx
* as_float
(0x1 << (m + 149));
174 float texpylogx
= as_float
(as_int(expylogx) + m2
);
175 expylogx
= m
< -
125 ? sexpylogx
: texpylogx
;
177 /* Result is
+-Inf if
(ylogx + ylogx_t
) > 128*log2
*/
178 expylogx
= (ylogx > 0x1.62e430p
+6f
) |
(ylogx == 0x1.62e430p
+6f
& ylogx_t
> -
0x1.05c610p-22f
) ? as_float
(PINFBITPATT_SP32) : expylogx
;
180 /* Result is
0 if ylogx
< -
149*log2
*/
181 expylogx
= ylogx
< -
0x1.9d1da0p
+6f ?
0.0f
: expylogx
;
184 * inty
= 0 means not an integer.
185 * inty
= 1 means odd integer.
186 * inty
= 2 means even integer.
189 int yexp
= (int)(ay >> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
+ 1;
190 int mask
= (1 << (24 - yexp
)) -
1;
191 int yodd
= ((iy >> (24 - yexp
)) & 0x1) != 0;
192 int inty
= yodd ?
1 : 2;
193 inty
= (iy & mask
) != 0 ?
0 : inty
;
194 inty
= yexp
< 1 ?
0 : inty
;
195 inty
= yexp
> 24 ?
2 : inty
;
197 float signval
= as_float
((as_uint(expylogx) ^ SIGNBIT_SP32
));
198 expylogx
= ((inty == 1) & !xpos
) ? signval
: expylogx
;
199 int ret
= as_int
(expylogx);
201 /* Corner case handling
*/
202 ret
= (!xpos
& (inty == 0)) ? QNANBITPATT_SP32
: ret
;
203 ret
= ax
< 0x3f800000 & iy
== NINFBITPATT_SP32 ? PINFBITPATT_SP32
: ret
;
204 ret
= ax
> 0x3f800000 & iy
== NINFBITPATT_SP32 ?
0 : ret
;
205 ret
= ax
< 0x3f800000 & iy
== PINFBITPATT_SP32 ?
0 : ret
;
206 ret
= ax
> 0x3f800000 & iy
== PINFBITPATT_SP32 ? PINFBITPATT_SP32
: ret
;
207 int xinf
= xpos ? PINFBITPATT_SP32
: NINFBITPATT_SP32
;
208 ret
= ((ax == 0) & !ypos
& (inty == 1)) ? xinf
: ret
;
209 ret
= ((ax == 0) & !ypos
& (inty != 1)) ? PINFBITPATT_SP32
: ret
;
210 int xzero
= xpos ?
0 : 0x80000000;
211 ret
= ((ax == 0) & ypos
& (inty == 1)) ? xzero
: ret
;
212 ret
= ((ax == 0) & ypos
& (inty != 1)) ?
0 : ret
;
213 ret
= ((ax == 0) & (iy == NINFBITPATT_SP32
)) ? PINFBITPATT_SP32
: ret
;
214 ret
= ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32
)) ?
0x3f800000 : ret
;
215 ret
= ((ix == NINFBITPATT_SP32
) & !ypos
& (inty == 1)) ?
0x80000000 : ret
;
216 ret
= ((ix == NINFBITPATT_SP32
) & !ypos
& (inty != 1)) ?
0 : ret
;
217 ret
= ((ix == NINFBITPATT_SP32
) & ypos
& (inty == 1)) ? NINFBITPATT_SP32
: ret
;
218 ret
= ((ix == NINFBITPATT_SP32
) & ypos
& (inty != 1)) ? PINFBITPATT_SP32
: ret
;
219 ret
= ((ix == PINFBITPATT_SP32
) & !ypos
) ?
0 : ret
;
220 ret
= ((ix == PINFBITPATT_SP32
) & ypos
) ? PINFBITPATT_SP32
: ret
;
221 ret
= (ax > PINFBITPATT_SP32
) ? ix
: ret
;
222 ret
= (ay > PINFBITPATT_SP32
) ? iy
: ret
;
223 ret
= ay
== 0 ?
0x3f800000 : ret
;
224 ret
= ix
== 0x3f800000 ?
0x3f800000 : ret
;
226 return as_float
(ret);
228 _CLC_BINARY_VECTORIZE
(_CLC_DEF _CLC_OVERLOAD
, float
, __clc_pow
, float
, float
)
231 _CLC_DEF _CLC_OVERLOAD double __clc_pow
(double x
, double y
)
233 const double real_log2_tail
= 5.76999904754328540596e-08;
234 const double real_log2_lead
= 6.93147122859954833984e-01;
236 long ux
= as_long
(x);
237 long ax
= ux
& (~SIGNBIT_DP64
);
240 long uy
= as_long
(y);
241 long ay
= uy
& (~SIGNBIT_DP64
);
244 // Extended precision log
247 int exp
= (int)(ax >> 52) -
1023;
248 int mask_exp_1023
= exp
== -
1023;
249 double xexp
= (double) exp
;
250 long mantissa
= ax
& 0x000FFFFFFFFFFFFFL
;
252 long temp_ux
= as_long
(as_double(0x3ff0000000000000L | mantissa
) -
1.0);
253 exp
= ((temp_ux & 0x7FF0000000000000L
) >> 52) -
2045;
254 double xexp1
= (double) exp
;
255 long mantissa1
= temp_ux
& 0x000FFFFFFFFFFFFFL
;
257 xexp
= mask_exp_1023 ? xexp1
: xexp
;
258 mantissa
= mask_exp_1023 ? mantissa1
: mantissa
;
260 long rax
= (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
261 int index
= rax
>> 44;
263 double F
= as_double
(rax |
0x3FE0000000000000L
);
264 double Y
= as_double
(mantissa |
0x3FE0000000000000L
);
266 double2 tv
= USE_TABLE
(log_f_inv_tbl, index
);
267 double log_h
= tv.s0
;
268 double log_t
= tv.s1
;
269 double f_inv
= (log_h + log_t
) * f
;
270 double r1
= as_double
(as_long(f_inv) & 0xfffffffff8000000L
);
271 double r2
= fma
(-F, r1
, f
) * (log_h + log_t
);
277 fma
(r, 1.0/7.0, 1.0/6.0),
281 poly
= poly
* r
* r
* r
;
283 double hr1r1
= 0.5*r1
*r1
;
284 double poly0h
= r1
+ hr1r1
;
285 double poly0t
= r1 - poly0h
+ hr1r1
;
286 poly
= fma
(r1, r2
, fma
(0.5
*r2
, r2
, poly
)) + r2
+ poly0t
;
288 tv
= USE_TABLE
(powlog_tbl, index
);
292 double resT_t
= fma
(xexp, real_log2_tail
, + log_t
) - poly
;
293 double resT
= resT_t - poly0h
;
294 double resH
= fma
(xexp, real_log2_lead
, log_h
);
295 double resT_h
= poly0h
;
297 double H
= resT
+ resH
;
298 double H_h
= as_double
(as_long(H) & 0xfffffffff8000000L
);
299 double T
= (resH - H
+ resT
) + (resT_t -
(resT + resT_h
)) + (H - H_h
);
302 double y_head
= as_double
(uy & 0xfffffffff8000000L
);
303 double y_tail
= y - y_head
;
305 double temp
= fma
(y_tail, H
, fma
(y_head, T
, y_tail
*T
));
306 v
= fma
(y_head, H
, temp
);
307 vt
= fma
(y_head, H
, -v
) + temp
;
310 // Now calculate exp of
(v,vt
)
314 const double max_exp_arg
= 709.782712893384;
315 const double min_exp_arg
= -
745.1332191019411;
316 const double sixtyfour_by_lnof2
= 92.33248261689366;
317 const double lnof2_by_64_head
= 0.010830424260348081;
318 const double lnof2_by_64_tail
= -
4.359010638708991e-10;
320 double temp
= v
* sixtyfour_by_lnof2
;
322 double dn
= (double)n
;
323 int j
= n
& 0x0000003f;
326 double2 tv
= USE_TABLE
(two_to_jby64_ep_tbl, j
);
331 double r1
= fma
(dn, -lnof2_by_64_head
, v
);
332 double r2
= dn
* lnof2_by_64_tail
;
333 double r
= (r1 + r2
) + vt
;
338 fma
(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
339 4.16666666662260795726e-02),
340 1.66666666665260878863e-01),
341 5.00000000000000008883e-01);
344 expv
= fma
(f, q
, f2
) + f1
;
345 expv
= ldexp
(expv, m
);
347 expv
= v
> max_exp_arg ? as_double
(0x7FF0000000000000L) : expv
;
348 expv
= v
< min_exp_arg ?
0.0 : expv
;
351 // See whether y is an integer.
352 // inty
= 0 means not an integer.
353 // inty
= 1 means odd integer.
354 // inty
= 2 means even integer.
358 int yexp
= (int)(ay >> EXPSHIFTBITS_DP64
) - EXPBIAS_DP64
+ 1;
359 inty
= yexp
< 1 ?
0 : 2;
360 inty
= yexp
> 53 ?
2 : inty
;
361 long mask
= (1L << (53 - yexp
)) -
1L;
362 int inty1
= (((ay & ~mask
) >> (53 - yexp
)) & 1L) == 1L ?
1 : 2;
363 inty1
= (ay & mask
) != 0 ?
0 : inty1
;
364 inty
= !(yexp < 1) & !(yexp > 53) ? inty1
: inty
;
367 expv
*= (inty == 1) & !xpos ? -
1.0 : 1.0;
369 long ret
= as_long
(expv);
371 // Now all the edge cases
372 ret
= !xpos
& (inty == 0) ? QNANBITPATT_DP64
: ret
;
373 ret
= ax
< 0x3ff0000000000000L
& uy
== NINFBITPATT_DP64 ? PINFBITPATT_DP64
: ret
;
374 ret
= ax
> 0x3ff0000000000000L
& uy
== NINFBITPATT_DP64 ?
0L : ret
;
375 ret
= ax
< 0x3ff0000000000000L
& uy
== PINFBITPATT_DP64 ?
0L : ret
;
376 ret
= ax
> 0x3ff0000000000000L
& uy
== PINFBITPATT_DP64 ? PINFBITPATT_DP64
: ret
;
377 long xinf
= xpos ? PINFBITPATT_DP64
: NINFBITPATT_DP64
;
378 ret
= ((ax == 0L) & !ypos
& (inty == 1)) ? xinf
: ret
;
379 ret
= ((ax == 0L) & !ypos
& (inty != 1)) ? PINFBITPATT_DP64
: ret
;
380 long xzero
= xpos ?
0L : 0x8000000000000000L
;
381 ret
= ((ax == 0L) & ypos
& (inty == 1)) ? xzero
: ret
;
382 ret
= ((ax == 0L) & ypos
& (inty != 1)) ?
0L : ret
;
383 ret
= ((ax == 0L) & (uy == NINFBITPATT_DP64
)) ? PINFBITPATT_DP64
: ret
;
384 ret
= ((ux == 0xbff0000000000000L
) & (ay == PINFBITPATT_DP64
)) ?
0x3ff0000000000000L
: ret
;
385 ret
= ((ux == NINFBITPATT_DP64
) & !ypos
& (inty == 1)) ?
0x8000000000000000L
: ret
;
386 ret
= ((ux == NINFBITPATT_DP64
) & !ypos
& (inty != 1)) ?
0L : ret
;
387 ret
= ((ux == NINFBITPATT_DP64
) & ypos
& (inty == 1)) ? NINFBITPATT_DP64
: ret
;
388 ret
= ((ux == NINFBITPATT_DP64
) & ypos
& (inty != 1)) ? PINFBITPATT_DP64
: ret
;
389 ret
= (ux == PINFBITPATT_DP64
) & !ypos ?
0L : ret
;
390 ret
= (ux == PINFBITPATT_DP64
) & ypos ? PINFBITPATT_DP64
: ret
;
391 ret
= ax
> PINFBITPATT_DP64 ? ux
: ret
;
392 ret
= ay
> PINFBITPATT_DP64 ? uy
: ret
;
393 ret
= ay
== 0L ?
0x3ff0000000000000L
: ret
;
394 ret
= ux
== 0x3ff0000000000000L ?
0x3ff0000000000000L
: ret
;
396 return as_double
(ret);
398 _CLC_BINARY_VECTORIZE
(_CLC_DEF _CLC_OVERLOAD
, double
, __clc_pow
, double
, double
)