2 * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
30 "Table-driven implementation of the logarithm function in IEEE
31 floating-point arithmetic"
32 ACM Transactions on Mathematical Software (TOMS)
33 Volume 16, Issue 4 (December 1990)
36 x very close to 1.0 is handled differently, for x everywhere else
37 a brief explanation is given below
40 x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
42 x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
44 Y = (2^(-1))*(2^(-m))*(2^m)*A
45 Now, range of Y is: 0.5 <= Y < 1
47 F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
48 Now, range of F is: 128 <= F <= 256
50 Now, range of F is: 0.5 <= F <= 1
52 f = -(Y-F), with (f <= 2^(-9))
54 log(x) = m*log(2) + log(2) + log(F-f)
55 log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
56 log(x) = m*log(2) + log(2*F) + log(1-r)
58 r = (f/F), with (r <= 2^(-8))
59 r = f*(1/F) with (1/F) precomputed to avoid division
61 log(x) = m*log(2) + log(G) - poly
64 poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5))
66 log(2) and log(G) need to be maintained in extra precision
67 to avoid losing precision in the calculations
70 For x close to 1.0, we employ the following technique to
71 ensure faster convergence.
73 log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
80 _CLC_OVERLOAD _CLC_DEF
float
81 #if defined(COMPILING_LOG2)
83 #elif defined(COMPILING_LOG10)
90 #if defined(COMPILING_LOG2)
91 const float LOG2E
= 0x1.715476p
+0f
; // 1.4426950408889634
92 const float LOG2E_HEAD
= 0x1.700000p
+0f
; // 1.4375
93 const float LOG2E_TAIL
= 0x1.547652p
-8f
; // 0.00519504072
94 #elif defined(COMPILING_LOG10)
95 const float LOG10E
= 0x1.bcb7b2p
-2f
; // 0.43429448190325182
96 const float LOG10E_HEAD
= 0x1.bc0000p
-2f
; // 0.43359375
97 const float LOG10E_TAIL
= 0x1.6f62a4p
-11f
; // 0.0007007319
98 const float LOG10_2_HEAD
= 0x1.340000p
-2f
; // 0.30078125
99 const float LOG10_2_TAIL
= 0x1.04d426p
-12f
; // 0.000248745637
101 const float LOG2_HEAD
= 0x1.62e000p
-1f
; // 0.693115234
102 const float LOG2_TAIL
= 0x1.0bfbe8p
-15f
; // 0.0000319461833
105 uint xi
= as_uint(x
);
106 uint ax
= xi
& EXSIGNBIT_SP32
;
108 // Calculations for |x-1| < 2^-4
110 int near1
= fabs(r
) < 0x1.0p
-4f
;
111 float u2
= MATH_DIVIDE(r
, 2.0f
+ r
);
115 float znear1
, z1
, z2
;
117 // 2/(5 * 2^5), 2/(3 * 2^3)
118 z2
= mad(u
, mad(v
, 0x1.99999ap
-7f
, 0x1.555556p
-4f
)*v
, -corr
);
120 #if defined(COMPILING_LOG2)
121 z1
= as_float(as_int(r
) & 0xffff0000);
123 znear1
= mad(z1
, LOG2E_HEAD
, mad(z2
, LOG2E_HEAD
, mad(z1
, LOG2E_TAIL
, z2
*LOG2E_TAIL
)));
124 #elif defined(COMPILING_LOG10)
125 z1
= as_float(as_int(r
) & 0xffff0000);
127 znear1
= mad(z1
, LOG10E_HEAD
, mad(z2
, LOG10E_HEAD
, mad(z1
, LOG10E_TAIL
, z2
*LOG10E_TAIL
)));
132 // Calculations for x not near 1
133 int m
= (int)(xi
>> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
;
135 // Normalize subnormal
136 uint xis
= as_uint(as_float(xi
| 0x3f800000) - 1.0f
);
137 int ms
= (int)(xis
>> EXPSHIFTBITS_SP32
) - 253;
140 uint xin
= c
? xis
: xi
;
143 uint indx
= (xin
& 0x007f0000) + ((xin
& 0x00008000) << 1);
146 float f
= as_float(0x3f000000 | indx
) - as_float(0x3f000000 | (xin
& MANTBITS_SP32
));
149 r
= f
* USE_TABLE(log_inv_tbl
, indx
);
152 float poly
= mad(mad(r
, 0x1.555556p
-2f
, 0.5f
), r
*r
, r
);
154 #if defined(COMPILING_LOG2)
155 float2 tv
= USE_TABLE(log2_tbl
, indx
);
157 z2
= mad(poly
, -LOG2E
, tv
.s1
);
158 #elif defined(COMPILING_LOG10)
159 float2 tv
= USE_TABLE(log10_tbl
, indx
);
160 z1
= mad(mf
, LOG10_2_HEAD
, tv
.s0
);
161 z2
= mad(poly
, -LOG10E
, mf
*LOG10_2_TAIL
) + tv
.s1
;
163 float2 tv
= USE_TABLE(log_tbl
, indx
);
164 z1
= mad(mf
, LOG2_HEAD
, tv
.s0
);
165 z2
= mad(mf
, LOG2_TAIL
, -poly
) + tv
.s1
;
169 z
= near1
? znear1
: z
;
172 z
= ax
>= PINFBITPATT_SP32
? x
: z
;
173 z
= xi
!= ax
? as_float(QNANBITPATT_SP32
) : z
;
174 z
= ax
== 0 ? as_float(NINFBITPATT_SP32
) : z
;
181 _CLC_OVERLOAD _CLC_DEF
double
182 #if defined(COMPILING_LOG2)
184 #elif defined(COMPILING_LOG10)
191 #ifndef COMPILING_LOG2
192 // log2_lead and log2_tail sum to an extra-precise version of ln(2)
193 const double log2_lead
= 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
194 const double log2_tail
= 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
197 #if defined(COMPILING_LOG10)
198 // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
199 const double log10e_lead
= 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
200 const double log10e_tail
= 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
201 #elif defined(COMPILING_LOG2)
202 // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
203 const double log2e_lead
= 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
204 const double log2e_tail
= 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
207 // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
208 // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
209 const double log_thresh1
= 0x1.e0faap
-1;
210 const double log_thresh2
= 0x1.1082cp
+0;
212 bool is_near
= x
>= log_thresh1
&& x
<= log_thresh2
;
216 double u
= r
/ (2.0 + r
);
217 double correction
= r
* u
;
222 const double ca_1
= 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
223 const double ca_2
= 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
224 const double ca_3
= 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
225 const double ca_4
= 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
227 double r2
= fma(u
*v
, fma(v
, fma(v
, fma(v
, ca_4
, ca_3
), ca_2
), ca_1
), -correction
);
229 #if defined(COMPILING_LOG10)
231 r1
= as_double(as_ulong(r1
) & 0xffffffff00000000);
233 double ret_near
= fma(log10e_lead
, r1
, fma(log10e_lead
, r2
, fma(log10e_tail
, r1
, log10e_tail
* r2
)));
234 #elif defined(COMPILING_LOG2)
236 r1
= as_double(as_ulong(r1
) & 0xffffffff00000000);
238 double ret_near
= fma(log2e_lead
, r1
, fma(log2e_lead
, r2
, fma(log2e_tail
, r1
, log2e_tail
*r2
)));
240 double ret_near
= r1
+ r2
;
243 // This is the far from 1 code
245 // Deal with subnormal
246 ulong ux
= as_ulong(x
);
247 ulong uxs
= as_ulong(as_double(0x03d0000000000000UL
| ux
) - 0x1.0p
-962);
248 int c
= ux
< IMPBIT_DP64
;
250 int expadjust
= c
? 60 : 0;
252 int xexp
= ((as_int2(ux
).hi
>> 20) & 0x7ff) - EXPBIAS_DP64
- expadjust
;
253 double f
= as_double(HALFEXPBITS_DP64
| (ux
& MANTBITS_DP64
));
254 int index
= as_int2(ux
).hi
>> 13;
255 index
= ((0x80 | (index
& 0x7e)) >> 1) + (index
& 0x1);
257 double2 tv
= USE_TABLE(ln_tbl
, index
- 64);
261 double f1
= index
* 0x1.0p
-7;
263 u
= f2
/ fma(f2
, 0.5, f1
);
266 const double cb_1
= 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
267 const double cb_2
= 1.24999999978138668903e-02; /* 0x3f89999999865ede */
268 const double cb_3
= 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
270 double poly
= v
* fma(v
, fma(v
, cb_3
, cb_2
), cb_1
);
271 double z2
= q
+ fma(u
, poly
, u
);
273 double dxexp
= (double)xexp
;
274 #if defined (COMPILING_LOG10)
275 // Add xexp * log(2) to z1,z2 to get log(x)
276 r1
= fma(dxexp
, log2_lead
, z1
);
277 r2
= fma(dxexp
, log2_tail
, z2
);
278 double ret_far
= fma(log10e_lead
, r1
, fma(log10e_lead
, r2
, fma(log10e_tail
, r1
, log10e_tail
*r2
)));
279 #elif defined(COMPILING_LOG2)
280 r1
= fma(log2e_lead
, z1
, dxexp
);
281 r2
= fma(log2e_lead
, z2
, fma(log2e_tail
, z1
, log2e_tail
*z2
));
282 double ret_far
= r1
+ r2
;
284 r1
= fma(dxexp
, log2_lead
, z1
);
285 r2
= fma(dxexp
, log2_tail
, z2
);
286 double ret_far
= r1
+ r2
;
289 double ret
= is_near
? ret_near
: ret_far
;
291 ret
= isinf(x
) ? as_double(PINFBITPATT_DP64
) : ret
;
292 ret
= (isnan(x
) | (x
< 0.0)) ? as_double(QNANBITPATT_DP64
) : ret
;
293 ret
= x
== 0.0 ? as_double(NINFBITPATT_DP64
) : ret
;
297 #endif // cl_khr_fp64
301 _CLC_OVERLOAD _CLC_DEF half
302 #if defined(COMPILING_LOG2)
304 return (half
)log2((float)x
);
306 #elif defined(COMPILING_LOG10)
308 return (half
)log10((float)x
);
312 return (half
)log((float)x
);
316 #endif // cl_khr_fp16