2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
29 _CLC_OVERLOAD _CLC_DEF float log1p
(float x
)
33 uint ax
= ux
& EXSIGNBIT_SP32
;
36 float u2
= MATH_DIVIDE
(x, 2.0f
+ x
);
39 // 2/(5 * 2^
5), 2/(3 * 2^
3)
40 float zsmall
= mad
(-u2, x
, mad
(v, 0x1.99999ap-7f
, 0x1.555556p-4f
) * v
* u
) + x
;
43 ux
= as_uint
(x + 1.0f
);
45 int m
= (int)((ux >> EXPSHIFTBITS_SP32
) & 0xff) - EXPBIAS_SP32
;
47 uint indx
= (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
48 float F
= as_float
(indx |
0x3f000000);
51 float fg24
= F - as_float
(0x3f000000 |
(ux & MANTBITS_SP32
));
54 uint xhi
= ux
& 0xffff8000;
55 float xh
= as_float
(xhi);
56 float xt
= (1.0f - xh
) + w
;
57 uint xnm
= ((~
(xhi & 0x7f800000)) -
0x00800000) & 0x7f800000;
58 xt
= xt
* as_float
(xnm) * 0.5f
;
59 float fl24
= F - as_float
(0x3f000000 |
(xhi & MANTBITS_SP32
)) - xt
;
61 float f
= mf
> 24.0f ? fg24
: fl24
;
64 float r
= f
* USE_TABLE
(log_inv_tbl, indx
);
67 float poly
= mad
(mad(r, 0x1.555556p-2f
, 0x1.0p-1f
), r
*r
, r
);
69 const float LOG2_HEAD
= 0x1.62e000p-1f
; // 0.693115234
70 const float LOG2_TAIL
= 0x1.0bfbe8p-15f
; // 0.0000319461833
72 float2 tv
= USE_TABLE
(loge_tbl, indx
);
73 float z1
= mad
(mf, LOG2_HEAD
, tv.s0
);
74 float z2
= mad
(mf, LOG2_TAIL
, -poly
) + tv.s1
;
77 z
= ax
< 0x3d800000U ? zsmall
: z
;
82 z
= ax
>= PINFBITPATT_SP32 ? w
: z
;
83 z
= w
< -
1.0f ? as_float
(QNANBITPATT_SP32) : z
;
84 z
= w
== -
1.0f ? as_float
(NINFBITPATT_SP32) : z
;
86 z
= ax
< 0x33800000 ? x
: z
;
91 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, log1p
, float
);
95 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
97 _CLC_OVERLOAD _CLC_DEF double log1p
(double x
)
99 // Computes natural log
(1+x
). Algorithm based on
:
100 // Ping-Tak Peter Tang
101 // "Table-driven implementation of the logarithm function in IEEE
102 // floating-point arithmetic"
103 // ACM Transactions on Mathematical Software
(TOMS)
104 // Volume
16, Issue
4 (December 1990)
105 // Note that we use a lookup table of size
64 rather than
128,
106 // and compensate by having extra terms in the minimax polynomial
107 // for the kernel approximation.
109 // Process Inside the threshold now
110 ulong ux
= as_ulong
(1.0
+ x
);
111 int xexp
= ((as_int2(ux).hi
>> 20) & 0x7ff) - EXPBIAS_DP64
;
112 double f
= as_double
(ONEEXPBITS_DP64 |
(ux & MANTBITS_DP64
));
114 int j
= as_int2
(ux).hi
>> 13;
115 j
= ((0x80 |
(j & 0x7e)) >> 1) + (j & 0x1);
116 double f1
= (double)j
* 0x1.0p-6
;
119 double f2temp
= f - f1
;
120 double m2
= as_double
(convert_ulong(0x3ff - xexp
) << EXPSHIFTBITS_DP64
);
121 double f2l
= fma
(m2, x
, m2 - f1
);
122 double f2g
= fma
(m2, x
, -f1
) + m2
;
123 double f2
= xexp
<= MANTLENGTH_DP64-1 ? f2l
: f2g
;
124 f2
= (xexp <= -
2) |
(xexp >= MANTLENGTH_DP64
+8) ? f2temp
: f2
;
126 double2 tv
= USE_TABLE
(ln_tbl, j
);
130 double u
= MATH_DIVIDE
(f2, fma
(0.5
, f2
, f1
));
133 double poly
= v
* fma
(v,
134 fma
(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
135 8.33333333333333593622e-02);
137 // log2_lead and log2_tail sum to an extra-precise version of log
(2)
138 const double log2_lead
= 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
139 const double log2_tail
= 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
141 double z2
= q
+ fma
(u, poly
, u
);
142 double dxexp
= (double)xexp
;
143 double r1
= fma
(dxexp, log2_lead
, z1
);
144 double r2
= fma
(dxexp, log2_tail
, z2
);
145 double result1
= r1
+ r2
;
147 // Process Outside the threshold now
150 double correction
= r
* u
;
157 fma
(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
158 1.25000000037717509602e-02),
159 8.33333333333317923934e-02);
161 r2
= fma
(u*v
, poly
, -correction
);
163 // The values exp
(-1/16)-
1 and exp
(1/16)-
1
164 const double log1p_thresh1
= -
0x1.f0540438fd5c3p-5
;
165 const double log1p_thresh2
= 0x1.082b577d34ed8p-4
;
166 double result2
= r1
+ r2
;
167 result2
= x
< log1p_thresh1 | x
> log1p_thresh2 ? result1
: result2
;
169 result2
= isinf
(x) ? x
: result2
;
170 result2
= x
< -
1.0 ? as_double
(QNANBITPATT_DP64) : result2
;
171 result2
= x
== -
1.0 ? as_double
(NINFBITPATT_DP64) : result2
;
175 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, log1p
, double
);
177 #endif
// cl_khr_fp64
179 _CLC_DEFINE_UNARY_BUILTIN_FP16
(log1p)