2 * Copyright
(c) 2014,2015 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27 #include
"../clcmacro.h"
29 _CLC_OVERLOAD _CLC_DEF float asinh
(float x
) {
31 uint ax
= ux
& EXSIGNBIT_SP32
;
39 mad
(t, -
1.177198915954942694e-4f
, -
4.162727710583425360e-2f
),
40 -
5.063201055468483248e-1f
),
41 -
1.480204186473758321f
),
42 -
1.152965835871758072f
);
46 mad
(t, 6.284381367285534560e-2f
, 1.260024978680227945f
),
47 6.582362487198468066f
),
48 11.99423176003939087f
),
49 6.917795026025976739f
);
51 float q
= MATH_DIVIDE
(a, b
);
52 float z1
= mad
(x*t
, q
, x
);
56 // Arguments greater than
1/sqrt
(epsilon) in magnitude are
57 // approximated by asinh
(x) = ln
(2) + ln
(abs(x)), with sign of x
58 // Arguments such that
4.0 <= abs
(x) <= 1/sqrt
(epsilon) are
59 // approximated by asinhf
(x) = ln
(abs(x) + sqrt
(x*x
+1))
60 // with the sign of x
(see Abramowitz and Stegun
4.6.20)
62 float absx
= as_float
(ax);
63 int hi
= ax
> 0x46000000U
;
64 float y
= MATH_SQRT
(absx * absx
+ 1.0f
) + absx
;
66 float r
= log
(y) + (hi ?
0x1.62e430p-1f
: 0.0f
);
67 float z2
= as_float
(xsgn | as_uint
(r));
69 float z
= ax
<= 0x40000000 ? z1
: z2
;
70 z
= ax
< 0x39800000U | ax
>= PINFBITPATT_SP32 ? x
: z
;
75 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, asinh
, float
)
78 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
80 #define NA0 -
0.12845379283524906084997e0
81 #define NA1 -
0.21060688498409799700819e0
82 #define NA2 -
0.10188951822578188309186e0
83 #define NA3 -
0.13891765817243625541799e-1
84 #define NA4 -
0.10324604871728082428024e-3
86 #define DA0
0.77072275701149440164511e0
87 #define DA1
0.16104665505597338100747e1
88 #define DA2
0.11296034614816689554875e1
89 #define DA3
0.30079351943799465092429e0
90 #define DA4
0.235224464765951442265117e-1
92 #define NB0 -
0.12186605129448852495563e0
93 #define NB1 -
0.19777978436593069928318e0
94 #define NB2 -
0.94379072395062374824320e-1
95 #define NB3 -
0.12620141363821680162036e-1
96 #define NB4 -
0.903396794842691998748349e-4
98 #define DB0
0.73119630776696495279434e0
99 #define DB1
0.15157170446881616648338e1
100 #define DB2
0.10524909506981282725413e1
101 #define DB3
0.27663713103600182193817e0
102 #define DB4
0.21263492900663656707646e-1
104 #define NC0 -
0.81210026327726247622500e-1
105 #define NC1 -
0.12327355080668808750232e0
106 #define NC2 -
0.53704925162784720405664e-1
107 #define NC3 -
0.63106739048128554465450e-2
108 #define NC4 -
0.35326896180771371053534e-4
110 #define DC0
0.48726015805581794231182e0
111 #define DC1
0.95890837357081041150936e0
112 #define DC2
0.62322223426940387752480e0
113 #define DC3
0.15028684818508081155141e0
114 #define DC4
0.10302171620320141529445e-1
116 #define ND0 -
0.4638179204422665073e-1
117 #define ND1 -
0.7162729496035415183e-1
118 #define ND2 -
0.3247795155696775148e-1
119 #define ND3 -
0.4225785421291932164e-2
120 #define ND4 -
0.3808984717603160127e-4
121 #define ND5
0.8023464184964125826e-6
123 #define DD0
0.2782907534642231184e0
124 #define DD1
0.5549945896829343308e0
125 #define DD2
0.3700732511330698879e0
126 #define DD3
0.9395783438240780722e-1
127 #define DD4
0.7200057974217143034e-2
129 #define NE0 -
0.121224194072430701e-4
130 #define NE1 -
0.273145455834305218e-3
131 #define NE2 -
0.152866982560895737e-2
132 #define NE3 -
0.292231744584913045e-2
133 #define NE4 -
0.174670900236060220e-2
134 #define NE5 -
0.891754209521081538e-12
136 #define DE0
0.499426632161317606e-4
137 #define DE1
0.139591210395547054e-2
138 #define DE2
0.107665231109108629e-1
139 #define DE3
0.325809818749873406e-1
140 #define DE4
0.415222526655158363e-1
141 #define DE5
0.186315628774716763e-1
143 #define NF0 -
0.195436610112717345e-4
144 #define NF1 -
0.233315515113382977e-3
145 #define NF2 -
0.645380957611087587e-3
146 #define NF3 -
0.478948863920281252e-3
147 #define NF4 -
0.805234112224091742e-12
148 #define NF5
0.246428598194879283e-13
150 #define DF0
0.822166621698664729e-4
151 #define DF1
0.135346265620413852e-2
152 #define DF2
0.602739242861830658e-2
153 #define DF3
0.972227795510722956e-2
154 #define DF4
0.510878800983771167e-2
156 #define NG0 -
0.209689451648100728e-6
157 #define NG1 -
0.219252358028695992e-5
158 #define NG2 -
0.551641756327550939e-5
159 #define NG3 -
0.382300259826830258e-5
160 #define NG4 -
0.421182121910667329e-17
161 #define NG5
0.492236019998237684e-19
163 #define DG0
0.889178444424237735e-6
164 #define DG1
0.131152171690011152e-4
165 #define DG2
0.537955850185616847e-4
166 #define DG3
0.814966175170941864e-4
167 #define DG4
0.407786943832260752e-4
169 #define NH0 -
0.178284193496441400e-6
170 #define NH1 -
0.928734186616614974e-6
171 #define NH2 -
0.923318925566302615e-6
172 #define NH3 -
0.776417026702577552e-19
173 #define NH4
0.290845644810826014e-21
175 #define DH0
0.786694697277890964e-6
176 #define DH1
0.685435665630965488e-5
177 #define DH2
0.153780175436788329e-4
178 #define DH3
0.984873520613417917e-5
180 #define NI0 -
0.538003743384069117e-10
181 #define NI1 -
0.273698654196756169e-9
182 #define NI2 -
0.268129826956403568e-9
183 #define NI3 -
0.804163374628432850e-29
185 #define DI0
0.238083376363471960e-9
186 #define DI1
0.203579344621125934e-8
187 #define DI2
0.450836980450693209e-8
188 #define DI3
0.286005148753497156e-8
190 _CLC_OVERLOAD _CLC_DEF double asinh
(double x
) {
191 const double rteps
= 0x1.6a09e667f3bcdp-27
;
192 const double recrteps
= 0x1.6a09e667f3bcdp
+26;
194 // log2_lead and log2_tail sum to an extra-precise version of log
(2)
195 const double log2_lead
= 0x1.62e42ep-1
;
196 const double log2_tail
= 0x1.efa39ef35793cp-25
;
198 ulong ux
= as_ulong
(x);
199 ulong ax
= ux
& ~SIGNBIT_DP64
;
200 double absx
= as_double
(ax);
203 double pn
, tn
, pd
, td
;
205 // XXX we are betting here that we can evaluate
8 pairs of
206 // polys faster than we can grab
12 coefficients from a table
207 // This also uses fewer registers
210 pn
= fma
(t, fma
(t, fma
(t, NI3
, NI2
), NI1
), NI0
);
211 pd
= fma
(t, fma
(t, fma
(t, DI3
, DI2
), DI1
), DI0
);
213 tn
= fma
(t, fma
(t, fma
(t, fma
(t, NH4
, NH3
), NH2
), NH1
), NH0
);
214 td
= fma
(t, fma
(t, fma
(t, DH3
, DH2
), DH1
), DH0
);
215 pn
= absx
< 8.0 ? tn
: pn
;
216 pd
= absx
< 8.0 ? td
: pd
;
218 tn
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, NG5
, NG4
), NG3
), NG2
), NG1
), NG0
);
219 td
= fma
(t, fma
(t, fma
(t, fma
(t, DG4
, DG3
), DG2
), DG1
), DG0
);
220 pn
= absx
< 4.0 ? tn
: pn
;
221 pd
= absx
< 4.0 ? td
: pd
;
223 tn
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, NF5
, NF4
), NF3
), NF2
), NF1
), NF0
);
224 td
= fma
(t, fma
(t, fma
(t, fma
(t, DF4
, DF3
), DF2
), DF1
), DF0
);
225 pn
= absx
< 2.0 ? tn
: pn
;
226 pd
= absx
< 2.0 ? td
: pd
;
228 tn
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, NE5
, NE4
), NE3
), NE2
), NE1
), NE0
);
229 td
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, DE5
, DE4
), DE3
), DE2
), DE1
), DE0
);
230 pn
= absx
< 1.5 ? tn
: pn
;
231 pd
= absx
< 1.5 ? td
: pd
;
233 tn
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, ND5
, ND4
), ND3
), ND2
), ND1
), ND0
);
234 td
= fma
(t, fma
(t, fma
(t, fma
(t, DD4
, DD3
), DD2
), DD1
), DD0
);
235 pn
= absx
<= 1.0 ? tn
: pn
;
236 pd
= absx
<= 1.0 ? td
: pd
;
238 tn
= fma
(t, fma
(t, fma
(t, fma
(t, NC4
, NC3
), NC2
), NC1
), NC0
);
239 td
= fma
(t, fma
(t, fma
(t, fma
(t, DC4
, DC3
), DC2
), DC1
), DC0
);
240 pn
= absx
< 0.75 ? tn
: pn
;
241 pd
= absx
< 0.75 ? td
: pd
;
243 tn
= fma
(t, fma
(t, fma
(t, fma
(t, NB4
, NB3
), NB2
), NB1
), NB0
);
244 td
= fma
(t, fma
(t, fma
(t, fma
(t, DB4
, DB3
), DB2
), DB1
), DB0
);
245 pn
= absx
< 0.5 ? tn
: pn
;
246 pd
= absx
< 0.5 ? td
: pd
;
248 tn
= fma
(t, fma
(t, fma
(t, fma
(t, NA4
, NA3
), NA2
), NA1
), NA0
);
249 td
= fma
(t, fma
(t, fma
(t, fma
(t, DA4
, DA3
), DA2
), DA1
), DA0
);
250 pn
= absx
< 0.25 ? tn
: pn
;
251 pd
= absx
< 0.25 ? td
: pd
;
253 double pq
= MATH_DIVIDE
(pn, pd
);
256 double result1
= fma
(absx*t
, pq
, absx
);
259 int xout
= absx
<= 32.0 | absx
> recrteps
;
260 double y
= absx
+ sqrt
(fma(absx, absx
, 1.0));
265 __clc_ep_log
(y, &xexp
, &r1
, &r2
);
267 double dxexp
= (double)(xexp + xout
);
268 r1
= fma
(dxexp, log2_lead
, r1
);
269 r2
= fma
(dxexp, log2_tail
, r2
);
272 double v2
= (pq + 0.25) / t
;
274 double s
= ((r1 - r
) + v2
) + r2
;
277 double result2
= v1
+ v2
;
280 double result3
= r1
+ r2
;
282 double ret
= absx
> 1.0 ? result2
: result1
;
283 ret
= absx
> 32.0 ? result3
: ret
;
284 ret
= x
< 0.0 ? -ret
: ret
;
286 // NaN
, +-Inf
, or x small enough that asinh
(x) = x
287 ret
= ax
>= PINFBITPATT_DP64 | absx
< rteps ? x
: ret
;
291 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, asinh
, double
)