2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
29 * ====================================================
30 * Copyright
(C) 1993 by Sun Microsystems
, Inc. All rights reserved.
32 * Developed at SunPro
, a Sun Microsystems
, Inc. business.
33 * Permission to use
, copy
, modify
, and distribute this
34 * software is freely granted
, provided that this notice
36 * ====================================================
39 #define erx_f
8.4506291151e-01f
/* 0x3f58560b */
41 // Coefficients for approximation to erf on
[00.84375]
43 #define efx
1.2837916613e-01f
/* 0x3e0375d4 */
44 #define efx8
1.0270333290e+00f
/* 0x3f8375d4 */
46 #define pp0
1.2837916613e-01f
/* 0x3e0375d4 */
47 #define pp1 -
3.2504209876e-01f
/* 0xbea66beb */
48 #define pp2 -
2.8481749818e-02f
/* 0xbce9528f */
49 #define pp3 -
5.7702702470e-03f
/* 0xbbbd1489 */
50 #define pp4 -
2.3763017452e-05f
/* 0xb7c756b1 */
51 #define qq1
3.9791721106e-01f
/* 0x3ecbbbce */
52 #define qq2
6.5022252500e-02f
/* 0x3d852a63 */
53 #define qq3
5.0813062117e-03f
/* 0x3ba68116 */
54 #define qq4
1.3249473704e-04f
/* 0x390aee49 */
55 #define qq5 -
3.9602282413e-06f
/* 0xb684e21a */
57 // Coefficients for approximation to erf in
[0.843751.25]
59 #define pa0 -
2.3621185683e-03f
/* 0xbb1acdc6 */
60 #define pa1
4.1485610604e-01f
/* 0x3ed46805 */
61 #define pa2 -
3.7220788002e-01f
/* 0xbebe9208 */
62 #define pa3
3.1834661961e-01f
/* 0x3ea2fe54 */
63 #define pa4 -
1.1089469492e-01f
/* 0xbde31cc2 */
64 #define pa5
3.5478305072e-02f
/* 0x3d1151b3 */
65 #define pa6 -
2.1663755178e-03f
/* 0xbb0df9c0 */
66 #define qa1
1.0642088205e-01f
/* 0x3dd9f331 */
67 #define qa2
5.4039794207e-01f
/* 0x3f0a5785 */
68 #define qa3
7.1828655899e-02f
/* 0x3d931ae7 */
69 #define qa4
1.2617121637e-01f
/* 0x3e013307 */
70 #define qa5
1.3637083583e-02f
/* 0x3c5f6e13 */
71 #define qa6
1.1984500103e-02f
/* 0x3c445aa3 */
73 // Coefficients for approximation to erfc in
[1.251/0.35]
75 #define ra0 -
9.8649440333e-03f
/* 0xbc21a093 */
76 #define ra1 -
6.9385856390e-01f
/* 0xbf31a0b7 */
77 #define ra2 -
1.0558626175e+01f
/* 0xc128f022 */
78 #define ra3 -
6.2375331879e+01f
/* 0xc2798057 */
79 #define ra4 -
1.6239666748e+02f
/* 0xc322658c */
80 #define ra5 -
1.8460508728e+02f
/* 0xc3389ae7 */
81 #define ra6 -
8.1287437439e+01f
/* 0xc2a2932b */
82 #define ra7 -
9.8143291473e+00f
/* 0xc11d077e */
83 #define sa1
1.9651271820e+01f
/* 0x419d35ce */
84 #define sa2
1.3765776062e+02f
/* 0x4309a863 */
85 #define sa3
4.3456588745e+02f
/* 0x43d9486f */
86 #define sa4
6.4538726807e+02f
/* 0x442158c9 */
87 #define sa5
4.2900814819e+02f
/* 0x43d6810b */
88 #define sa6
1.0863500214e+02f
/* 0x42d9451f */
89 #define sa7
6.5702495575e+00f
/* 0x40d23f7c */
90 #define sa8 -
6.0424413532e-02f
/* 0xbd777f97 */
92 // Coefficients for approximation to erfc in
[1/.3528]
94 #define rb0 -
9.8649431020e-03f
/* 0xbc21a092 */
95 #define rb1 -
7.9928326607e-01f
/* 0xbf4c9dd4 */
96 #define rb2 -
1.7757955551e+01f
/* 0xc18e104b */
97 #define rb3 -
1.6063638306e+02f
/* 0xc320a2ea */
98 #define rb4 -
6.3756646729e+02f
/* 0xc41f6441 */
99 #define rb5 -
1.0250950928e+03f
/* 0xc480230b */
100 #define rb6 -
4.8351919556e+02f
/* 0xc3f1c275 */
101 #define sb1
3.0338060379e+01f
/* 0x41f2b459 */
102 #define sb2
3.2579251099e+02f
/* 0x43a2e571 */
103 #define sb3
1.5367296143e+03f
/* 0x44c01759 */
104 #define sb4
3.1998581543e+03f
/* 0x4547fdbb */
105 #define sb5
2.5530502930e+03f
/* 0x451f90ce */
106 #define sb6
4.7452853394e+02f
/* 0x43ed43a7 */
107 #define sb7 -
2.2440952301e+01f
/* 0xc1b38712 */
109 _CLC_OVERLOAD _CLC_DEF float erfc
(float x
) {
111 int ix
= hx
& 0x7fffffff;
112 float absx
= as_float
(ix);
114 // Argument for polys
115 float x2
= absx
* absx
;
117 float tt
= absx -
1.0f
;
118 t
= absx
< 1.25f ? tt
: t
;
119 t
= absx
< 0.84375f ? x2
: t
;
124 u
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, rb6
, rb5
), rb4
), rb3
), rb2
), rb1
), rb0
);
125 v
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, sb7
, sb6
), sb5
), sb4
), sb3
), sb2
), sb1
);
127 tu
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, ra7
, ra6
), ra5
), ra4
), ra3
), ra2
), ra1
), ra0
);
128 tv
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, sa8
, sa7
), sa6
), sa5
), sa4
), sa3
), sa2
), sa1
);
129 u
= absx
< 0x1.6db6dap
+1f ? tu
: u
;
130 v
= absx
< 0x1.6db6dap
+1f ? tv
: v
;
132 tu
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, pa6
, pa5
), pa4
), pa3
), pa2
), pa1
), pa0
);
133 tv
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, qa6
, qa5
), qa4
), qa3
), qa2
), qa1
);
134 u
= absx
< 1.25f ? tu
: u
;
135 v
= absx
< 1.25f ? tv
: v
;
137 tu
= mad
(t, mad
(t, mad
(t, mad
(t, pp4
, pp3
), pp2
), pp1
), pp0
);
138 tv
= mad
(t, mad
(t, mad
(t, mad
(t, qq5
, qq4
), qq3
), qq2
), qq1
);
139 u
= absx
< 0.84375f ? tu
: u
;
140 v
= absx
< 0.84375f ? tv
: v
;
144 float q
= MATH_DIVIDE
(u, v
);
148 float z
= as_float
(ix & 0xfffff000);
149 float r
= exp
(mad(-z, z
, -
0.5625f
)) * exp
(mad(z - absx
, z
+ absx
, q
));
150 r
= MATH_DIVIDE
(r, absx
);
152 r
= x
< 0.0f ? t
: r
;
153 ret
= absx
< 28.0f ? r
: ret
;
155 r
= 1.0f - erx_f - q
;
156 t
= erx_f
+ q
+ 1.0f
;
157 r
= x
< 0.0f ? t
: r
;
158 ret
= absx
< 1.25f ? r
: ret
;
160 r
= 0.5f - mad
(x, q
, x -
0.5f
);
161 ret
= absx
< 0.84375f ? r
: ret
;
163 ret
= x
< -
6.0f ?
2.0f
: ret
;
165 ret
= isnan
(x) ? x
: ret
;
170 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, erfc
, float
);
174 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
177 * ====================================================
178 * Copyright
(C) 1993 by Sun Microsystems
, Inc. All rights reserved.
180 * Developed at SunPro
, a Sun Microsystems
, Inc. business.
181 * Permission to use
, copy
, modify
, and distribute this
182 * software is freely granted
, provided that this notice
184 * ====================================================
187 /* double erf
(double x
)
188 * double erfc
(double x
)
191 * erf
(x) = --------- | exp
(-t*t
)dt
198 * erfc
(-x) = 2 - erfc
(x)
201 * 1. For |x| in
[0, 0.84375]
202 * erf
(x) = x
+ x
*R
(x^
2)
203 * erfc
(x) = 1 - erf
(x) if x in
[-
.84375,0.25]
204 * = 0.5 + ((0.5-x
)-x
*R
) if x in
[0.25,0.84375]
205 * where R
= P
/Q where P is an odd poly of degree
8 and
206 * Q is an odd poly of degree
10.
208 * | R -
(erf(x)-x
)/x |
<= 2
211 * Remark. The formula is derived by noting
212 * erf
(x) = (2/sqrt
(pi))*(x - x^
3/3 + x^
5/10 - x^
7/42 + ....
)
214 * 2/sqrt
(pi) = 1.128379167095512573896158903121545171688
215 * is close to one. The interval is chosen because the fix
216 * point of erf
(x) is near
0.6174 (i.e.
, erf
(x)=x when x is
217 * near
0.6174), and by some experiment
, 0.84375 is chosen to
218 * guarantee the error is less than one ulp for erf.
220 * 2. For |x| in
[0.84375,1.25], let s
= |x| -
1, and
221 * c
= 0.84506291151 rounded to single
(24 bits
)
222 * erf
(x) = sign
(x) * (c + P1
(s)/Q1
(s))
223 * erfc
(x) = (1-c) - P1
(s)/Q1
(s) if x
> 0
224 * 1+(c+P1
(s)/Q1
(s)) if x
< 0
225 * |P1
/Q1 -
(erf(|x|
)-c
)|
<= 2**-
59.06
226 * Remark
: here we use the taylor series expansion at x
=1.
227 * erf
(1+s
) = erf
(1) + s
*Poly
(s)
228 * = 0.845..
+ P1
(s)/Q1
(s)
229 * That is
, we use rational approximation to approximate
230 * erf
(1+s
) -
(c = (single)0.84506291151)
231 * Note that |P1
/Q1|
< 0.078 for x in
[0.84375,1.25]
233 * P1
(s) = degree
6 poly in s
234 * Q1
(s) = degree
6 poly in s
236 * 3. For x in
[1.25,1/0.35(~
2.857143)],
237 * erfc
(x) = (1/x
)*exp
(-x*x-0.5625
+R1
/S1
)
238 * erf
(x) = 1 - erfc
(x)
240 * R1
(z) = degree
7 poly in z
, (z=1/x^
2)
241 * S1
(z) = degree
8 poly in z
243 * 4. For x in
[1/0.35,28]
244 * erfc
(x) = (1/x
)*exp
(-x*x-0.5625
+R2
/S2
) if x
> 0
245 * = 2.0 -
(1/x
)*exp
(-x*x-0.5625
+R2
/S2
) if -
6<x
<0
246 * = 2.0 - tiny
(if x
<= -
6)
247 * erf
(x) = sign
(x)*(1.0 - erfc
(x)) if x
< 6, else
248 * erf
(x) = sign
(x)*(1.0 - tiny
)
250 * R2
(z) = degree
6 poly in z
, (z=1/x^
2)
251 * S2
(z) = degree
7 poly in z
254 * To compute exp
(-x*x-0.5625
+R
/S
), let s be a single
255 * precision number and s
:= x
; then
256 * -x
*x
= -s
*s
+ (s-x)*(s+x
)
257 * exp
(-x*x-0.5626
+R
/S
) =
258 * exp
(-s*s-0.5625
)*exp
((s-x)*(s+x
)+R
/S
);
260 * Here
4 and
5 make use of the asymptotic series
262 * erfc
(x) ~ ----------
* ( 1 + Poly
(1/x^
2) )
264 * We use rational approximation to approximate
265 * g
(s)=f
(1/x^
2) = log
(erfc(x)*x
) - x
*x
+ 0.5625
266 * Here is the error bound for R1
/S1 and R2
/S2
267 * |R1
/S1 - f
(x)|
< 2**(-62.57
)
268 * |R2
/S2 - f
(x)|
< 2**(-61.52
)
270 * 5. For inf
> x
>= 28
271 * erf
(x) = sign
(x) *(1 - tiny
) (raise inexact
)
272 * erfc
(x) = tiny
*tiny
(raise underflow
) if x
> 0
276 * erf
(0) = 0, erf
(inf) = 1, erf
(-inf) = -
1,
277 * erfc
(0) = 1, erfc
(inf) = 0, erfc
(-inf) = 2,
278 * erfc
/erf
(NaN) is NaN
281 #define AU0 -
9.86494292470009928597e-03
282 #define AU1 -
7.99283237680523006574e-01
283 #define AU2 -
1.77579549177547519889e+01
284 #define AU3 -
1.60636384855821916062e+02
285 #define AU4 -
6.37566443368389627722e+02
286 #define AU5 -
1.02509513161107724954e+03
287 #define AU6 -
4.83519191608651397019e+02
289 #define AV0
3.03380607434824582924e+01
290 #define AV1
3.25792512996573918826e+02
291 #define AV2
1.53672958608443695994e+03
292 #define AV3
3.19985821950859553908e+03
293 #define AV4
2.55305040643316442583e+03
294 #define AV5
4.74528541206955367215e+02
295 #define AV6 -
2.24409524465858183362e+01
297 #define BU0 -
9.86494403484714822705e-03
298 #define BU1 -
6.93858572707181764372e-01
299 #define BU2 -
1.05586262253232909814e+01
300 #define BU3 -
6.23753324503260060396e+01
301 #define BU4 -
1.62396669462573470355e+02
302 #define BU5 -
1.84605092906711035994e+02
303 #define BU6 -
8.12874355063065934246e+01
304 #define BU7 -
9.81432934416914548592e+00
306 #define BV0
1.96512716674392571292e+01
307 #define BV1
1.37657754143519042600e+02
308 #define BV2
4.34565877475229228821e+02
309 #define BV3
6.45387271733267880336e+02
310 #define BV4
4.29008140027567833386e+02
311 #define BV5
1.08635005541779435134e+02
312 #define BV6
6.57024977031928170135e+00
313 #define BV7 -
6.04244152148580987438e-02
315 #define CU0 -
2.36211856075265944077e-03
316 #define CU1
4.14856118683748331666e-01
317 #define CU2 -
3.72207876035701323847e-01
318 #define CU3
3.18346619901161753674e-01
319 #define CU4 -
1.10894694282396677476e-01
320 #define CU5
3.54783043256182359371e-02
321 #define CU6 -
2.16637559486879084300e-03
323 #define CV0
1.06420880400844228286e-01
324 #define CV1
5.40397917702171048937e-01
325 #define CV2
7.18286544141962662868e-02
326 #define CV3
1.26171219808761642112e-01
327 #define CV4
1.36370839120290507362e-02
328 #define CV5
1.19844998467991074170e-02
330 #define DU0
1.28379167095512558561e-01
331 #define DU1 -
3.25042107247001499370e-01
332 #define DU2 -
2.84817495755985104766e-02
333 #define DU3 -
5.77027029648944159157e-03
334 #define DU4 -
2.37630166566501626084e-05
336 #define DV0
3.97917223959155352819e-01
337 #define DV1
6.50222499887672944485e-02
338 #define DV2
5.08130628187576562776e-03
339 #define DV3
1.32494738004321644526e-04
340 #define DV4 -
3.96022827877536812320e-06
342 _CLC_OVERLOAD _CLC_DEF double erfc
(double x
) {
343 long lx
= as_long
(x);
344 long ax
= lx
& 0x7fffffffffffffffL
;
345 double absx
= as_double
(ax);
350 double xm1
= absx -
1.0;
352 t
= absx
< 1.25 ? xm1
: t
;
353 t
= absx
< 0.84375 ? x2
: t
;
356 // Evaluate rational poly
357 // XXX Need to evaluate if we can grab the
14 coefficients from a
358 // table faster than evaluating
3 pairs of polys
362 u
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, AU6
, AU5
), AU4
), AU3
), AU2
), AU1
), AU0
);
363 v
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, AV6
, AV5
), AV4
), AV3
), AV2
), AV1
), AV0
);
365 tu
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, BU7
, BU6
), BU5
), BU4
), BU3
), BU2
), BU1
), BU0
);
366 tv
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, BV7
, BV6
), BV5
), BV4
), BV3
), BV2
), BV1
), BV0
);
367 u
= absx
< 0x1.6db6dp
+1 ? tu
: u
;
368 v
= absx
< 0x1.6db6dp
+1 ? tv
: v
;
370 tu
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, CU6
, CU5
), CU4
), CU3
), CU2
), CU1
), CU0
);
371 tv
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, CV5
, CV4
), CV3
), CV2
), CV1
), CV0
);
372 u
= absx
< 1.25 ? tu
: u
;
373 v
= absx
< 1.25 ? tv
: v
;
375 tu
= fma
(t, fma
(t, fma
(t, fma
(t, DU4
, DU3
), DU2
), DU1
), DU0
);
376 tv
= fma
(t, fma
(t, fma
(t, fma
(t, DV4
, DV3
), DV2
), DV1
), DV0
);
377 u
= absx
< 0.84375 ? tu
: u
;
378 v
= absx
< 0.84375 ? tv
: v
;
384 // Evaluate return value
387 double z
= as_double
(ax & 0xffffffff00000000UL
);
388 double ret
= exp
(-z * z -
0.5625) * exp
((z - absx
) * (z + absx
) + q
) / absx
;
390 ret
= xneg ? t
: ret
;
392 const double erx
= 8.45062911510467529297e-01;
396 ret
= absx
< 1.25 ? t
: ret
;
398 // z
= 1.0 - fma
(x, q
, x
);
399 // t
= 0.5 - fma
(x, q
, x -
0.5);
400 // t
= xneg
== 1 | absx
< 0.25 ? z
: t
;
401 t
= fma
(-x, q
, 1.0 - x
);
402 ret
= absx
< 0.84375 ? t
: ret
;
404 ret
= x
>= 28.0 ?
0.0 : ret
;
405 ret
= x
<= -
6.0 ?
2.0 : ret
;
406 ret
= ax
> 0x7ff0000000000000UL ? x
: ret
;
411 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, erfc
, double
);
415 #pragma OPENCL EXTENSION cl_khr_fp16
: enable
417 _CLC_OVERLOAD _CLC_DEF half erfc
(half h
) {
418 return
(half)erfc
((float)h
);
421 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, half
, erfc
, half
);