2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 #include
"../clcmacro.h"
29 * ====================================================
30 * Copyright
(C) 1993 by Sun Microsystems
, Inc. All rights reserved.
32 * Developed at SunPro
, a Sun Microsystems
, Inc. business.
33 * Permission to use
, copy
, modify
, and distribute this
34 * software is freely granted
, provided that this notice
36 * ====================================================
39 #define erx
8.4506291151e-01f
/* 0x3f58560b */
41 // Coefficients for approximation to erf on
[00.84375]
43 #define efx
1.2837916613e-01f
/* 0x3e0375d4 */
44 #define efx8
1.0270333290e+00f
/* 0x3f8375d4 */
46 #define pp0
1.2837916613e-01f
/* 0x3e0375d4 */
47 #define pp1 -
3.2504209876e-01f
/* 0xbea66beb */
48 #define pp2 -
2.8481749818e-02f
/* 0xbce9528f */
49 #define pp3 -
5.7702702470e-03f
/* 0xbbbd1489 */
50 #define pp4 -
2.3763017452e-05f
/* 0xb7c756b1 */
51 #define qq1
3.9791721106e-01f
/* 0x3ecbbbce */
52 #define qq2
6.5022252500e-02f
/* 0x3d852a63 */
53 #define qq3
5.0813062117e-03f
/* 0x3ba68116 */
54 #define qq4
1.3249473704e-04f
/* 0x390aee49 */
55 #define qq5 -
3.9602282413e-06f
/* 0xb684e21a */
57 // Coefficients for approximation to erf in
[0.843751.25]
59 #define pa0 -
2.3621185683e-03f
/* 0xbb1acdc6 */
60 #define pa1
4.1485610604e-01f
/* 0x3ed46805 */
61 #define pa2 -
3.7220788002e-01f
/* 0xbebe9208 */
62 #define pa3
3.1834661961e-01f
/* 0x3ea2fe54 */
63 #define pa4 -
1.1089469492e-01f
/* 0xbde31cc2 */
64 #define pa5
3.5478305072e-02f
/* 0x3d1151b3 */
65 #define pa6 -
2.1663755178e-03f
/* 0xbb0df9c0 */
66 #define qa1
1.0642088205e-01f
/* 0x3dd9f331 */
67 #define qa2
5.4039794207e-01f
/* 0x3f0a5785 */
68 #define qa3
7.1828655899e-02f
/* 0x3d931ae7 */
69 #define qa4
1.2617121637e-01f
/* 0x3e013307 */
70 #define qa5
1.3637083583e-02f
/* 0x3c5f6e13 */
71 #define qa6
1.1984500103e-02f
/* 0x3c445aa3 */
73 // Coefficients for approximation to erfc in
[1.251/0.35]
75 #define ra0 -
9.8649440333e-03f
/* 0xbc21a093 */
76 #define ra1 -
6.9385856390e-01f
/* 0xbf31a0b7 */
77 #define ra2 -
1.0558626175e+01f
/* 0xc128f022 */
78 #define ra3 -
6.2375331879e+01f
/* 0xc2798057 */
79 #define ra4 -
1.6239666748e+02f
/* 0xc322658c */
80 #define ra5 -
1.8460508728e+02f
/* 0xc3389ae7 */
81 #define ra6 -
8.1287437439e+01f
/* 0xc2a2932b */
82 #define ra7 -
9.8143291473e+00f
/* 0xc11d077e */
83 #define sa1
1.9651271820e+01f
/* 0x419d35ce */
84 #define sa2
1.3765776062e+02f
/* 0x4309a863 */
85 #define sa3
4.3456588745e+02f
/* 0x43d9486f */
86 #define sa4
6.4538726807e+02f
/* 0x442158c9 */
87 #define sa5
4.2900814819e+02f
/* 0x43d6810b */
88 #define sa6
1.0863500214e+02f
/* 0x42d9451f */
89 #define sa7
6.5702495575e+00f
/* 0x40d23f7c */
90 #define sa8 -
6.0424413532e-02f
/* 0xbd777f97 */
92 // Coefficients for approximation to erfc in
[1/.3528]
94 #define rb0 -
9.8649431020e-03f
/* 0xbc21a092 */
95 #define rb1 -
7.9928326607e-01f
/* 0xbf4c9dd4 */
96 #define rb2 -
1.7757955551e+01f
/* 0xc18e104b */
97 #define rb3 -
1.6063638306e+02f
/* 0xc320a2ea */
98 #define rb4 -
6.3756646729e+02f
/* 0xc41f6441 */
99 #define rb5 -
1.0250950928e+03f
/* 0xc480230b */
100 #define rb6 -
4.8351919556e+02f
/* 0xc3f1c275 */
101 #define sb1
3.0338060379e+01f
/* 0x41f2b459 */
102 #define sb2
3.2579251099e+02f
/* 0x43a2e571 */
103 #define sb3
1.5367296143e+03f
/* 0x44c01759 */
104 #define sb4
3.1998581543e+03f
/* 0x4547fdbb */
105 #define sb5
2.5530502930e+03f
/* 0x451f90ce */
106 #define sb6
4.7452853394e+02f
/* 0x43ed43a7 */
107 #define sb7 -
2.2440952301e+01f
/* 0xc1b38712 */
109 _CLC_OVERLOAD _CLC_DEF float erf
(float x
) {
111 int ix
= hx
& 0x7fffffff;
112 float absx
= as_float
(ix);
114 float x2
= absx
* absx
;
116 float tt
= absx -
1.0f
;
117 t
= absx
< 1.25f ? tt
: t
;
118 t
= absx
< 0.84375f ? x2
: t
;
123 u
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, rb6
, rb5
), rb4
), rb3
), rb2
), rb1
), rb0
);
124 v
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, sb7
, sb6
), sb5
), sb4
), sb3
), sb2
), sb1
);
126 tu
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, ra7
, ra6
), ra5
), ra4
), ra3
), ra2
), ra1
), ra0
);
127 tv
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, sa8
, sa7
), sa6
), sa5
), sa4
), sa3
), sa2
), sa1
);
128 u
= absx
< 0x1.6db6dcp
+1f ? tu
: u
;
129 v
= absx
< 0x1.6db6dcp
+1f ? tv
: v
;
131 tu
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, pa6
, pa5
), pa4
), pa3
), pa2
), pa1
), pa0
);
132 tv
= mad
(t, mad
(t, mad
(t, mad
(t, mad
(t, qa6
, qa5
), qa4
), qa3
), qa2
), qa1
);
133 u
= absx
< 1.25f ? tu
: u
;
134 v
= absx
< 1.25f ? tv
: v
;
136 tu
= mad
(t, mad
(t, mad
(t, mad
(t, pp4
, pp3
), pp2
), pp1
), pp0
);
137 tv
= mad
(t, mad
(t, mad
(t, mad
(t, qq5
, qq4
), qq3
), qq2
), qq1
);
138 u
= absx
< 0.84375f ? tu
: u
;
139 v
= absx
< 0.84375f ? tv
: v
;
142 float q
= MATH_DIVIDE
(u, v
);
147 float z
= as_float
(ix & 0xfffff000);
148 float r
= exp
(mad(-z, z
, -
0.5625f
)) * exp
(mad(z-absx, z
+absx
, q
));
149 r
= 1.0f - MATH_DIVIDE
(r, absx
);
150 ret
= absx
< 6.0f ? r
: ret
;
153 ret
= absx
< 1.25f ? r
: ret
;
155 ret
= as_float
((hx & 0x80000000) | as_int
(ret));
158 ret
= absx
< 0.84375f ? r
: ret
;
161 r
= 0.125f
* mad
(8.0f
, x
, efx8
* x
);
162 ret
= absx
< 0x1.0p-28f ? r
: ret
;
164 ret
= isnan
(x) ? x
: ret
;
169 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, erf
, float
);
173 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
176 * ====================================================
177 * Copyright
(C) 1993 by Sun Microsystems
, Inc. All rights reserved.
179 * Developed at SunPro
, a Sun Microsystems
, Inc. business.
180 * Permission to use
, copy
, modify
, and distribute this
181 * software is freely granted
, provided that this notice
183 * ====================================================
186 /* double erf
(double x
)
187 * double erfc
(double x
)
190 * erf
(x) = --------- | exp
(-t*t
)dt
197 * erfc
(-x) = 2 - erfc
(x)
200 * 1. For |x| in
[0, 0.84375]
201 * erf
(x) = x
+ x
*R
(x^
2)
202 * erfc
(x) = 1 - erf
(x) if x in
[-
.84375,0.25]
203 * = 0.5 + ((0.5-x
)-x
*R
) if x in
[0.25,0.84375]
204 * where R
= P
/Q where P is an odd poly of degree
8 and
205 * Q is an odd poly of degree
10.
207 * | R -
(erf(x)-x
)/x |
<= 2
210 * Remark. The formula is derived by noting
211 * erf
(x) = (2/sqrt
(pi))*(x - x^
3/3 + x^
5/10 - x^
7/42 + ....
)
213 * 2/sqrt
(pi) = 1.128379167095512573896158903121545171688
214 * is close to one. The interval is chosen because the fix
215 * point of erf
(x) is near
0.6174 (i.e.
, erf
(x)=x when x is
216 * near
0.6174), and by some experiment
, 0.84375 is chosen to
217 * guarantee the error is less than one ulp for erf.
219 * 2. For |x| in
[0.84375,1.25], let s
= |x| -
1, and
220 * c
= 0.84506291151 rounded to single
(24 bits
)
221 * erf
(x) = sign
(x) * (c + P1
(s)/Q1
(s))
222 * erfc
(x) = (1-c) - P1
(s)/Q1
(s) if x
> 0
223 * 1+(c+P1
(s)/Q1
(s)) if x
< 0
224 * |P1
/Q1 -
(erf(|x|
)-c
)|
<= 2**-
59.06
225 * Remark
: here we use the taylor series expansion at x
=1.
226 * erf
(1+s
) = erf
(1) + s
*Poly
(s)
227 * = 0.845..
+ P1
(s)/Q1
(s)
228 * That is
, we use rational approximation to approximate
229 * erf
(1+s
) -
(c = (single)0.84506291151)
230 * Note that |P1
/Q1|
< 0.078 for x in
[0.84375,1.25]
232 * P1
(s) = degree
6 poly in s
233 * Q1
(s) = degree
6 poly in s
235 * 3. For x in
[1.25,1/0.35(~
2.857143)],
236 * erfc
(x) = (1/x
)*exp
(-x*x-0.5625
+R1
/S1
)
237 * erf
(x) = 1 - erfc
(x)
239 * R1
(z) = degree
7 poly in z
, (z=1/x^
2)
240 * S1
(z) = degree
8 poly in z
242 * 4. For x in
[1/0.35,28]
243 * erfc
(x) = (1/x
)*exp
(-x*x-0.5625
+R2
/S2
) if x
> 0
244 * = 2.0 -
(1/x
)*exp
(-x*x-0.5625
+R2
/S2
) if -
6<x
<0
245 * = 2.0 - tiny
(if x
<= -
6)
246 * erf
(x) = sign
(x)*(1.0 - erfc
(x)) if x
< 6, else
247 * erf
(x) = sign
(x)*(1.0 - tiny
)
249 * R2
(z) = degree
6 poly in z
, (z=1/x^
2)
250 * S2
(z) = degree
7 poly in z
253 * To compute exp
(-x*x-0.5625
+R
/S
), let s be a single
254 * precision number and s
:= x
; then
255 * -x
*x
= -s
*s
+ (s-x)*(s+x
)
256 * exp
(-x*x-0.5626
+R
/S
) =
257 * exp
(-s*s-0.5625
)*exp
((s-x)*(s+x
)+R
/S
);
259 * Here
4 and
5 make use of the asymptotic series
261 * erfc
(x) ~ ----------
* ( 1 + Poly
(1/x^
2) )
263 * We use rational approximation to approximate
264 * g
(s)=f
(1/x^
2) = log
(erfc(x)*x
) - x
*x
+ 0.5625
265 * Here is the error bound for R1
/S1 and R2
/S2
266 * |R1
/S1 - f
(x)|
< 2**(-62.57
)
267 * |R2
/S2 - f
(x)|
< 2**(-61.52
)
269 * 5. For inf
> x
>= 28
270 * erf
(x) = sign
(x) *(1 - tiny
) (raise inexact
)
271 * erfc
(x) = tiny
*tiny
(raise underflow
) if x
> 0
275 * erf
(0) = 0, erf
(inf) = 1, erf
(-inf) = -
1,
276 * erfc
(0) = 1, erfc
(inf) = 0, erfc
(-inf) = 2,
277 * erfc
/erf
(NaN) is NaN
280 #define AU0 -
9.86494292470009928597e-03
281 #define AU1 -
7.99283237680523006574e-01
282 #define AU2 -
1.77579549177547519889e+01
283 #define AU3 -
1.60636384855821916062e+02
284 #define AU4 -
6.37566443368389627722e+02
285 #define AU5 -
1.02509513161107724954e+03
286 #define AU6 -
4.83519191608651397019e+02
288 #define AV1
3.03380607434824582924e+01
289 #define AV2
3.25792512996573918826e+02
290 #define AV3
1.53672958608443695994e+03
291 #define AV4
3.19985821950859553908e+03
292 #define AV5
2.55305040643316442583e+03
293 #define AV6
4.74528541206955367215e+02
294 #define AV7 -
2.24409524465858183362e+01
296 #define BU0 -
9.86494403484714822705e-03
297 #define BU1 -
6.93858572707181764372e-01
298 #define BU2 -
1.05586262253232909814e+01
299 #define BU3 -
6.23753324503260060396e+01
300 #define BU4 -
1.62396669462573470355e+02
301 #define BU5 -
1.84605092906711035994e+02
302 #define BU6 -
8.12874355063065934246e+01
303 #define BU7 -
9.81432934416914548592e+00
305 #define BV1
1.96512716674392571292e+01
306 #define BV2
1.37657754143519042600e+02
307 #define BV3
4.34565877475229228821e+02
308 #define BV4
6.45387271733267880336e+02
309 #define BV5
4.29008140027567833386e+02
310 #define BV6
1.08635005541779435134e+02
311 #define BV7
6.57024977031928170135e+00
312 #define BV8 -
6.04244152148580987438e-02
314 #define CU0 -
2.36211856075265944077e-03
315 #define CU1
4.14856118683748331666e-01
316 #define CU2 -
3.72207876035701323847e-01
317 #define CU3
3.18346619901161753674e-01
318 #define CU4 -
1.10894694282396677476e-01
319 #define CU5
3.54783043256182359371e-02
320 #define CU6 -
2.16637559486879084300e-03
322 #define CV1
1.06420880400844228286e-01
323 #define CV2
5.40397917702171048937e-01
324 #define CV3
7.18286544141962662868e-02
325 #define CV4
1.26171219808761642112e-01
326 #define CV5
1.36370839120290507362e-02
327 #define CV6
1.19844998467991074170e-02
329 #define DU0
1.28379167095512558561e-01
330 #define DU1 -
3.25042107247001499370e-01
331 #define DU2 -
2.84817495755985104766e-02
332 #define DU3 -
5.77027029648944159157e-03
333 #define DU4 -
2.37630166566501626084e-05
335 #define DV1
3.97917223959155352819e-01
336 #define DV2
6.50222499887672944485e-02
337 #define DV3
5.08130628187576562776e-03
338 #define DV4
1.32494738004321644526e-04
339 #define DV5 -
3.96022827877536812320e-06
341 _CLC_OVERLOAD _CLC_DEF double erf
(double y
) {
344 double xm1
= x -
1.0;
348 t
= x
< 1.25 ? xm1
: t
;
349 t
= x
< 0.84375 ? x2
: t
;
353 // Evaluate rational poly
354 // XXX We need to see of we can grab
16 coefficents from a table
355 // faster than evaluating
3 of the poly pairs
357 u
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, AU6
, AU5
), AU4
), AU3
), AU2
), AU1
), AU0
);
358 v
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, AV7
, AV6
), AV5
), AV4
), AV3
), AV2
), AV1
);
360 ut
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, BU7
, BU6
), BU5
), BU4
), BU3
), BU2
), BU1
), BU0
);
361 vt
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, BV8
, BV7
), BV6
), BV5
), BV4
), BV3
), BV2
), BV1
);
362 u
= x
< 0x1.6db6ep
+1 ? ut
: u
;
363 v
= x
< 0x1.6db6ep
+1 ? vt
: v
;
365 ut
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, CU6
, CU5
), CU4
), CU3
), CU2
), CU1
), CU0
);
366 vt
= fma
(t, fma
(t, fma
(t, fma
(t, fma
(t, CV6
, CV5
), CV4
), CV3
), CV2
), CV1
);
367 u
= x
< 1.25 ? ut
: u
;
368 v
= x
< 1.25 ? vt
: v
;
370 ut
= fma
(t, fma
(t, fma
(t, fma
(t, DU4
, DU3
), DU2
), DU1
), DU0
);
371 vt
= fma
(t, fma
(t, fma
(t, fma
(t, DV5
, DV4
), DV3
), DV2
), DV1
);
372 u
= x
< 0.84375 ? ut
: u
;
373 v
= x
< 0.84375 ? vt
: v
;
377 // Compute rational approximation
381 double z
= as_double
(as_long(x) & 0xffffffff00000000L
);
382 double r
= exp
(-z * z -
0.5625) * exp
((z - x
) * (z + x
) + q
);
385 double ret
= x
< 6.0 ? r
: 1.0;
387 r
= 8.45062911510467529297e-01 + q
;
388 ret
= x
< 1.25 ? r
: ret
;
390 q
= x
< 0x1.0p-28 ?
1.28379167095512586316e-01 : q
;
393 ret
= x
< 0.84375 ? r
: ret
;
395 ret
= isnan
(x) ? x
: ret
;
397 return y
< 0.0 ? -ret
: ret
;
400 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, erf
, double
);
404 #pragma OPENCL EXTENSION cl_khr_fp16
: enable
406 _CLC_OVERLOAD _CLC_DEF half erf
(half h
) {
407 return
(half)erf
((float)h
);
410 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, half
, erf
, half
);