2 * Copyright 2023 Siemens
4 * The authors hereby grant permission to use, copy, modify, distribute,
5 * and license this software and its documentation for any purpose, provided
6 * that existing copyright notices are retained in all copies and that this
7 * notice is included verbatim in any distributions. No written agreement,
8 * license, or royalty fee is required for any of the authorized uses.
9 * Modifications to this software may be copyrighted by their authors
10 * and need not follow the licensing terms described here, provided that
11 * the new terms are clearly indicated on the first page of each file where
16 * ====================================================
17 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
19 * Developed at SunPro, a Sun Microsystems, Inc. business.
20 * Permission to use, copy, modify, and distribute this
21 * software is freely granted, provided that this notice
23 * ====================================================
26 /* Based on newlib/libm/mathfp/sf_erf.c in Newlib. */
28 #include "amdgcnmach.h"
30 v64sf
v64sf_expf_aux (v64sf
, v64si
);
/* Coefficient table used by erff.  The hex value beside each entry is
   the single-precision bit pattern of the constant.
   NOTE(review): the declaration specifier for this list is not visible
   in this span, and erff also references `tiny`, which is not among the
   entries below -- confirm both precede this point.  */
half = 5.0000000000e-01, /* 0x3F000000 */
one  = 1.0000000000e+00, /* 0x3F800000 */
two  = 2.0000000000e+00, /* 0x40000000 */
/* c = (subfloat)0.84506291151 */
erx  = 8.4506291151e-01, /* 0x3f58560b */

/* Coefficients for approximation to erf on [0,0.84375].  */
efx  = 1.2837916613e-01, /* 0x3e0375d4 */
efx8 = 1.0270333290e+00, /* 0x3f8375d4 */
pp0  = 1.2837916613e-01, /* 0x3e0375d4 */
pp1  = -3.2504209876e-01, /* 0xbea66beb */
pp2  = -2.8481749818e-02, /* 0xbce9528f */
pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
qq1  = 3.9791721106e-01, /* 0x3ecbbbce */
qq2  = 6.5022252500e-02, /* 0x3d852a63 */
qq3  = 5.0813062117e-03, /* 0x3ba68116 */
qq4  = 1.3249473704e-04, /* 0x390aee49 */
qq5  = -3.9602282413e-06, /* 0xb684e21a */

/* Coefficients for approximation to erf in [0.84375,1.25].  */
pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
pa1  = 4.1485610604e-01, /* 0x3ed46805 */
pa2  = -3.7220788002e-01, /* 0xbebe9208 */
pa3  = 3.1834661961e-01, /* 0x3ea2fe54 */
pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
pa5  = 3.5478305072e-02, /* 0x3d1151b3 */
pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
qa1  = 1.0642088205e-01, /* 0x3dd9f331 */
qa2  = 5.4039794207e-01, /* 0x3f0a5785 */
qa3  = 7.1828655899e-02, /* 0x3d931ae7 */
qa4  = 1.2617121637e-01, /* 0x3e013307 */
qa5  = 1.3637083583e-02, /* 0x3c5f6e13 */
qa6  = 1.1984500103e-02, /* 0x3c445aa3 */

/* Coefficients for approximation to erfc in [1.25,1/0.35].  */
ra0  = -9.8649440333e-03, /* 0xbc21a093 */
ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
ra2  = -1.0558626175e+01, /* 0xc128f022 */
ra3  = -6.2375331879e+01, /* 0xc2798057 */
ra4  = -1.6239666748e+02, /* 0xc322658c */
ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
ra6  = -8.1287437439e+01, /* 0xc2a2932b */
ra7  = -9.8143291473e+00, /* 0xc11d077e */
sa1  = 1.9651271820e+01, /* 0x419d35ce */
sa2  = 1.3765776062e+02, /* 0x4309a863 */
sa3  = 4.3456588745e+02, /* 0x43d9486f */
sa4  = 6.4538726807e+02, /* 0x442158c9 */
sa5  = 4.2900814819e+02, /* 0x43d6810b */
sa6  = 1.0863500214e+02, /* 0x42d9451f */
sa7  = 6.5702495575e+00, /* 0x40d23f7c */
sa8  = -6.0424413532e-02, /* 0xbd777f97 */

/* Coefficients for approximation to erfc in [1/.35,28].  */
rb0  = -9.8649431020e-03, /* 0xbc21a092 */
rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
rb2  = -1.7757955551e+01, /* 0xc18e104b */
rb3  = -1.6063638306e+02, /* 0xc320a2ea */
rb4  = -6.3756646729e+02, /* 0xc41f6441 */
rb5  = -1.0250950928e+03, /* 0xc480230b */
rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
sb1  = 3.0338060379e+01, /* 0x41f2b459 */
sb2  = 3.2579251099e+02, /* 0x43a2e571 */
sb3  = 1.5367296143e+03, /* 0x44c01759 */
sb4  = 3.1998581543e+03, /* 0x4547fdbb */
sb5  = 2.5530502930e+03, /* 0x451f90ce */
sb6  = 4.7452853394e+02, /* 0x43ed43a7 */
sb7  = -2.2440952301e+01; /* 0xc1b38712 */
107 #if defined (__has_builtin) && __has_builtin (__builtin_gcn_fabsvf)
109 DEF_VS_MATH_FUNC (v64sf
, erff
, v64sf x
)
111 FUNCTION_INIT (v64sf
);
114 GET_FLOAT_WORD (hx
, x
, NO_COND
);
115 v64si ix
= hx
& 0x7fffffff;
117 VECTOR_IF (ix
>= 0x7f800000, cond
) /* erf(nan)=nan */
118 v64si i
= (hx
>> 31) << 1;
120 VECTOR_RETURN (__builtin_convertvector (1 - i
, v64sf
) + 1.0f
/ x
, cond
);
123 VECTOR_IF (ix
< 0x3f580000, cond
) /* |x|<0.84375 */
124 VECTOR_IF2 (ix
< 0x31800000, cond2
, cond
) /* |x|<2**-28 */
125 VECTOR_IF2 (ix
< 0x04000000, cond3
, cond2
) /* avoid underflow */
126 VECTOR_RETURN (0.125f
*(8.0f
*x
+ efx8
*x
), cond3
);
128 VECTOR_RETURN (x
+ efx
*x
, cond2
);
132 v64sf r
= pp0
+z
*(pp1
+z
*(pp2
+z
*(pp3
+z
*pp4
)));
133 v64sf s
= one
+z
*(qq1
+z
*(qq2
+z
*(qq3
+z
*(qq4
+z
*qq5
))));
136 VECTOR_RETURN (x
+ x
*y
, cond
);
139 VECTOR_IF (ix
< 0x3fa00000, cond
) /* 0.84375 <= |x| < 1.25 */
140 v64sf s
= __builtin_gcn_fabsvf (x
) - 1.0f
;
141 v64sf P
= pa0
+s
*(pa1
+s
*(pa2
+s
*(pa3
+s
*(pa4
+s
*(pa5
+s
*pa6
)))));
142 v64sf Q
= one
+s
*(qa1
+s
*(qa2
+s
*(qa3
+s
*(qa4
+s
*(qa5
+s
*qa6
)))));
143 VECTOR_IF2 (hx
>= 0, cond2
, cond
)
144 VECTOR_RETURN (erx
+ P
/Q
, cond2
);
145 VECTOR_ELSE2 (cond2
, cond
)
146 VECTOR_RETURN (-erx
- P
/Q
, cond2
);
150 VECTOR_IF (ix
>= 0x40c00000, cond
) /* inf>|x|>=6 */
151 VECTOR_IF2 (hx
>= 0, cond2
, cond
)
152 VECTOR_RETURN (VECTOR_INIT (1.0f
- tiny
), cond2
);
153 VECTOR_ELSE2 (cond2
, cond
)
154 VECTOR_RETURN (VECTOR_INIT (tiny
- 1.0f
), cond2
);
158 x
= __builtin_gcn_fabsvf(x
);
159 v64sf s
= 1.0f
/ (x
*x
);
161 VECTOR_IF (ix
< 0x4036DB6E, cond
) /* |x| < 1/0.35 */
162 VECTOR_COND_MOVE (R
, ra0
+s
*(ra1
+s
*(ra2
+s
*(ra3
+s
*(ra4
+s
*(
163 ra5
+s
*(ra6
+s
*ra7
)))))), cond
);
164 VECTOR_COND_MOVE (S
, one
+s
*(sa1
+s
*(sa2
+s
*(sa3
+s
*(sa4
+s
*(
165 sa5
+s
*(sa6
+s
*(sa7
+s
*sa8
))))))), cond
);
166 VECTOR_ELSE (cond
) /* |x| >= 1/0.35 */
167 VECTOR_COND_MOVE (R
, rb0
+s
*(rb1
+s
*(rb2
+s
*(rb3
+s
*(rb4
+s
*(
168 rb5
+s
*rb6
))))), cond
);
169 VECTOR_COND_MOVE (S
, one
+s
*(sb1
+s
*(sb2
+s
*(sb3
+s
*(sb4
+s
*(
170 sb5
+s
*(sb6
+s
*sb7
)))))), cond
);
173 GET_FLOAT_WORD (ix
, x
, NO_COND
);
175 SET_FLOAT_WORD (z
, ix
& 0xfffff000, NO_COND
);
176 v64sf r
= v64sf_expf_aux (-z
*z
- 0.5625f
, __mask
)
177 * v64sf_expf_aux ((z
-x
)*(z
+x
) + R
/S
, __mask
);
178 VECTOR_RETURN (one
- r
/x
, hx
>= 0);
179 VECTOR_RETURN (r
/x
- one
, hx
< 0);
184 DEF_VARIANTS (erff
, sf
, sf
)