2 * Copyright 2023 Siemens
4 * The authors hereby grant permission to use, copy, modify, distribute,
5 * and license this software and its documentation for any purpose, provided
6 * that existing copyright notices are retained in all copies and that this
7 * notice is included verbatim in any distributions. No written agreement,
8 * license, or royalty fee is required for any of the authorized uses.
9 * Modifications to this software may be copyrighted by their authors
10 * and need not follow the licensing terms described here, provided that
11 * the new terms are clearly indicated on the first page of each file where
16 * ====================================================
17 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
19 * Developed at SunPro, a Sun Microsystems, Inc. business.
20 * Permission to use, copy, modify, and distribute this
21 * software is freely granted, provided that this notice
23 * ====================================================
27 /* Based on newlib/libm/math/er_lgamma.c in Newlib. */
29 #include "amdgcnmach.h"
31 static const double two52
= 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
32 half
= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
33 one
= 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
34 pi
= 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
35 a0
= 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
36 a1
= 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
37 a2
= 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
38 a3
= 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
39 a4
= 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
40 a5
= 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
41 a6
= 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
42 a7
= 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
43 a8
= 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
44 a9
= 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
45 a10
= 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
46 a11
= 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
47 tc
= 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
48 tf
= -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
49 /* tt = -(tail of tf) */
50 tt
= -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
51 t0
= 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
52 t1
= -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
53 t2
= 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
54 t3
= -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
55 t4
= 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
56 t5
= -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
57 t6
= 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
58 t7
= -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
59 t8
= 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
60 t9
= -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
61 t10
= 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
62 t11
= -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
63 t12
= 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
64 t13
= -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
65 t14
= 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
66 u0
= -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
67 u1
= 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
68 u2
= 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
69 u3
= 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
70 u4
= 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
71 u5
= 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
72 v1
= 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
73 v2
= 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
74 v3
= 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
75 v4
= 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
76 v5
= 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
77 s0
= -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
78 s1
= 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
79 s2
= 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
80 s3
= 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
81 s4
= 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
82 s5
= 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
83 s6
= 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
84 r1
= 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
85 r2
= 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
86 r3
= 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
87 r4
= 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
88 r5
= 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
89 r6
= 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
90 w0
= 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
91 w1
= 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
92 w2
= -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
93 w3
= 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
94 w4
= -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
95 w5
= 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
96 w6
= -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
98 static const double zero
= 0.00000000000000000000e+00;
100 v64df
v64df_cos_aux (v64df x
, v64di __mask
);
101 v64df
v64df_log_aux (v64df x
, v64di __mask
);
102 v64df
v64df_sin_aux (v64df x
, v64di __mask
);
104 #if defined (__has_builtin) \
105 && __has_builtin (__builtin_gcn_floorv) \
106 && __has_builtin (__builtin_gcn_fabsv)
109 v64df_sin_pi (v64df x
)
111 // Explicitly create mask for internal function.
112 v64di __mask
= VECTOR_INIT (-1L);
113 FUNCTION_INIT (v64df
);
118 GET_HIGH_WORD (ix
, x
, NO_COND
);
121 VECTOR_IF (ix
< 0x3fd00000, cond
)
122 VECTOR_RETURN (v64df_sin_aux (pi
* x
, __mask
), cond
);
124 y
= -x
; /* x is assume negative */
127 * argument reduction, make sure inexact flag not raised if input
130 z
= __builtin_gcn_floorv (y
);
131 VECTOR_IF (z
!= y
, cond
)
133 VECTOR_COND_MOVE(y
, y
* 0.5, cond
);
134 VECTOR_COND_MOVE(y
, 2.0 * (y
- __builtin_gcn_floorv (y
)), cond
); /* y = |x| mod 2.0 */
135 VECTOR_COND_MOVE(n
, __builtin_convertvector(y
* 4.0, v64si
), cond
);
137 VECTOR_IF2 (__builtin_convertvector(ix
>= 0x43400000, v64di
), cond2
, cond
)
138 VECTOR_COND_MOVE(y
, VECTOR_INIT(zero
), cond2
);
139 VECTOR_COND_MOVE(n
, VECTOR_INIT(0), cond2
); /* y must be even */
140 VECTOR_ELSE2 (cond2
, cond
)
141 VECTOR_COND_MOVE(z
, y
+ two52
/* exact */, cond2
& __builtin_convertvector(ix
< 0x43300000, v64di
));
142 GET_LOW_WORD (n
, z
, cond2
);
143 VECTOR_COND_MOVE(n
, n
& 1, cond2
);
144 VECTOR_COND_MOVE(y
, __builtin_convertvector(n
, v64df
), cond2
);
145 VECTOR_COND_MOVE(n
, n
<< 2, cond2
);
149 VECTOR_IF (n
== 0, cond
)
150 VECTOR_COND_MOVE(y
, v64df_sin_aux (pi
* y
, __mask
), cond
);
151 VECTOR_ELSEIF (n
== 1 | n
== 2, cond
)
152 VECTOR_COND_MOVE(y
, v64df_cos_aux (pi
* (0.5 - y
), __mask
), cond
);
153 VECTOR_ELSEIF (n
== 3 | n
== 4, cond
)
154 VECTOR_COND_MOVE(y
, v64df_sin_aux (pi
* (VECTOR_INIT(one
) - y
), __mask
), cond
);
155 VECTOR_ELSEIF (n
== 5 | n
== 6, cond
)
156 VECTOR_COND_MOVE(y
, -v64df_cos_aux (pi
* (y
- 1.5), __mask
), cond
);
158 VECTOR_COND_MOVE(y
, v64df_sin_aux (pi
* (y
- 2.0), __mask
), cond
);
161 VECTOR_RETURN(-y
, NO_COND
);
165 DEF_VD_MATH_FUNC (v64df
, lgamma_r
, v64df x
, v64si
*signgamp
)
167 FUNCTION_INIT (v64df
);
169 v64df t
,y
,z
,nadj
= VECTOR_INIT(0.0),p
,p1
,p2
,p3
,q
,r
,w
;
172 EXTRACT_WORDS(hx
,lx
,x
);
174 /* purge off +-inf, NaN, +-0, and negative arguments */
175 *signgamp
= VECTOR_INIT(1);
177 VECTOR_IF(ix
>=0x7ff00000, cond
)
178 VECTOR_RETURN (x
*x
, cond
);
180 VECTOR_IF((ix
|lx
)==0, cond
)
181 VECTOR_COND_MOVE(*signgamp
, VECTOR_INIT(-1), cond
& (hx
<0));
182 VECTOR_RETURN(one
/(x
-x
), cond
);
184 VECTOR_IF (ix
< 0x3b900000, cond
) /* |x|<2**-70, return -log(|x|) */
185 VECTOR_IF2(hx
<0, cond2
, cond
)
186 VECTOR_COND_MOVE(*signgamp
, VECTOR_INIT(-1), cond
);
187 VECTOR_RETURN (-v64df_log_aux(-x
, __mask
), cond2
);
188 VECTOR_ELSE2(cond2
, cond
)
189 VECTOR_RETURN (-v64df_log_aux(x
, __mask
), cond2
);
192 VECTOR_IF (hx
< 0, cond
)
193 VECTOR_IF2(ix
>=0x43300000, cond2
, cond
) /* |x|>=2**52, must be -integer */
194 VECTOR_RETURN(one
/(x
-x
), cond2
); /* -integer */
196 VECTOR_COND_MOVE (t
, v64df_sin_pi (x
), cond
);
197 VECTOR_IF2(__builtin_convertvector(t
==zero
, v64si
), cond2
, cond
)
198 VECTOR_RETURN(one
/(x
-x
), cond2
); /* -integer */
200 VECTOR_COND_MOVE(nadj
, v64df_log_aux(VECTOR_INIT(pi
)/__builtin_gcn_fabsv(t
*x
), __mask
), cond
);
201 VECTOR_COND_MOVE(*signgamp
, VECTOR_INIT(-1), cond
& __builtin_convertvector(t
< zero
, v64si
));
202 VECTOR_COND_MOVE(x
, -x
, cond
);
205 /* purge off 1 and 2 */
206 VECTOR_IF((((ix
-0x3ff00000)|lx
)==0)|(((ix
-0x40000000)|lx
)==0), cond
)
207 VECTOR_COND_MOVE(r
, VECTOR_INIT(0.0), cond
);
209 VECTOR_ELSEIF(ix
<0x40000000, cond
)
210 VECTOR_IF2(ix
<=0x3feccccc, cond2
, cond
)
211 /* lgamma(x) = lgamma(x+1)-log(x) */
212 r
= -v64df_log_aux(x
, __mask
);
213 VECTOR_IF2(ix
>=0x3FE76944, cond3
, cond2
)
214 VECTOR_COND_MOVE(y
, one
-x
, cond3
);
215 VECTOR_COND_MOVE(i
, VECTOR_INIT(0), cond3
);
216 VECTOR_ELSEIF2(ix
>=0x3FCDA661, cond3
, cond2
)
217 VECTOR_COND_MOVE(y
, x
-(tc
-one
), cond3
);
218 VECTOR_COND_MOVE(i
, VECTOR_INIT(1), cond3
);
219 VECTOR_ELSE2(cond3
, cond2
)
220 VECTOR_COND_MOVE(y
, x
, cond3
);
221 VECTOR_COND_MOVE(i
, VECTOR_INIT(2), cond3
);
223 VECTOR_ELSE2(cond2
, cond
)
224 VECTOR_COND_MOVE(r
, VECTOR_INIT(zero
), cond2
);
225 VECTOR_IF2(ix
>=0x3FFBB4C3, cond3
, cond2
) /* [1.7316,2] */
226 VECTOR_COND_MOVE(y
, VECTOR_INIT(2.0)-x
, cond3
);
227 VECTOR_COND_MOVE(i
, VECTOR_INIT(0), cond3
);
228 VECTOR_ELSEIF2(ix
>=0x3FF3B4C4, cond3
, cond2
) /* [1.23,1.73] */
229 VECTOR_COND_MOVE(y
, x
-tc
, cond3
);
230 VECTOR_COND_MOVE(i
, VECTOR_INIT(1), cond3
);
231 VECTOR_ELSE2(cond3
, cond2
)
232 VECTOR_COND_MOVE(y
, x
-one
, cond3
);
233 VECTOR_COND_MOVE(i
, VECTOR_INIT(2), cond3
);
237 VECTOR_IF2(i
==0, cond2
, cond
)
238 VECTOR_COND_MOVE(z
, y
*y
, cond2
);
239 VECTOR_COND_MOVE(p1
, a0
+z
*(a2
+z
*(a4
+z
*(a6
+z
*(a8
+z
*a10
)))), cond2
);
240 VECTOR_COND_MOVE(p2
, z
*(a1
+z
*(a3
+z
*(a5
+z
*(a7
+z
*(a9
+z
*a11
))))), cond2
);
241 VECTOR_COND_MOVE(p
, y
*p1
+p2
, cond2
);
242 VECTOR_COND_MOVE(r
, r
+ (p
-0.5*y
), cond2
);
243 VECTOR_ELSEIF2(i
==1, cond2
, cond
)
244 VECTOR_COND_MOVE(z
, y
*y
, cond2
);
245 VECTOR_COND_MOVE(w
, z
*y
, cond2
);
246 VECTOR_COND_MOVE(p1
, t0
+w
*(t3
+w
*(t6
+w
*(t9
+w
*t12
))), cond2
); /* parallel comp */
247 VECTOR_COND_MOVE(p2
, t1
+w
*(t4
+w
*(t7
+w
*(t10
+w
*t13
))), cond2
);
248 VECTOR_COND_MOVE(p3
, t2
+w
*(t5
+w
*(t8
+w
*(t11
+w
*t14
))), cond2
);
249 VECTOR_COND_MOVE(p
, z
*p1
-(tt
-w
*(p2
+y
*p3
)), cond2
);
250 VECTOR_COND_MOVE(r
, r
+ (tf
+ p
), cond2
);
251 VECTOR_ELSEIF2(i
==2, cond2
, cond
)
252 VECTOR_COND_MOVE(p1
, y
*(u0
+y
*(u1
+y
*(u2
+y
*(u3
+y
*(u4
+y
*u5
))))), cond2
);
253 VECTOR_COND_MOVE(p2
, one
+y
*(v1
+y
*(v2
+y
*(v3
+y
*(v4
+y
*v5
)))), cond2
);
254 VECTOR_COND_MOVE(r
, r
+ (-0.5*y
+ p1
/p2
), cond2
);
256 VECTOR_ELSEIF(ix
<0x40200000, cond
)
258 VECTOR_COND_MOVE(i
, __builtin_convertvector(x
, v64si
), cond
);
259 VECTOR_COND_MOVE(t
, VECTOR_INIT(zero
), cond
);
260 VECTOR_COND_MOVE(y
, x
-__builtin_convertvector(i
, v64df
), cond
);
261 VECTOR_COND_MOVE(p
, y
*(s0
+y
*(s1
+y
*(s2
+y
*(s3
+y
*(s4
+y
*(s5
+y
*s6
)))))), cond
);
262 VECTOR_COND_MOVE(q
, one
+y
*(r1
+y
*(r2
+y
*(r3
+y
*(r4
+y
*(r5
+y
*r6
))))), cond
);
263 VECTOR_COND_MOVE(r
, half
*y
+p
/q
, cond
);
264 VECTOR_COND_MOVE(z
, VECTOR_INIT(one
), cond
); /* lgamma(1+s) = log(s) + lgamma(s) */
265 VECTOR_IF2(i
==7, cond2
, cond
)
266 VECTOR_COND_MOVE(z
, z
* (y
+6.0), cond2
);
268 VECTOR_IF2(i
==7 | i
==6, cond2
, cond
)
269 VECTOR_COND_MOVE(z
, z
* (y
+5.0), cond2
);
271 VECTOR_IF2(i
<=7 & i
>=5, cond2
, cond
)
272 VECTOR_COND_MOVE(z
, z
* (y
+4.0), cond2
);
274 VECTOR_IF2(i
<=7 & i
>=4, cond2
, cond
)
275 VECTOR_COND_MOVE(z
, z
* (y
+3.0), cond2
);
277 VECTOR_IF2(i
<=7 & i
>=3, cond2
, cond
)
278 VECTOR_COND_MOVE(z
, z
* (y
+2.0), cond2
);
279 VECTOR_COND_MOVE(r
, r
+ v64df_log_aux(z
, __mask
), cond2
);
281 /* 8.0 <= x < 2**58 */
282 VECTOR_ELSEIF(ix
< 0x43900000, cond
)
283 VECTOR_COND_MOVE(t
, v64df_log_aux(x
, __mask
), cond
);
284 VECTOR_COND_MOVE(z
, one
/x
, cond
);
285 VECTOR_COND_MOVE(y
, z
*z
, cond
);
286 VECTOR_COND_MOVE(w
, w0
+z
*(w1
+y
*(w2
+y
*(w3
+y
*(w4
+y
*(w5
+y
*w6
))))), cond
);
287 VECTOR_COND_MOVE(r
, (x
-half
)*(t
-one
)+w
, cond
);
289 /* 2**58 <= x <= inf */
290 VECTOR_COND_MOVE(r
, x
*(v64df_log_aux(x
, __mask
)-one
), cond
);
292 VECTOR_IF(hx
<0, cond
)
293 VECTOR_COND_MOVE(r
, nadj
- r
, cond
);
296 VECTOR_RETURN(r
, NO_COND
);