2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/shared
/clc_max.h
>
28 #include
"sincos_helpers.h"
30 #define bitalign
(hi, lo
, shift
) \
31 ((hi) << (32 -
(shift))) |
((lo) >> (shift));
33 #define bytealign
(src0, src1
, src2
) \
34 ((uint) (((((long)(src0)) << 32) |
(long)(src1)) >> (((src2) & 3)*8)))
36 _CLC_DEF float __clc_sinf_piby4
(float x
, float y
) {
37 // Taylor series for sin
(x) is x - x^
3/3! + x^
5/5! - x^
7/7! ...
38 // = x
* (1 - x^
2/3! + x^
4/5! - x^
6/7! ...
40 // where w
= x
*x and f
(w) = (1 - w
/3! + w^
2/5! - w^
3/7! ...
41 // We use a minimax approximation of
(f(w) -
1) / w
42 // because this produces an expansion in even powers of x.
44 const float c1
= -
0.1666666666e0f
;
45 const float c2
= 0.8333331876e-2f
;
46 const float c3
= -
0.198400874e-3f
;
47 const float c4
= 0.272500015e-5f
;
48 const float c5
= -
2.5050759689e-08f
; // 0xb2d72f34
49 const float c6
= 1.5896910177e-10f
; // 0x2f2ec9d3
53 float r
= mad
(z, mad
(z, mad
(z, mad
(z, c6
, c5
), c4
), c3
), c2
);
54 float ret
= x - mad
(v, -c1
, mad
(z, mad
(y, 0.5f
, -v
*r
), -y
));
59 _CLC_DEF float __clc_cosf_piby4
(float x
, float y
) {
60 // Taylor series for cos
(x) is
1 - x^
2/2! + x^
4/4! - x^
6/6! ...
62 // where w
= x
*x and f
(w) = (1 - w
/2! + w^
2/4! - w^
3/6! ...
63 // We use a minimax approximation of
(f(w) -
1 + w
/2) / (w*w
)
64 // because this produces an expansion in even powers of x.
66 const float c1
= 0.416666666e-1f
;
67 const float c2
= -
0.138888876e-2f
;
68 const float c3
= 0.248006008e-4f
;
69 const float c4
= -
0.2730101334e-6f
;
70 const float c5
= 2.0875723372e-09f
; // 0x310f74f6
71 const float c6
= -
1.1359647598e-11f
; // 0xad47d74e
74 float r
= z
* mad
(z, mad
(z, mad
(z, mad
(z, mad
(z, c6
, c5
), c4
), c3
), c2
), c1
);
79 int ix
= as_int
(x) & EXSIGNBIT_SP32
;
81 // 0.78125 > |x|
>= 0.3
82 float xby4
= as_float
(ix -
0x01000000);
83 qx
= (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4
: qx
;
86 qx
= ix
> 0x3f480000 ?
0.28125f
: qx
;
88 float hz
= mad
(z, 0.5f
, -qx
);
90 float ret
= a -
(hz - mad
(z, r
, -x
*y
));
94 _CLC_DEF float __clc_tanf_piby4
(float x
, int regn
)
96 // Core Remez
[1,2] approximation to tan
(x) on the interval
[0,pi
/4].
99 float a
= mad
(r, -
0.0172032480471481694693109f
, 0.385296071263995406715129f
);
102 mad
(r, 0.01844239256901656082986661f
, -
0.51396505478854532132342f
),
103 1.15588821434688393452299f
);
105 float t
= mad
(x*r
, native_divide
(a, b
), x
);
106 float tr
= -MATH_RECIP
(t);
108 return regn
& 1 ? tr
: t
;
111 _CLC_DEF void __clc_fullMulS
(float *hi
, float
*lo
, float a
, float b
, float bh
, float bt
)
113 if
(HAVE_HW_FMA32()) {
116 *lo
= fma
(a, b
, -ph
);
118 float ah
= as_float
(as_uint(a) & 0xfffff000U
);
121 float pt
= mad
(at, bt
, mad
(at, bh
, mad
(ah, bt
, mad
(ah, bh
, -ph
))));
127 _CLC_DEF float __clc_removePi2S
(float *hi
, float
*lo
, float x
)
130 const float fpiby2_1
= (float) 0xC90FDA / 0x1.0p
+23f
;
131 const float fpiby2_1_h
= (float) 0xC90 / 0x1.0p
+11f
;
132 const float fpiby2_1_t
= (float) 0xFDA / 0x1.0p
+23f
;
134 const float fpiby2_2
= (float) 0xA22168 / 0x1.0p
+47f
;
135 const float fpiby2_2_h
= (float) 0xA22 / 0x1.0p
+35f
;
136 const float fpiby2_2_t
= (float) 0x168 / 0x1.0p
+47f
;
138 const float fpiby2_3
= (float) 0xC234C4 / 0x1.0p
+71f
;
139 const float fpiby2_3_h
= (float) 0xC23 / 0x1.0p
+59f
;
140 const float fpiby2_3_t
= (float) 0x4C4 / 0x1.0p
+71f
;
142 const float twobypi
= 0x1.45f306p-1f
;
144 float fnpi2
= trunc
(mad(x, twobypi
, 0.5f
));
146 // subtract n
* pi
/2 from x
148 __clc_fullMulS
(&rhead
, &rtail
, fnpi2
, fpiby2_1
, fpiby2_1_h
, fpiby2_1_t
);
150 float rem
= v
+ (((x - v
) - rhead
) - rtail
);
152 float rhead2
, rtail2
;
153 __clc_fullMulS
(&rhead2
, &rtail2
, fnpi2
, fpiby2_2
, fpiby2_2_h
, fpiby2_2_t
);
155 rem
= v
+ (((rem - v
) - rhead2
) - rtail2
);
157 float rhead3
, rtail3
;
158 __clc_fullMulS
(&rhead3
, &rtail3
, fnpi2
, fpiby2_3
, fpiby2_3_h
, fpiby2_3_t
);
161 *hi
= v
+ ((rem - v
) - rhead3
);
166 _CLC_DEF int __clc_argReductionSmallS
(float *r
, float
*rr
, float x
)
168 float fnpi2
= __clc_removePi2S
(r, rr
, x
);
169 return
(int)fnpi2
& 0x3;
172 #define FULL_MUL
(A, B
, HI
, LO
) \
176 #define FULL_MAD
(A, B
, C
, HI
, LO
) \
177 LO
= ((A) * (B) + (C)); \
181 _CLC_DEF int __clc_argReductionLargeS
(float *r
, float
*rr
, float x
)
183 int xe
= (int)(as_uint(x) >> 23) -
127;
184 uint xm
= 0x00800000U |
(as_uint(x) & 0x7fffffU
);
186 // 224 bits of
2/PI
: . A2F9836E
4E441529 FC2757D1 F534DDC0 DB629599
3C439041 FE5163AB
187 const uint b6
= 0xA2F9836EU
;
188 const uint b5
= 0x4E441529U
;
189 const uint b4
= 0xFC2757D1U
;
190 const uint b3
= 0xF534DDC0U
;
191 const uint b2
= 0xDB629599U
;
192 const uint b1
= 0x3C439041U
;
193 const uint b0
= 0xFE5163ABU
;
195 uint p0
, p1
, p2
, p3
, p4
, p5
, p6
, p7
, c0
, c1
;
197 FULL_MUL
(xm, b0
, c0
, p0
);
198 FULL_MAD
(xm, b1
, c0
, c1
, p1
);
199 FULL_MAD
(xm, b2
, c1
, c0
, p2
);
200 FULL_MAD
(xm, b3
, c0
, c1
, p3
);
201 FULL_MAD
(xm, b4
, c1
, c0
, p4
);
202 FULL_MAD
(xm, b5
, c0
, c1
, p5
);
203 FULL_MAD
(xm, b6
, c1
, p7
, p6
);
205 uint fbits
= 224 + 23 - xe
;
207 // shift amount to get
2 lsb of integer part at top
2 bits
208 // min
: 25 (xe=18) max
: 134 (xe=127)
209 uint shift
= 256U -
2 - fbits
;
211 // Shift by up to
134/32 = 4 words
246 // bitalign cannot handle a shift of
32
249 uint t7
= bitalign
(p7, p6
, shift
);
250 uint t6
= bitalign
(p6, p5
, shift
);
251 uint t5
= bitalign
(p5, p4
, shift
);
256 // Get
2 lsb of int part and msb of fraction
259 // Scoot up
2 more bits so only fraction remains
260 p7
= bitalign
(p7, p6
, 30);
261 p6
= bitalign
(p6, p5
, 30);
262 p5
= bitalign
(p5, p4
, 30);
264 // Subtract
1 if msb of fraction is
1, i.e. fraction
>= 0.5
265 uint flip
= i
& 1 ?
0xffffffffU
: 0U;
266 uint sign
= i
& 1 ?
0x80000000U
: 0U;
271 // Find exponent and shift away leading zeroes and hidden bit
274 p7
= bitalign
(p7, p6
, shift
);
275 p6
= bitalign
(p6, p5
, shift
);
277 // Most significant part of fraction
278 float q1
= as_float
(sign |
((127 - xe
) << 23) |
(p7 >> 9));
280 // Shift out bits we captured on q1
281 p7
= bitalign
(p7, p6
, 32-
23);
283 // Get
24 more bits of fraction in another float
, there are not long strings of zeroes here
284 int xxe
= clz
(p7) + 1;
285 p7
= bitalign
(p7, p6
, 32-xxe
);
286 float q0
= as_float
(sign |
((127 -
(xe + 23 + xxe
)) << 23) |
(p7 >> 9));
288 // At this point
, the fraction q1
+ q0 is correct to at least
48 bits
289 // Now we need to multiply the fraction by pi
/2
290 // This loses us about
4 bits
291 // pi
/2 = C90 FDA A22
168 C23
4C4
293 const float pio2h
= (float)0xc90fda / 0x1.0p
+23f
;
294 const float pio2hh
= (float)0xc90 / 0x1.0p
+11f
;
295 const float pio2ht
= (float)0xfda / 0x1.0p
+23f
;
296 const float pio2t
= (float)0xa22168 / 0x1.0p
+47f
;
300 if
(HAVE_HW_FMA32()) {
302 rt
= fma
(q0, pio2h
, fma
(q1, pio2t
, fma
(q1, pio2h
, -rh
)));
304 float q1h
= as_float
(as_uint(q1) & 0xfffff000);
305 float q1t
= q1 - q1h
;
307 rt
= mad
(q1t, pio2ht
, mad
(q1t, pio2hh
, mad
(q1h, pio2ht
, mad
(q1h, pio2hh
, -rh
))));
308 rt
= mad
(q0, pio2h
, mad
(q1, pio2t
, rt
));
316 return
((i >> 1) + (i & 1)) & 0x3;
319 _CLC_DEF int __clc_argReductionS
(float *r
, float
*rr
, float x
)
322 return __clc_argReductionSmallS
(r, rr
, x
);
324 return __clc_argReductionLargeS
(r, rr
, x
);
329 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
331 // Reduction for medium sized arguments
332 _CLC_DEF void __clc_remainder_piby2_medium
(double x
, double
*r
, double
*rr
, int
*regn
) {
333 // How many pi
/2 is x a multiple of?
334 const double two_by_pi
= 0x1.45f306dc9c883p-1
;
335 double dnpi2
= trunc
(fma(x, two_by_pi
, 0.5));
337 const double piby2_h
= -
7074237752028440.0 / 0x1.0p
+52;
338 const double piby2_m
= -
2483878800010755.0 / 0x1.0p
+105;
339 const double piby2_t
= -
3956492004828932.0 / 0x1.0p
+158;
341 // Compute product of npi2 with
159 bits of
2/pi
342 double p_hh
= piby2_h
* dnpi2
;
343 double p_ht
= fma
(piby2_h, dnpi2
, -p_hh
);
344 double p_mh
= piby2_m
* dnpi2
;
345 double p_mt
= fma
(piby2_m, dnpi2
, -p_mh
);
346 double p_th
= piby2_t
* dnpi2
;
347 double p_tt
= fma
(piby2_t, dnpi2
, -p_th
);
349 // Reduce to
159 bits
351 double pm
= p_ht
+ p_mh
;
352 double t
= p_mh -
(pm - p_ht
);
353 double pt
= p_th
+ t
+ p_mt
+ p_tt
;
354 t
= ph
+ pm
; pm = pm - (t - ph); ph = t;
355 t
= pm
+ pt
; pt = pt - (t - pm); pm = t;
360 double qt
= pm -
(qh - t
) + pt
;
364 *regn
= (int)(long)dnpi2
& 0x3;
367 // Given positive argument x
, reduce it to the range
[-pi
/4,pi
/4] using
368 // extra precision
, and return the result in r
, rr.
369 // Return value
"regn" tells how many lots of pi
/2 were subtracted
370 // from x to put it in the range
[-pi
/4,pi
/4], mod
4.
372 _CLC_DEF void __clc_remainder_piby2_large
(double x
, double
*r
, double
*rr
, int
*regn
) {
374 long ux
= as_long
(x);
375 int e
= (int)(ux >> 52) -
1023;
376 int i
= __clc_max
(23, (e >> 3) + 17);
381 // The following extracts
192 consecutive bits of
2/pi aligned on an arbitrary byte boundary
382 uint4 q0
= USE_TABLE
(pibits_tbl, j16
);
383 uint4 q1
= USE_TABLE
(pibits_tbl, (j16 + 16));
384 uint4 q2
= USE_TABLE
(pibits_tbl, (j16 + 32));
386 int k
= (j >> 2) & 0x3;
387 int4 c
= (int4)k
== (int4)(0, 1, 2, 3);
389 uint u0
, u1
, u2
, u3
, u4
, u5
, u6
;
391 u0
= c.s1 ? q0.s1
: q0.s0
;
392 u0
= c.s2 ? q0.s2
: u0
;
393 u0
= c.s3 ? q0.s3
: u0
;
395 u1
= c.s1 ? q0.s2
: q0.s1
;
396 u1
= c.s2 ? q0.s3
: u1
;
397 u1
= c.s3 ? q1.s0
: u1
;
399 u2
= c.s1 ? q0.s3
: q0.s2
;
400 u2
= c.s2 ? q1.s0
: u2
;
401 u2
= c.s3 ? q1.s1
: u2
;
403 u3
= c.s1 ? q1.s0
: q0.s3
;
404 u3
= c.s2 ? q1.s1
: u3
;
405 u3
= c.s3 ? q1.s2
: u3
;
407 u4
= c.s1 ? q1.s1
: q1.s0
;
408 u4
= c.s2 ? q1.s2
: u4
;
409 u4
= c.s3 ? q1.s3
: u4
;
411 u5
= c.s1 ? q1.s2
: q1.s1
;
412 u5
= c.s2 ? q1.s3
: u5
;
413 u5
= c.s3 ? q2.s0
: u5
;
415 u6
= c.s1 ? q1.s3
: q1.s2
;
416 u6
= c.s2 ? q2.s0
: u6
;
417 u6
= c.s3 ? q2.s1
: u6
;
419 uint v0
= bytealign
(u1, u0
, j
);
420 uint v1
= bytealign
(u2, u1
, j
);
421 uint v2
= bytealign
(u3, u2
, j
);
422 uint v3
= bytealign
(u4, u3
, j
);
423 uint v4
= bytealign
(u5, u4
, j
);
424 uint v5
= bytealign
(u6, u5
, j
);
426 // Place those
192 bits in
4 48-bit doubles along with correct exponent
427 // If i
> 1018 we would get subnormals so we scale p up and x down to get the same product
429 x
*= i
> 1018 ?
0x1.0p-136
: 1.0;
430 i -
= i
> 1018 ?
136 : 0;
432 uint ua
= (uint)(1023 + 52 - i
) << 20;
433 double a
= as_double
((uint2)(0, ua
));
434 double p0
= as_double
((uint2)(v0, ua |
(v1 & 0xffffU
))) - a
;
436 a
= as_double
((uint2)(0, ua
));
437 double p1
= as_double
((uint2)((v2 << 16) |
(v1 >> 16), ua |
(v2 >> 16))) - a
;
439 a
= as_double
((uint2)(0, ua
));
440 double p2
= as_double
((uint2)(v3, ua |
(v4 & 0xffffU
))) - a
;
442 a
= as_double
((uint2)(0, ua
));
443 double p3
= as_double
((uint2)((v5 << 16) |
(v4 >> 16), ua |
(v5 >> 16))) - a
;
447 double f0l
= fma
(p0, x
, -f0h
);
449 double f1l
= fma
(p1, x
, -f1h
);
451 double f2l
= fma
(p2, x
, -f2h
);
453 double f3l
= fma
(p3, x
, -f3h
);
455 // Accumulate product into
4 doubles
458 double f3
= f3h
+ f2h
;
459 t
= f2h -
(f3 - f3h
);
464 t
= f1h -
(f2 - s
) + t
;
469 t
= f0h -
(f1 - s
) + t
;
474 // Strip off unwanted large integer bits
475 f3
= 0x1.0p
+10 * fract
(f3 * 0x1.0p-10
, &fract_temp
);
476 f3
+= f3
+ f2
< 0.0 ?
0x1.0p
+10 : 0.0;
478 // Compute least significant integer bits
480 double di
= t - fract
(t, &fract_temp
);
483 // Shift out remaining integer part
485 s
= f3
+ f2
; t = f2 - (s - f3); f3 = s; f2 = t;
486 s
= f2
+ f1
; t = f1 - (s - f2); f2 = s; f1 = t;
489 // Subtract
1 if fraction is
>= 0.5, and update regn
495 s
= f3
+ f2
; t = f2 -(s - f3); f3 = s; f2 = t + f1;
497 // Multiply precise fraction by pi
/2 to get radians
498 const double p2h
= 7074237752028440.0 / 0x1.0p
+52;
499 const double p2t
= 4967757600021510.0 / 0x1.0p
+106;
501 double rhi
= f3
* p2h
;
502 double rlo
= fma
(f2, p2h
, fma
(f3, p2t
, fma
(f3, p2h
, -rhi
)));
505 *rr
= rlo -
(*r - rhi
);
510 _CLC_DEF double2 __clc_sincos_piby4
(double x
, double xx
) {
511 // Taylor series for sin
(x) is x - x^
3/3! + x^
5/5! - x^
7/7! ...
512 // = x
* (1 - x^
2/3! + x^
4/5! - x^
6/7! ...
514 // where w
= x
*x and f
(w) = (1 - w
/3! + w^
2/5! - w^
3/7! ...
515 // We use a minimax approximation of
(f(w) -
1) / w
516 // because this produces an expansion in even powers of x.
517 // If xx
(the tail of x
) is non-zero
, we add a correction
518 // term g
(x,xx
) = (1-x*x
/2)*xx to the result
, where g
(x,xx
)
519 // is an approximation to cos
(x)*sin
(xx) valid because
520 // xx is tiny relative to x.
522 // Taylor series for cos
(x) is
1 - x^
2/2! + x^
4/4! - x^
6/6! ...
524 // where w
= x
*x and f
(w) = (1 - w
/2! + w^
2/4! - w^
3/6! ...
525 // We use a minimax approximation of
(f(w) -
1 + w
/2) / (w*w
)
526 // because this produces an expansion in even powers of x.
527 // If xx
(the tail of x
) is non-zero
, we subtract a correction
528 // term g
(x,xx
) = x
*xx to the result
, where g
(x,xx
)
529 // is an approximation to sin
(x)*sin
(xx) valid because
530 // xx is tiny relative to x.
532 const double sc1
= -
0.166666666666666646259241729;
533 const double sc2
= 0.833333333333095043065222816e-2;
534 const double sc3
= -
0.19841269836761125688538679e-3;
535 const double sc4
= 0.275573161037288022676895908448e-5;
536 const double sc5
= -
0.25051132068021699772257377197e-7;
537 const double sc6
= 0.159181443044859136852668200e-9;
539 const double cc1
= 0.41666666666666665390037e-1;
540 const double cc2
= -
0.13888888888887398280412e-2;
541 const double cc3
= 0.248015872987670414957399e-4;
542 const double cc4
= -
0.275573172723441909470836e-6;
543 const double cc5
= 0.208761463822329611076335e-8;
544 const double cc6
= -
0.113826398067944859590880e-10;
551 double sp
= fma
(fma(fma(fma(sc6, x2
, sc5
), x2
, sc4
), x2
, sc3
), x2
, sc2
);
553 double cp
= t
+ fma
(fma(fma(fma(fma(fma(cc6, x2
, cc5
), x2
, cc4
), x2
, cc3
), x2
, cc2
), x2
, cc1
),
554 x2
*x2
, fma
(x, xx
, (1.0 - t
) - r
));
557 ret.lo
= x - fma
(-x3, sc1
, fma
(fma(-x3, sp
, 0.5*xx
), x2
, -xx
));