2 * Copyright
(c) 2014,2015 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
28 _CLC_OVERLOAD _CLC_DEF float acospi
(float x
) {
29 // Computes arccos
(x).
30 // The argument is first reduced by noting that arccos
(x)
31 // is invalid for abs
(x) > 1. For denormal and small
32 // arguments arccos
(x) = pi
/2 to machine accuracy.
33 // Remaining argument ranges are handled as follows.
34 // For abs
(x) <= 0.5 use
35 // arccos
(x) = pi
/2 - arcsin
(x)
36 // = pi
/2 -
(x + x^
3*R
(x^
2))
37 // where R
(x^
2) is a rational minimax approximation to
38 // (arcsin(x) - x
)/x^
3.
39 // For abs
(x) > 0.5 exploit the identity
:
40 // arccos
(x) = pi -
2*arcsin
(sqrt(1-x)/2)
41 // together with the above rational approximation
, and
42 // reconstruct the terms carefully.
45 // Some constants and split constants.
46 const float pi
= 3.1415926535897933e+00f
;
47 const float piby2_head
= 1.5707963267948965580e+00f
; /* 0x3ff921fb54442d18 */
48 const float piby2_tail
= 6.12323399573676603587e-17f
; /* 0x3c91a62633145c07 */
51 uint aux
= ux
& ~SIGNBIT_SP32
;
53 int xexp
= (int)(aux >> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
;
55 float y
= as_float
(aux);
57 // transform if |x|
>= 0.5
58 int transform
= xexp
>= -
1;
61 float yt
= 0.5f
* (1.0f - y
);
62 float r
= transform ? yt
: y2
;
64 // Use a rational approximation for
[0.0, 0.5]
65 float a
= mad
(r, mad
(r, mad
(r, -
0.00396137437848476485201154797087F
, -
0.0133819288943925804214011424456F
),
66 -
0.0565298683201845211985026327361F
),
67 0.184161606965100694821398249421F
);
68 float b
= mad
(r, -
0.836411276854206731913362287293F
, 1.10496961524520294485512696706F
);
69 float u
= r
* MATH_DIVIDE
(a, b
);
71 float s
= MATH_SQRT
(r);
73 float s1
= as_float
(as_uint(s) & 0xffff0000);
74 float c
= MATH_DIVIDE
(r - s1
* s1
, s
+ s1
);
75 // float rettn
= 1.0f - MATH_DIVIDE
(2.0f
* (s + (y * u - piby2_tail
)), pi
);
76 float rettn
= 1.0f - MATH_DIVIDE
(2.0f
* (s + mad
(y, u
, -piby2_tail
)), pi
);
77 // float rettp
= MATH_DIVIDE
(2.0F
* s1
+ (2.0F
* c
+ 2.0F
* y
* u
), pi
);
78 float rettp
= MATH_DIVIDE
(2.0f
*(s1 + mad
(y, u
, c
)), pi
);
79 float rett
= xneg ? rettn
: rettp
;
80 // float ret
= MATH_DIVIDE
(piby2_head -
(x -
(piby2_tail - x
* u
)), pi
);
81 float ret
= MATH_DIVIDE
(piby2_head -
(x - mad
(x, -u
, piby2_tail
)), pi
);
83 ret
= transform ? rett
: ret
;
84 ret
= aux
> 0x3f800000U ? as_float
(QNANBITPATT_SP32) : ret
;
85 ret
= ux
== 0x3f800000U ?
0.0f
: ret
;
86 ret
= ux
== 0xbf800000U ?
1.0f
: ret
;
87 ret
= xexp
< -
26 ?
0.5f
: ret
;
91 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, acospi
, float
)
94 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
96 _CLC_OVERLOAD _CLC_DEF double acospi
(double x
) {
97 // Computes arccos
(x).
98 // The argument is first reduced by noting that arccos
(x)
99 // is invalid for abs
(x) > 1. For denormal and small
100 // arguments arccos
(x) = pi
/2 to machine accuracy.
101 // Remaining argument ranges are handled as follows.
102 // For abs
(x) <= 0.5 use
103 // arccos
(x) = pi
/2 - arcsin
(x)
104 // = pi
/2 -
(x + x^
3*R
(x^
2))
105 // where R
(x^
2) is a rational minimax approximation to
106 // (arcsin(x) - x
)/x^
3.
107 // For abs
(x) > 0.5 exploit the identity
:
108 // arccos
(x) = pi -
2*arcsin
(sqrt(1-x)/2)
109 // together with the above rational approximation
, and
110 // reconstruct the terms carefully.
112 const double pi
= 0x1.921fb54442d18p
+1;
113 const double piby2_tail
= 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
116 int xneg
= as_int2
(x).hi
< 0;
117 int xexp
= (as_int2(y).hi
>> 20) - EXPBIAS_DP64
;
120 int transform
= xexp
>= -
1;
122 // Transform y into the range
[0,0.5)
123 double r1
= 0.5 * (1.0 - y
);
126 r
= transform ? r1
: r
;
127 y
= transform ? s
: y
;
129 // Use a rational approximation for
[0.0, 0.5]
134 fma
(r, 0.0000482901920344786991880522822991,
135 0.00109242697235074662306043804220),
136 -
0.0549989809235685841612020091328),
137 0.275558175256937652532686256258),
138 -
0.445017216867635649900123110649),
139 0.227485835556935010735943483075);
144 fma
(r, 0.105869422087204370341222318533,
145 -
0.943639137032492685763471240072),
146 2.76568859157270989520376345954),
147 -
3.28431505720958658909889444194),
148 1.36491501334161032038194214209);
150 double u
= r
* MATH_DIVIDE
(un, ud
);
152 // Reconstruct acos carefully in transformed region
153 double res1
= fma
(-2.0
, MATH_DIVIDE
(s + fma
(y, u
, -piby2_tail
), pi
), 1.0);
154 double s1
= as_double
(as_ulong(s) & 0xffffffff00000000UL
);
155 double c
= MATH_DIVIDE
(fma(-s1, s1
, r
), s
+ s1
);
156 double res2
= MATH_DIVIDE
(fma(2.0
, s1
, fma
(2.0
, c
, 2.0 * y
* u
)), pi
);
157 res1
= xneg ? res1
: res2
;
158 res2
= 0.5 - fma
(x, u
, x
) / pi
;
159 res1
= transform ? res1
: res2
;
161 const double qnan
= as_double
(QNANBITPATT_DP64);
162 res2
= x
== 1.0 ?
0.0 : qnan
;
163 res2
= x
== -
1.0 ?
1.0 : res2
;
164 res1
= xexp
>= 0 ? res2
: res1
;
165 res1
= xexp
< -
56 ?
0.5 : res1
;
170 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, acospi
, double
)
174 _CLC_DEFINE_UNARY_BUILTIN_FP16
(acospi)