2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 #include
<clc
/clcmacro.h
>
27 _CLC_OVERLOAD _CLC_DEF float acos
(float x
) {
28 // Computes arccos
(x).
29 // The argument is first reduced by noting that arccos
(x)
30 // is invalid for abs
(x) > 1. For denormal and small
31 // arguments arccos
(x) = pi
/2 to machine accuracy.
32 // Remaining argument ranges are handled as follows.
33 // For abs
(x) <= 0.5 use
34 // arccos
(x) = pi
/2 - arcsin
(x)
35 // = pi
/2 -
(x + x^
3*R
(x^
2))
36 // where R
(x^
2) is a rational minimax approximation to
37 // (arcsin(x) - x
)/x^
3.
38 // For abs
(x) > 0.5 exploit the identity
:
39 // arccos
(x) = pi -
2*arcsin
(sqrt(1-x)/2)
40 // together with the above rational approximation
, and
41 // reconstruct the terms carefully.
44 // Some constants and split constants.
45 const float piby2
= 1.5707963705e+00F
;
46 const float pi
= 3.1415926535897933e+00F
;
47 const float piby2_head
= 1.5707963267948965580e+00F
;
48 const float piby2_tail
= 6.12323399573676603587e-17F
;
51 uint aux
= ux
& ~SIGNBIT_SP32
;
53 int xexp
= (int)(aux >> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
;
54 float y
= as_float
(aux);
56 // transform if |x|
>= 0.5
57 int transform
= xexp
>= -
1;
60 float yt
= 0.5f
* (1.0f - y
);
61 float r
= transform ? yt
: y2
;
63 // Use a rational approximation for
[0.0, 0.5]
66 mad
(r, -
0.00396137437848476485201154797087F
, -
0.0133819288943925804214011424456F
),
67 -
0.0565298683201845211985026327361F
),
68 0.184161606965100694821398249421F
);
70 float b
= mad
(r, -
0.836411276854206731913362287293F
, 1.10496961524520294485512696706F
);
71 float u
= r
* MATH_DIVIDE
(a, b
);
73 float s
= MATH_SQRT
(r);
75 float s1
= as_float
(as_uint(s) & 0xffff0000);
76 float c
= MATH_DIVIDE
(mad(s1, -s1
, r
), s
+ s1
);
77 float rettn
= mad
(s + mad
(y, u
, -piby2_tail
), -
2.0f
, pi
);
78 float rettp
= 2.0F
* (s1 + mad
(y, u
, c
));
79 float rett
= xneg ? rettn
: rettp
;
80 float ret
= piby2_head -
(x - mad
(x, -u
, piby2_tail
));
82 ret
= transform ? rett
: ret
;
83 ret
= aux
> 0x3f800000U ? as_float
(QNANBITPATT_SP32) : ret
;
84 ret
= ux
== 0x3f800000U ?
0.0f
: ret
;
85 ret
= ux
== 0xbf800000U ? pi
: ret
;
86 ret
= xexp
< -
26 ? piby2
: ret
;
90 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, acos
, float
);
94 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
96 _CLC_OVERLOAD _CLC_DEF double acos
(double x
) {
97 // Computes arccos
(x).
98 // The argument is first reduced by noting that arccos
(x)
99 // is invalid for abs
(x) > 1. For denormal and small
100 // arguments arccos
(x) = pi
/2 to machine accuracy.
101 // Remaining argument ranges are handled as follows.
102 // For abs
(x) <= 0.5 use
103 // arccos
(x) = pi
/2 - arcsin
(x)
104 // = pi
/2 -
(x + x^
3*R
(x^
2))
105 // where R
(x^
2) is a rational minimax approximation to
106 // (arcsin(x) - x
)/x^
3.
107 // For abs
(x) > 0.5 exploit the identity
:
108 // arccos
(x) = pi -
2*arcsin
(sqrt(1-x)/2)
109 // together with the above rational approximation
, and
110 // reconstruct the terms carefully.
112 const double pi
= 3.1415926535897933e+00; /* 0x400921fb54442d18 */
113 const double piby2
= 1.5707963267948965580e+00; /* 0x3ff921fb54442d18 */
114 const double piby2_head
= 1.5707963267948965580e+00; /* 0x3ff921fb54442d18 */
115 const double piby2_tail
= 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
118 int xneg
= as_int2
(x).hi
< 0;
119 int xexp
= (as_int2(y).hi
>> 20) - EXPBIAS_DP64
;
122 int transform
= xexp
>= -
1;
124 double rt
= 0.5 * (1.0 - y
);
126 double r
= transform ? rt
: y2
;
128 // Use a rational approximation for
[0.0, 0.5]
133 fma
(r, 0.0000482901920344786991880522822991,
134 0.00109242697235074662306043804220),
135 -
0.0549989809235685841612020091328),
136 0.275558175256937652532686256258),
137 -
0.445017216867635649900123110649),
138 0.227485835556935010735943483075);
143 fma
(r, 0.105869422087204370341222318533,
144 -
0.943639137032492685763471240072),
145 2.76568859157270989520376345954),
146 -
3.28431505720958658909889444194),
147 1.36491501334161032038194214209);
149 double u
= r
* MATH_DIVIDE
(un, ud
);
151 // Reconstruct acos carefully in transformed region
153 double ztn
= fma
(-2.0
, (s + fma
(s, u
, -piby2_tail
)), pi
);
155 double s1
= as_double
(as_ulong(s) & 0xffffffff00000000UL
);
156 double c
= MATH_DIVIDE
(fma(-s1, s1
, r
), s
+ s1
);
157 double ztp
= 2.0 * (s1 + fma
(s, u
, c
));
158 double zt
= xneg ? ztn
: ztp
;
159 double z
= piby2_head -
(x - fma
(-x, u
, piby2_tail
));
161 z
= transform ? zt
: z
;
163 z
= xexp
< -
56 ? piby2
: z
;
164 z
= isnan
(x) ? as_double
((as_ulong(x) | QNANBITPATT_DP64
)) : z
;
165 z
= x
== 1.0 ?
0.0 : z
;
166 z
= x
== -
1.0 ? pi
: z
;
171 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, acos
, double
);
173 #endif
// cl_khr_fp64
177 #pragma OPENCL EXTENSION cl_khr_fp16
: enable
179 _CLC_DEFINE_UNARY_BUILTIN_FP16
(acos)