2 * Copyright
(c) 2014,2015 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
29 _CLC_OVERLOAD _CLC_DEF float cbrt
(float x
) {
32 uint axi
= xi
& EXSIGNBIT_SP32
;
33 uint xsign
= axi ^ xi
;
36 int m
= (xi >> EXPSHIFTBITS_SP32
) - EXPBIAS_SP32
;
39 uint xisub
= as_uint
(as_float(xi |
0x3f800000) -
1.0f
);
40 int msub
= (xisub >> EXPSHIFTBITS_SP32
) -
253;
47 float mf
= as_float
((m3 + EXPBIAS_SP32
) << EXPSHIFTBITS_SP32
);
49 uint indx
= (xi & 0x007f0000) + ((xi & 0x00008000) << 1);
50 float f
= as_float
((xi & MANTBITS_SP32
) |
0x3f000000) - as_float
(indx |
0x3f000000);
53 float r
= f
* USE_TABLE
(log_inv_tbl, indx
);
54 float poly
= mad
(mad(r, 0x1.f9add4p-5f
, -
0x1.c71c72p-4f
), r
*r
, r
* 0x1.555556p-2f
);
56 // This could also be done with a
5-element table
57 float remH
= 0x1.428000p-1f
;
58 float remT
= 0x1.45f31ap-14f
;
60 remH
= rem
== -
1 ?
0x1.964000p-1f
: remH
;
61 remT
= rem
== -
1 ?
0x1.fea53ep-13f
: remT
;
63 remH
= rem
== 0 ?
0x1.000000p
+0f
: remH
;
64 remT
= rem
== 0 ?
0x0.000000p
+0f
: remT
;
66 remH
= rem
== 1 ?
0x1.428000p
+0f
: remH
;
67 remT
= rem
== 1 ?
0x1.45f31ap-13f
: remT
;
69 remH
= rem
== 2 ?
0x1.964000p
+0f
: remH
;
70 remT
= rem
== 2 ?
0x1.fea53ep-12f
: remT
;
72 float2 tv
= USE_TABLE
(cbrt_tbl, indx
);
76 float bH
= cbrtH
* remH
;
77 float bT
= mad
(cbrtH, remT
, mad
(cbrtT, remH
, cbrtT
*remT
));
79 float z
= mad
(poly, bH
, mad
(poly, bT
, bT
)) + bH
;
81 z
= as_float
(as_uint(z) | xsign
);
82 c
= axi
>= EXPBITS_SP32 | axi
== 0;
88 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, cbrt
, float
);
91 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
93 _CLC_OVERLOAD _CLC_DEF double cbrt
(double x
) {
95 int return_x
= isinf
(x) | isnan
(x) | x
== 0.0;
96 ulong ux
= as_ulong
(fabs(x));
97 int m
= (as_int2(ux).hi
>> 20) -
1023;
100 ulong uxs
= as_ulong
(as_double(0x3ff0000000000000UL | ux
) -
1.0);
101 int ms
= m
+ (as_int2(uxs).hi
>> 20) -
1022;
108 int rem
= m -
3*mby3
;
110 double mf
= as_double
((ulong)(mby3 + 1023) << 52);
112 ux
&= 0x000fffffffffffffUL
;
113 double Y
= as_double
(0x3fe0000000000000UL | ux
);
116 int index
= as_int2
(ux).hi
>> 11;
117 index
= (0x100 |
(index >> 1)) + (index & 1);
118 double F
= (double)index
* 0x1.0p-9
;
121 double r
= f
* USE_TABLE
(cbrt_inv_tbl, index-256
);
123 double z
= r
* fma
(r,
127 fma
(r, -
0x1.8090d6221a247p-6
, 0x1.ee7113506ac13p-6
),
128 -
0x1.511e8d2b3183bp-5
),
129 0x1.f9add3c0ca458p-5
),
130 -
0x1.c71c71c71c71cp-4
),
131 0x1.5555555555555p-2
);
133 double2 tv
= USE_TABLE
(cbrt_rem_tbl, rem
+2);
134 double Rem_h
= tv.s0
;
135 double Rem_t
= tv.s1
;
137 tv
= USE_TABLE
(cbrt_dbl_tbl, index-256
);
141 double b_h
= F_h
* Rem_h
;
142 double b_t
= fma
(Rem_t, F_h
, fma
(F_t, Rem_h
, F_t
*Rem_t
));
144 double ans
= fma
(z, b_h
, fma
(z, b_t
, b_t
)) + b_h
;
145 ans
= copysign
(ans*mf
, x
);
146 return return_x ? x
: ans
;
149 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, cbrt
, double
)
155 #pragma OPENCL EXTENSION cl_khr_fp16
: enable
157 _CLC_DEFINE_UNARY_BUILTIN_FP16
(cbrt)