/*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24 #include
<clc
/clcmacro.h
>
28 _CLC_OVERLOAD _CLC_DEF float tanh
(float x
)
30 // The definition of tanh
(x) is sinh
(x)/cosh
(x), which is also equivalent
31 // to the following three formulae
:
32 // 1.
(exp(x) - exp
(-x))/(exp(x) + exp
(-x))
33 // 2.
(1 -
(2/(exp(2*x
) + 1 )))
34 // 3.
(exp(2*x
) -
1)/(exp(2*x
) + 1)
35 // but computationally
, some formulae are better on some ranges.
37 const float large_threshold
= 0x1.0a2b24p
+3f
;
40 uint aux
= ux
& EXSIGNBIT_SP32
;
43 float y
= as_float
(aux);
47 mad
(y2, 0.4891631088530669873e-4F
, -
0.14628356048797849e-2F
),
48 -
0.28192806108402678e0F
);
49 float b1
= mad
(y2, 0.3427017942262751343e0F
, 0.845784192581041099e0F
);
52 mad
(y2, 0.3827534993599483396e-4F
, -
0.12325644183611929e-2F
),
53 -
0.24069858695196524e0F
);
54 float b2
= mad
(y2, 0.292529068698052819e0F
, 0.72209738473684982e0F
);
57 float a
= c ? a1
: a2
;
58 float b
= c ? b1
: b2
;
59 float zlo
= mad
(MATH_DIVIDE(a, b
), y
*y2
, y
);
61 float p
= exp
(2.0f
* y
) + 1.0f
;
62 float zhi
= 1.0F - MATH_DIVIDE
(2.0F
, p
);
64 float z
= y
<= 1.0f ? zlo
: zhi
;
65 z
= as_float
(xs | as_uint
(z));
68 float sone
= as_float
(0x3f800000U | xs
);
69 z
= y
> large_threshold ? sone
: z
;
70 z
= aux
< 0x39000000 | aux
> 0x7f800000 ? x
: z
;
75 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, tanh
, float
);
79 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
81 _CLC_OVERLOAD _CLC_DEF double tanh
(double x
)
83 // The definition of tanh
(x) is sinh
(x)/cosh
(x), which is also equivalent
84 // to the following three formulae
:
85 // 1.
(exp(x) - exp
(-x))/(exp(x) + exp
(-x))
86 // 2.
(1 -
(2/(exp(2*x
) + 1 )))
87 // 3.
(exp(2*x
) -
1)/(exp(2*x
) + 1)
88 // but computationally
, some formulae are better on some ranges.
90 // The point at which e^-x is insignificant compared to e^x
= ln
(2^
27)
91 const double large_threshold
= 0x1.2b708872320e2p
+4;
93 ulong ux
= as_ulong
(x);
94 ulong ax
= ux
& ~SIGNBIT_DP64
;
96 double y
= as_double
(ax);
102 fma
(y2, -
0.142077926378834722618091e-7, -
0.200047621071909498730453e-3),
103 -
0.176016349003044679402273e-1),
104 -
0.274030424656179760118928e0
);
108 fma
(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1),
109 0.381641414288328849317962e0
),
110 0.822091273968539282568011e0
);
115 fma
(y2, -
0.115475878996143396378318e-7, -
0.165597043903549960486816e-3),
116 -
0.146173047288731678404066e-1),
117 -
0.227793870659088295252442e0
);
121 fma
(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1),
122 0.317204558977294374244770e0
),
123 0.683381611977295894959554e0
);
126 double zn
= c ? znl
: znm
;
127 double zd
= c ? zdl
: zdm
;
128 double z
= y
+ y
*y2
* MATH_DIVIDE
(zn, zd
);
131 double p
= exp
(2.0
* y
) + 1.0;
132 double zg
= 1.0 -
2.0 / p
;
134 z
= y
> 1.0 ? zg
: z
;
137 z
= y
< 0x1.0p-28 | ax
> PINFBITPATT_DP64 ? x
: z
;
139 z
= y
> large_threshold ?
1.0 : z
;
141 return as_double
(sx | as_ulong
(z));
144 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, tanh
, double
);
146 #endif
// cl_khr_fp64
148 _CLC_DEFINE_UNARY_BUILTIN_FP16
(tanh)