2 #include
<clc
/clcmacro.h
>
7 /* Refer to the exp routine for the underlying algorithm
*/
9 _CLC_OVERLOAD _CLC_DEF float expm1
(float x
) {
10 const float X_MAX
= 0x1.62e42ep
+6f
; // 128*log2 : 88.722839111673
11 const float X_MIN
= -
0x1.9d1da0p
+6f
; // -149*log2 : -103.27892990343184
13 const float R_64_BY_LOG2
= 0x1.715476p
+6f
; // 64/log2 : 92.332482616893657
14 const float R_LOG2_BY_64_LD
= 0x1.620000p-7f
; // log2/64 lead: 0.0108032227
15 const float R_LOG2_BY_64_TL
= 0x1.c85fdep-16f
; // log2/64 tail: 0.0000272020388
18 int n
= (int)(x * R_64_BY_LOG2
);
24 float r
= mad
(fn, -R_LOG2_BY_64_TL
, mad
(fn, -R_LOG2_BY_64_LD
, x
));
26 // Truncated Taylor series
27 float z2
= mad
(r*r
, mad
(r, mad
(r, 0x1.555556p-5f
, 0x1.555556p-3f
), 0.5f
), r
);
29 float m2
= as_float
((m + EXPBIAS_SP32
) << EXPSHIFTBITS_SP32
);
30 float2 tv
= USE_TABLE
(exp_tbl_ep, j
);
32 float two_to_jby64_h
= tv.s0
* m2
;
33 float two_to_jby64_t
= tv.s1
* m2
;
34 float two_to_jby64
= two_to_jby64_h
+ two_to_jby64_t
;
36 z2
= mad
(z2, two_to_jby64
, two_to_jby64_t
) + (two_to_jby64_h -
1.0f
);
37 //Make subnormals work
38 z2
= x
== 0.f ? x
: z2
;
39 z2
= x
< X_MIN | m
< -
24 ? -
1.0f
: z2
;
40 z2
= x
> X_MAX ? as_float
(PINFBITPATT_SP32) : z2
;
41 z2
= isnan
(x) ? x
: z2
;
46 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, float
, expm1
, float
)
50 #include
"exp_helper.h"
52 #pragma OPENCL EXTENSION cl_khr_fp64
: enable
54 _CLC_OVERLOAD _CLC_DEF double expm1
(double x
) {
55 const double max_expm1_arg
= 709.8;
56 const double min_expm1_arg
= -
37.42994775023704;
57 const double log_OnePlus_OneByFour
= 0.22314355131420976; //0x3FCC8FF7C79A9A22 = log(1+1/4)
58 const double log_OneMinus_OneByFour
= -
0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
59 const double sixtyfour_by_lnof2
= 92.33248261689366; //0x40571547652b82fe
60 const double lnof2_by_64_head
= 0.010830424696223417; //0x3f862e42fefa0000
61 const double lnof2_by_64_tail
= 2.5728046223276688e-14; //0x3d1cf79abc9e3b39
63 // First
, assume log
(1-1/4) < x
< log
(1+1/4) i.e -
0.28768 < x
< 0.22314
64 double u
= as_double
(as_ulong(x) & 0xffffffffff000000UL
);
66 double y
= u
* u
* 0.5;
67 double z
= v
* (x + u
) * 0.5;
76 fma
(x,2.4360682937111612e-8, 2.7582184028154370e-7),
77 2.7558212415361945e-6),
78 2.4801576918453420e-5),
79 1.9841269447671544e-4),
80 1.3888888890687830e-3),
81 8.3333333334012270e-3),
82 4.1666666666665560e-2),
83 1.6666666666666632e-1);
86 double z1g
= (u + y
) + (q + (v + z
));
87 double z1
= x
+ (y + (q + z
));
88 z1
= y
>= 0x1.0p-7 ? z1g
: z1
;
90 // Now assume outside interval around
0
91 int n
= (int)(x * sixtyfour_by_lnof2
);
95 double2 tv
= USE_TABLE
(two_to_jby64_ep_tbl, j
);
101 double r
= fma
(dn, lnof2_by_64_tail
, fma
(dn, lnof2_by_64_head
, x
));
106 fma
(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
107 4.16666666662260795726e-02),
108 1.66666666665260878863e-01),
109 5.00000000000000008883e-01);
112 double twopm
= as_double
((long)(m + EXPBIAS_DP64
) << EXPSHIFTBITS_DP64
);
113 double twopmm
= as_double
((long)(EXPBIAS_DP64 - m
) << EXPSHIFTBITS_DP64
);
115 // Computations for m
> 52, including where result is close to Inf
116 ulong uval
= as_ulong
(0x1.0p
+1023 * (f1 + (f * q
+ (f2))));
117 int e
= (int)(uval >> EXPSHIFTBITS_DP64
) + 1;
119 double zme1024
= as_double
(((long)e
<< EXPSHIFTBITS_DP64
) |
(uval & MANTBITS_DP64
));
120 zme1024
= e
== 2047 ? as_double
(PINFBITPATT_DP64) : zme1024
;
122 double zmg52
= twopm
* (f1 + fma
(f, q
, f2 - twopmm
));
123 zmg52
= m
== 1024 ? zme1024
: zmg52
;
126 double zml53
= twopm
* ((f1 - twopmm
) + fma
(f1, q
, f2
*(1.0
+ q
)));
129 double zmln7
= fma
(twopm, f1
+ fma
(f, q
, f2
), -
1.0);
131 z
= m
< 53 ? zml53
: zmg52
;
132 z
= m
< -
7 ? zmln7
: z
;
133 z
= x
> log_OneMinus_OneByFour
& x
< log_OnePlus_OneByFour ? z1
: z
;
134 z
= x
> max_expm1_arg ? as_double
(PINFBITPATT_DP64) : z
;
135 z
= x
< min_expm1_arg ? -
1.0 : z
;
140 _CLC_UNARY_VECTORIZE
(_CLC_OVERLOAD _CLC_DEF
, double
, expm1
, double
)
144 _CLC_DEFINE_UNARY_BUILTIN_FP16
(expm1)