libclc/generic/lib/math/expm1.cl

   1 #include <clc/clc.h>
   2 #include <clc/clcmacro.h>
   3
   4 #include "math.h"
   5 #include "tables.h"
   6
   7 /* Refer to the exp routine for the underlying algorithm */
   8
   9 _CLC_OVERLOAD _CLC_DEF float expm1(float x) {
  10     const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673
  11     const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
  12
  13     const float R_64_BY_LOG2 = 0x1.715476p+6f;     // 64/log2 : 92.332482616893657
  14     const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
  15     const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
  16
  17     uint xi = as_uint(x);
  18     int n = (int)(x * R_64_BY_LOG2);
  19     float fn = (float)n;
  20
  21     int j = n & 0x3f;
  22     int m = n >> 6;
  23
  24     float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
  25
  26     // Truncated Taylor series
  27     float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f,  0x1.555556p-3f), 0.5f), r);
  28
  29     float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
  30     float2 tv = USE_TABLE(exp_tbl_ep, j);
  31
  32     float two_to_jby64_h = tv.s0 * m2;
  33     float two_to_jby64_t = tv.s1 * m2;
  34     float two_to_jby64 = two_to_jby64_h + two_to_jby64_t;
  35
  36     z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f);
  37         //Make subnormals work
  38     z2 = x == 0.f ? x : z2;
  39     z2 = x < X_MIN | m < -24 ? -1.0f : z2;
  40     z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2;
  41     z2 = isnan(x) ? x : z2;
  42
  43     return z2;
  44 }
  45
  46 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float)
  47
  48 #ifdef cl_khr_fp64
  49
  50 #include "exp_helper.h"
  51
  52 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
  53
  54 _CLC_OVERLOAD _CLC_DEF double expm1(double x) {
  55     const double max_expm1_arg = 709.8;
  56     const double min_expm1_arg = -37.42994775023704;
  57     const double log_OnePlus_OneByFour = 0.22314355131420976;   //0x3FCC8FF7C79A9A22 = log(1+1/4)
  58     const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
  59     const double sixtyfour_by_lnof2 = 92.33248261689366;        //0x40571547652b82fe
  60     const double lnof2_by_64_head = 0.010830424696223417;       //0x3f862e42fefa0000
  61     const double lnof2_by_64_tail = 2.5728046223276688e-14;     //0x3d1cf79abc9e3b39
  62
  63     // First, assume log(1-1/4) < x < log(1+1/4) i.e  -0.28768 < x < 0.22314
  64     double u = as_double(as_ulong(x) & 0xffffffffff000000UL);
  65     double v = x - u;
  66     double y = u * u * 0.5;
  67     double z = v * (x + u) * 0.5;
  68
  69     double q = fma(x,
  70                    fma(x,
  71                        fma(x,
  72                            fma(x,
  73                                fma(x,
  74                                    fma(x,
  75                                        fma(x,
  76                                            fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
  77                                            2.7558212415361945e-6),
  78                                        2.4801576918453420e-5),
  79                                    1.9841269447671544e-4),
  80                                1.3888888890687830e-3),
  81                            8.3333333334012270e-3),
  82                        4.1666666666665560e-2),
  83                    1.6666666666666632e-1);
  84     q *= x * x * x;
  85
  86     double z1g = (u + y) + (q + (v + z));
  87     double z1 = x + (y + (q + z));
  88     z1 = y >= 0x1.0p-7 ? z1g : z1;
  89
  90     // Now assume outside interval around 0
  91     int n = (int)(x * sixtyfour_by_lnof2);
  92     int j = n & 0x3f;
  93     int m = n >> 6;
  94
  95     double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
  96     double f1 = tv.s0;
  97     double f2 = tv.s1;
  98     double f = f1 + f2;
  99
 100     double dn = -n;
 101     double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));
 102
 103     q = fma(r,
 104             fma(r,
 105                 fma(r,
 106                     fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
 107                     4.16666666662260795726e-02),
 108                 1.66666666665260878863e-01),
 109              5.00000000000000008883e-01);
 110     q = fma(r*r, q, r);
 111
 112     double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);
 113     double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);
 114
 115     // Computations for m > 52, including where result is close to Inf
 116     ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
 117     int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1;
 118
 119     double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
 120     zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;
 121
 122     double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
 123     zmg52 = m == 1024 ? zme1024 : zmg52;
 124
 125     // For m < 53
 126     double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));
 127
 128     // For m < -7
 129     double zmln7 = fma(twopm,  f1 + fma(f, q, f2), -1.0);
 130
 131     z = m < 53 ? zml53 : zmg52;
 132     z = m < -7 ? zmln7 : z;
 133     z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z;
 134     z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
 135     z = x < min_expm1_arg ? -1.0 : z;
 136
 137     return z;
 138 }
 139
 140 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)
 141
 142 #endif
 143
 144 _CLC_DEFINE_UNARY_BUILTIN_FP16(expm1)