Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / libc / AOR_v20.02 / math / v_powf.c
blobc4d8db0e38fa17bce17a0634b75d0f1fb386f51a
1 /*
2 * Single-precision vector powf function.
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 */
9 #include "mathlib.h"
10 #include "v_math.h"
11 #if V_SUPPORTED
13 #define Min v_u32 (0x00800000)
14 #define Max v_u32 (0x7f800000)
15 #define SBITS 5
16 #define Tlog v__powf_log2_data.tab
17 #define Texp v__exp2f_data.tab
18 #define A v__powf_log2_data.poly
19 #define C v__exp2f_data.poly
20 #define LOGDEG 4
22 #if LOGDEG == 5
23 /* 1.01 ulp */
24 #define OFF v_u32 (0x3f330000)
25 #define TBITS 4
26 #elif LOGDEG == 4
27 /* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
28 #define OFF v_u32 (0x3f35d000)
29 #define TBITS 5
30 #endif
32 #define V_EXP2F_TABLE_BITS SBITS
33 #define V_EXP2F_POLY_ORDER 3
34 struct v_exp2f_data
36 uint64_t tab[1 << V_EXP2F_TABLE_BITS];
37 double poly[V_EXP2F_POLY_ORDER];
40 #define V_POWF_LOG2_TABLE_BITS TBITS
41 #define V_POWF_LOG2_POLY_ORDER LOGDEG
42 #define SCALE ((double) (1 << SBITS))
43 struct v_powf_log2_data
45 struct
47 double invc, logc;
48 } tab[1 << V_POWF_LOG2_TABLE_BITS];
49 double poly[V_POWF_LOG2_POLY_ORDER];
52 static const struct v_powf_log2_data v__powf_log2_data = {
53 #if LOGDEG == 5
54 .tab = {
55 { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
56 { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
57 { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
58 { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
59 { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
60 { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
61 { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
62 { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
63 { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
64 { 0x1p+0, 0x0p+0 * SCALE },
65 { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
66 { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
67 { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
68 { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
69 { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
70 { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
72 /* rel err: 1.46 * 2^-32 */
73 .poly = {
74 0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
75 0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
76 0x1.71547652ab82bp0 * SCALE,
78 #elif LOGDEG == 4
79 .tab = {
80 {0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
81 {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
82 {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
83 {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
84 {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
85 {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
86 {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
87 {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
88 {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
89 {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
90 {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
91 {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
92 {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
93 {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
94 {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
95 {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
96 {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
97 {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
98 {0x1p+0, 0x0p+0 * SCALE},
99 {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
100 {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
101 {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
102 {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
103 {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
104 {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
105 {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
106 {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
107 {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
108 {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
109 {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
110 {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
111 {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
113 /* rel err: 1.5 * 2^-30 */
114 .poly = {
115 -0x1.6ff5daa3b3d7cp-2 * SCALE,
116 0x1.ec81d03c01aebp-2 * SCALE,
117 -0x1.71547bb43f101p-1 * SCALE,
118 0x1.7154764a815cbp0 * SCALE,
120 #endif
123 static const struct v_exp2f_data v__exp2f_data = {
124 .tab = {
125 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
126 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
127 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
128 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
129 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
130 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
131 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
132 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
134 /* rel err: 1.69 * 2^-34 */
135 .poly = {
136 0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
140 VPCS_ATTR
141 __attribute__ ((noinline)) static v_f32_t
142 specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
144 return v_call2_f32 (powf, x, y, ret, cmp);
147 VPCS_ATTR
148 v_f32_t
149 V_NAME(powf) (v_f32_t x, v_f32_t y)
151 v_u32_t u, tmp, cmp, i, top, iz;
152 v_s32_t k;
153 v_f32_t ret;
155 u = v_as_u32_f32 (x);
156 cmp = v_cond_u32 (u - Min >= Max - Min);
157 tmp = u - OFF;
158 i = (tmp >> (23 - TBITS)) % (1 << TBITS);
159 top = tmp & 0xff800000;
160 iz = u - top;
161 k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
163 for (int lane = 0; lane < v_lanes32 (); lane++)
165 uint32_t si, siz;
166 int32_t sk;
167 float sy;
169 /* Use double precision for each lane. */
170 double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
171 uint64_t ki, t;
173 si = v_get_u32 (i, lane);
174 siz = v_get_u32 (iz, lane);
175 sk = v_get_s32 (k, lane);
176 sy = v_get_f32 (y, lane);
178 invc = Tlog[si].invc;
179 logc = Tlog[si].logc;
180 z = (double) as_f32_u32 (siz);
182 /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
183 r = __builtin_fma (z, invc, -1.0);
184 y0 = logc + (double) sk;
186 /* Polynomial to approximate log1p(r)/ln2. */
187 #if LOGDEG == 5
188 logx = A[0];
189 logx = r * logx + A[1];
190 logx = r * logx + A[2];
191 logx = r * logx + A[3];
192 logx = r * logx + A[4];
193 logx = r * logx + y0;
194 #elif LOGDEG == 4
195 logx = A[0];
196 logx = r * logx + A[1];
197 logx = r * logx + A[2];
198 logx = r * logx + A[3];
199 logx = r * logx + y0;
200 #endif
201 ylogx = sy * logx;
202 v_set_u32 (&cmp, lane,
203 (as_u64_f64 (ylogx) >> 47 & 0xffff)
204 >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
206 : v_get_u32 (cmp, lane));
208 /* N*x = k + r with r in [-1/2, 1/2] */
209 #if TOINT_INTRINSICS
210 kd = roundtoint (ylogx); /* k */
211 ki = converttoint (ylogx);
212 #else
213 # define SHIFT 0x1.8p52
214 kd = eval_as_double (ylogx + SHIFT);
215 ki = asuint64 (kd);
216 kd -= SHIFT;
217 #endif
218 r = ylogx - kd;
220 /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
221 t = Texp[ki % (1 << SBITS)];
222 t += ki << (52 - SBITS);
223 s = as_f64_u64 (t);
224 p = C[0];
225 p = __builtin_fma (p, r, C[1]);
226 p = __builtin_fma (p, r, C[2]);
227 p = __builtin_fma (p, s * r, s);
229 v_set_f32 (&ret, lane, p);
231 if (unlikely (v_any_u32 (cmp)))
232 return specialcase (x, y, ret, cmp);
233 return ret;
235 VPCS_ALIAS
236 #endif