8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / lib / libmvec / common / __vrsqrt.c
blob82a3acaa82d998d3567531f8f2d49fe5017b26e5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
33 #ifdef _LITTLE_ENDIAN
34 #define HI(x) *(1+(int*)x)
35 #define LO(x) *(unsigned*)x
36 #else
37 #define HI(x) *(int*)x
38 #define LO(x) *(1+(unsigned*)x)
39 #endif
41 #ifdef __RESTRICT
42 #define restrict _Restrict
43 #else
44 #define restrict
45 #endif
47 /* double rsqrt(double x)
49 * Method :
50 * 1. Special cases:
51 * for x = NaN => QNaN;
52 * for x = +Inf => 0;
53 * for x is negative, -Inf => QNaN + invalid;
54 * for x = +0 => +Inf + divide-by-zero;
55 * for x = -0 => -Inf + divide-by-zero.
56 * 2. Computes reciprocal square root from:
57 * x = m * 2**n
58 * Where:
59 * m = [0.5, 2),
60 * n = ((exponent + 1) & ~1).
61 * Then:
62 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
63 * 2. Computes 1/sqrt(m) from:
64 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
65 * Where:
66 * m = m0 + dm,
67 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
68 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
69 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128.
70 * Then:
71 * 1/sqrt(m0) is looked up in a table,
72 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
73 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation:
74 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
75 * * z + a2) * z + a1) * z + a0
76 * where z = [-1/128, 1/128].
78 * Accuracy:
79 * The maximum relative error for the approximating
80 * polynomial is 2**(-56.26).
81 * Maximum error observed: less than 0.563 ulp after 1.500.000.000
82 * results.
85 extern double sqrt (double);
86 extern const double __vlibm_TBL_rsqrt[];
88 static void
89 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
91 #pragma no_inline(__vrsqrt_n)
93 #define RETURN(ret) \
94 { \
95 *py = (ret); \
96 py += stridey; \
97 if (n_n == 0) \
98 { \
99 spx = px; spy = py; \
100 hx = HI(px); \
101 continue; \
103 n--; \
104 break; \
107 static const double
108 DONE = 1.0,
109 K1 = -5.00000000000005209867e-01,
110 K2 = 3.75000000000004884257e-01,
111 K3 = -3.12499999317136886551e-01,
112 K4 = 2.73437499359815081532e-01,
113 K5 = -2.46116125605037803130e-01,
114 K6 = 2.25606914648617522896e-01;
116 void
117 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
119 double *spx, *spy;
120 int ax, lx, hx, n_n;
121 double res;
123 while (n > 1)
125 n_n = 0;
126 spx = px;
127 spy = py;
128 hx = HI(px);
129 for (; n > 1 ; n--)
131 px += stridex;
132 if (hx >= 0x7ff00000) /* X = NaN or Inf */
134 res = *(px - stridex);
135 RETURN (DONE / res)
138 py += stridey;
140 if (hx < 0x00100000) /* X = denormal, zero or negative */
142 py -= stridey;
143 ax = hx & 0x7fffffff;
144 lx = LO((px - stridex));
145 res = *(px - stridex);
147 if ((ax | lx) == 0) /* |X| = zero */
149 RETURN (DONE / res)
151 else if (hx >= 0) /* X = denormal */
153 double res_c0, dsqrt_exp0;
154 int ind0, sqrt_exp0;
155 double xx0, dexp_hi0, dexp_lo0;
156 int hx0, resh0, res_ch0;
158 res = *(long long*)&res;
160 hx0 = HI(&res);
161 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
162 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
164 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
165 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
166 HI(&res) = resh0;
167 HI(&res_c0) = res_ch0;
168 LO(&res_c0) = 0;
170 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
171 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
172 xx0 = dexp_hi0 * dexp_hi0;
173 xx0 = (res - res_c0) * xx0;
174 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
176 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
178 HI(&dsqrt_exp0) = sqrt_exp0;
179 LO(&dsqrt_exp0) = 0;
180 res *= dsqrt_exp0;
182 RETURN (res)
184 else /* X = negative */
186 RETURN (sqrt(res))
189 n_n++;
190 hx = HI(px);
192 if (n_n > 0)
193 __vrsqrt_n(n_n, spx, stridex, spy, stridey);
195 if (n > 0)
197 hx = HI(px);
199 if (hx >= 0x7ff00000) /* X = NaN or Inf */
201 res = *px;
202 *py = DONE / res;
204 else if (hx < 0x00100000) /* X = denormal, zero or negative */
206 ax = hx & 0x7fffffff;
207 lx = LO(px);
208 res = *px;
210 if ((ax | lx) == 0) /* |X| = zero */
212 *py = DONE / res;
214 else if (hx >= 0) /* X = denormal */
216 double res_c0, dsqrt_exp0;
217 int ind0, sqrt_exp0;
218 double xx0, dexp_hi0, dexp_lo0;
219 int hx0, resh0, res_ch0;
221 res = *(long long*)&res;
223 hx0 = HI(&res);
224 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
225 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
227 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
228 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
229 HI(&res) = resh0;
230 HI(&res_c0) = res_ch0;
231 LO(&res_c0) = 0;
233 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
234 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
235 xx0 = dexp_hi0 * dexp_hi0;
236 xx0 = (res - res_c0) * xx0;
237 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
239 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
241 HI(&dsqrt_exp0) = sqrt_exp0;
242 LO(&dsqrt_exp0) = 0;
243 res *= dsqrt_exp0;
245 *py = res;
247 else /* X = negative */
249 *py = sqrt(res);
252 else
254 double res_c0, dsqrt_exp0;
255 int ind0, sqrt_exp0;
256 double xx0, dexp_hi0, dexp_lo0;
257 int resh0, res_ch0;
259 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
260 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
262 resh0 = (hx & 0x001fffff) | 0x3fe00000;
263 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
264 HI(&res) = resh0;
265 LO(&res) = LO(px);
266 HI(&res_c0) = res_ch0;
267 LO(&res_c0) = 0;
269 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
270 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
271 xx0 = dexp_hi0 * dexp_hi0;
272 xx0 = (res - res_c0) * xx0;
273 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
275 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
277 HI(&dsqrt_exp0) = sqrt_exp0;
278 LO(&dsqrt_exp0) = 0;
279 res *= dsqrt_exp0;
281 *py = res;
286 static void
287 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
289 double res0, res_c0, dsqrt_exp0;
290 double res1, res_c1, dsqrt_exp1;
291 double res2, res_c2, dsqrt_exp2;
292 int ind0, sqrt_exp0;
293 int ind1, sqrt_exp1;
294 int ind2, sqrt_exp2;
295 double xx0, dexp_hi0, dexp_lo0;
296 double xx1, dexp_hi1, dexp_lo1;
297 double xx2, dexp_hi2, dexp_lo2;
298 int hx0, resh0, res_ch0;
299 int hx1, resh1, res_ch1;
300 int hx2, resh2, res_ch2;
302 LO(&dsqrt_exp0) = 0;
303 LO(&dsqrt_exp1) = 0;
304 LO(&dsqrt_exp2) = 0;
305 LO(&res_c0) = 0;
306 LO(&res_c1) = 0;
307 LO(&res_c2) = 0;
309 for(; n > 2 ; n -= 3)
311 hx0 = HI(px);
312 LO(&res0) = LO(px);
313 px += stridex;
315 hx1 = HI(px);
316 LO(&res1) = LO(px);
317 px += stridex;
319 hx2 = HI(px);
320 LO(&res2) = LO(px);
321 px += stridex;
323 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
324 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
325 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
326 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
327 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
328 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
330 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
331 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
332 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
333 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
334 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
335 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
336 HI(&res0) = resh0;
337 HI(&res1) = resh1;
338 HI(&res2) = resh2;
339 HI(&res_c0) = res_ch0;
340 HI(&res_c1) = res_ch1;
341 HI(&res_c2) = res_ch2;
343 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
344 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
345 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
346 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
347 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
348 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
349 xx0 = dexp_hi0 * dexp_hi0;
350 xx1 = dexp_hi1 * dexp_hi1;
351 xx2 = dexp_hi2 * dexp_hi2;
352 xx0 = (res0 - res_c0) * xx0;
353 xx1 = (res1 - res_c1) * xx1;
354 xx2 = (res2 - res_c2) * xx2;
355 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
356 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
357 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
359 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
360 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
361 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
363 HI(&dsqrt_exp0) = sqrt_exp0;
364 HI(&dsqrt_exp1) = sqrt_exp1;
365 HI(&dsqrt_exp2) = sqrt_exp2;
366 res0 *= dsqrt_exp0;
367 res1 *= dsqrt_exp1;
368 res2 *= dsqrt_exp2;
370 *py = res0;
371 py += stridey;
373 *py = res1;
374 py += stridey;
376 *py = res2;
377 py += stridey;
380 for(; n > 0 ; n--)
382 hx0 = HI(px);
384 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
385 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
387 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
388 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
389 HI(&res0) = resh0;
390 LO(&res0) = LO(px);
391 HI(&res_c0) = res_ch0;
392 LO(&res_c0) = 0;
394 px += stridex;
396 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
397 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
398 xx0 = dexp_hi0 * dexp_hi0;
399 xx0 = (res0 - res_c0) * xx0;
400 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
402 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
404 HI(&dsqrt_exp0) = sqrt_exp0;
405 LO(&dsqrt_exp0) = 0;
406 res0 *= dsqrt_exp0;
408 *py = res0;
409 py += stridey;