8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / lib / libmvec / common / __vrhypot.c
blob4fd1e51d39554bd165b204f4595e3bfe3aac5b3e
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
33 #ifdef _LITTLE_ENDIAN
34 #define HI(x) *(1+(int*)x)
35 #define LO(x) *(unsigned*)x
36 #else
37 #define HI(x) *(int*)x
38 #define LO(x) *(1+(unsigned*)x)
39 #endif
41 #ifdef __RESTRICT
42 #define restrict _Restrict
43 #else
44 #define restrict
45 #endif
47 /* double rhypot(double x, double y)
49 * Method :
50 * 1. Special cases:
51 * x or y = Inf => 0
52 * x or y = NaN => QNaN
53 * x and y = 0 => Inf + divide-by-zero
54 * 2. Computes rhypot(x,y):
55 * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
56 * Where:
57 * m = 1/max(|x|,|y|)
58 * xnm = x * m
59 * ynm = y * m
61 * Compute 1/(xnm * xnm + ynm * ynm) by simulating
62 * muti-precision arithmetic.
64 * Accuracy:
65 * Maximum error observed: less than 0.869 ulp after 1.000.000.000
66 * results.
69 extern double sqrt(double);
70 extern double fabs(double);
72 static const int __vlibm_TBL_rhypot[] = {
73 /* i = [0,127]
74 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
75 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465,
76 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a,
77 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6,
78 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3,
79 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b,
80 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036,
81 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01,
82 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1,
83 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb,
84 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5,
85 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405,
86 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc,
87 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7,
88 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec,
89 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b,
90 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed,
91 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150,
92 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539,
93 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66,
94 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995,
95 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d,
96 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19,
97 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404,
98 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22,
99 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47,
100 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a,
101 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06,
102 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358,
103 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20,
104 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f,
105 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197,
106 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010,
109 static const unsigned long long LCONST[] = {
110 0x3ff0000000000000ULL, /* DONE = 1.0 */
111 0x4000000000000000ULL, /* DTWO = 2.0 */
112 0x4230000000000000ULL, /* D2ON36 = 2**36 */
113 0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */
114 0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */
117 #define RET_SC(I) \
118 px += stridex; \
119 py += stridey; \
120 pz += stridez; \
121 if (--n <= 0) \
122 break; \
123 goto start##I;
125 #define RETURN(I, ret) \
127 pz[0] = (ret); \
128 RET_SC(I) \
131 #define PREP(I) \
132 hx##I = HI(px); \
133 hy##I = HI(py); \
134 hx##I &= 0x7fffffff; \
135 hy##I &= 0x7fffffff; \
136 pz##I = pz; \
137 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000) /* |X| or |Y| = Inf,NaN */ \
139 lx = LO(px); \
140 ly = LO(py); \
141 x = *px; \
142 y = *py; \
143 if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0; /* |X| = Inf */ \
144 else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0; /* |Y| = Inf */ \
145 else res0 = fabs(x) + fabs(y); \
147 RETURN (I, res0) \
149 x##I = *px; \
150 y##I = *py; \
151 diff0 = hy##I - hx##I; \
152 j0 = diff0 >> 31; \
153 if (hx##I < 0x00100000 && hy##I < 0x00100000) /* |X| and |Y| = subnormal or zero */ \
155 lx = LO(px); \
156 ly = LO(py); \
157 x = x##I; \
158 y = y##I; \
160 if ((hx##I | hy##I | lx | ly) == 0) /* |X| and |Y| = 0 */ \
161 RETURN (I, DONE / 0.0) \
163 x = fabs(x); \
164 y = fabs(y); \
166 x = *(long long*)&x; \
167 y = *(long long*)&y; \
169 x *= D2ONM52; \
170 y *= D2ONM52; \
172 x_hi0 = (x + D2ON36) - D2ON36; \
173 y_hi0 = (y + D2ON36) - D2ON36; \
174 x_lo0 = x - x_hi0; \
175 y_lo0 = y - y_hi0; \
176 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \
177 res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \
179 dres0 = res0_hi + res0_lo; \
181 iarr0 = HI(&dres0); \
182 iexp0 = iarr0 & 0xfff00000; \
184 iarr0 = (iarr0 >> 11) & 0x1fc; \
185 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \
186 itbl0 -= iexp0; \
187 HI(&dd0) = itbl0; \
188 LO(&dd0) = 0; \
190 dd0 = dd0 * (DTWO - dd0 * dres0); \
191 dd0 = dd0 * (DTWO - dd0 * dres0); \
192 dres0 = dd0 * (DTWO - dd0 * dres0); \
194 HI(&res0) = HI(&dres0) & 0xffffff00; \
195 LO(&res0) = 0; \
196 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \
197 res0 = sqrt (res0); \
199 res0 = D2ON1022 * res0; \
200 RETURN (I, res0) \
202 j0 = hy##I - (diff0 & j0); \
203 j0 &= 0x7ff00000; \
204 HI(&scl##I) = 0x7ff00000 - j0;
206 void
207 __vrhypot(int n, double * restrict px, int stridex, double * restrict py,
208 int stridey, double * restrict pz, int stridez)
210 int i = 0;
211 double x, y;
212 double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
213 double x0, y0, res0, dd0;
214 double res0_hi,res0_lo, dres0;
215 double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
216 double x1 = 0.0L, y1 = 0.0L, res1, dd1;
217 double res1_hi,res1_lo, dres1;
218 double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
219 double x2, y2, res2, dd2;
220 double res2_hi,res2_lo, dres2;
222 int hx0, hy0, j0, diff0;
223 int iarr0, iexp0, itbl0;
224 int hx1, hy1;
225 int iarr1, iexp1, itbl1;
226 int hx2, hy2;
227 int iarr2, iexp2, itbl2;
229 int lx, ly;
231 double DONE = ((double*)LCONST)[0];
232 double DTWO = ((double*)LCONST)[1];
233 double D2ON36 = ((double*)LCONST)[2];
234 double D2ON1022 = ((double*)LCONST)[3];
235 double D2ONM52 = ((double*)LCONST)[4];
237 double *pz0, *pz1 = 0, *pz2;
241 start0:
242 PREP(0)
243 px += stridex;
244 py += stridey;
245 pz += stridez;
246 i = 1;
247 if (--n <= 0)
248 break;
250 start1:
251 PREP(1)
252 px += stridex;
253 py += stridey;
254 pz += stridez;
255 i = 2;
256 if (--n <= 0)
257 break;
259 start2:
260 PREP(2)
262 x0 *= scl0;
263 y0 *= scl0;
264 x1 *= scl1;
265 y1 *= scl1;
266 x2 *= scl2;
267 y2 *= scl2;
269 x_hi0 = (x0 + D2ON36) - D2ON36;
270 y_hi0 = (y0 + D2ON36) - D2ON36;
271 x_hi1 = (x1 + D2ON36) - D2ON36;
272 y_hi1 = (y1 + D2ON36) - D2ON36;
273 x_hi2 = (x2 + D2ON36) - D2ON36;
274 y_hi2 = (y2 + D2ON36) - D2ON36;
275 x_lo0 = x0 - x_hi0;
276 y_lo0 = y0 - y_hi0;
277 x_lo1 = x1 - x_hi1;
278 y_lo1 = y1 - y_hi1;
279 x_lo2 = x2 - x_hi2;
280 y_lo2 = y2 - y_hi2;
281 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
282 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
283 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
284 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
285 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
286 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
288 dres0 = res0_hi + res0_lo;
289 dres1 = res1_hi + res1_lo;
290 dres2 = res2_hi + res2_lo;
292 iarr0 = HI(&dres0);
293 iarr1 = HI(&dres1);
294 iarr2 = HI(&dres2);
295 iexp0 = iarr0 & 0xfff00000;
296 iexp1 = iarr1 & 0xfff00000;
297 iexp2 = iarr2 & 0xfff00000;
299 iarr0 = (iarr0 >> 11) & 0x1fc;
300 iarr1 = (iarr1 >> 11) & 0x1fc;
301 iarr2 = (iarr2 >> 11) & 0x1fc;
302 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
303 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
304 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
305 itbl0 -= iexp0;
306 itbl1 -= iexp1;
307 itbl2 -= iexp2;
308 HI(&dd0) = itbl0;
309 HI(&dd1) = itbl1;
310 HI(&dd2) = itbl2;
311 LO(&dd0) = 0;
312 LO(&dd1) = 0;
313 LO(&dd2) = 0;
315 dd0 = dd0 * (DTWO - dd0 * dres0);
316 dd1 = dd1 * (DTWO - dd1 * dres1);
317 dd2 = dd2 * (DTWO - dd2 * dres2);
318 dd0 = dd0 * (DTWO - dd0 * dres0);
319 dd1 = dd1 * (DTWO - dd1 * dres1);
320 dd2 = dd2 * (DTWO - dd2 * dres2);
321 dres0 = dd0 * (DTWO - dd0 * dres0);
322 dres1 = dd1 * (DTWO - dd1 * dres1);
323 dres2 = dd2 * (DTWO - dd2 * dres2);
325 HI(&res0) = HI(&dres0) & 0xffffff00;
326 HI(&res1) = HI(&dres1) & 0xffffff00;
327 HI(&res2) = HI(&dres2) & 0xffffff00;
328 LO(&res0) = 0;
329 LO(&res1) = 0;
330 LO(&res2) = 0;
331 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
332 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
333 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
334 res0 = sqrt (res0);
335 res1 = sqrt (res1);
336 res2 = sqrt (res2);
338 res0 = scl0 * res0;
339 res1 = scl1 * res1;
340 res2 = scl2 * res2;
342 *pz0 = res0;
343 *pz1 = res1;
344 *pz2 = res2;
346 px += stridex;
347 py += stridey;
348 pz += stridez;
349 i = 0;
351 } while (--n > 0);
353 if (i > 0)
355 x0 *= scl0;
356 y0 *= scl0;
358 x_hi0 = (x0 + D2ON36) - D2ON36;
359 y_hi0 = (y0 + D2ON36) - D2ON36;
360 x_lo0 = x0 - x_hi0;
361 y_lo0 = y0 - y_hi0;
362 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
363 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
365 dres0 = res0_hi + res0_lo;
367 iarr0 = HI(&dres0);
368 iexp0 = iarr0 & 0xfff00000;
370 iarr0 = (iarr0 >> 11) & 0x1fc;
371 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
372 itbl0 -= iexp0;
373 HI(&dd0) = itbl0;
374 LO(&dd0) = 0;
376 dd0 = dd0 * (DTWO - dd0 * dres0);
377 dd0 = dd0 * (DTWO - dd0 * dres0);
378 dres0 = dd0 * (DTWO - dd0 * dres0);
380 HI(&res0) = HI(&dres0) & 0xffffff00;
381 LO(&res0) = 0;
382 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
383 res0 = sqrt (res0);
385 res0 = scl0 * res0;
387 *pz0 = res0;
389 if (i > 1)
391 x1 *= scl1;
392 y1 *= scl1;
394 x_hi1 = (x1 + D2ON36) - D2ON36;
395 y_hi1 = (y1 + D2ON36) - D2ON36;
396 x_lo1 = x1 - x_hi1;
397 y_lo1 = y1 - y_hi1;
398 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
399 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
401 dres1 = res1_hi + res1_lo;
403 iarr1 = HI(&dres1);
404 iexp1 = iarr1 & 0xfff00000;
406 iarr1 = (iarr1 >> 11) & 0x1fc;
407 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
408 itbl1 -= iexp1;
409 HI(&dd1) = itbl1;
410 LO(&dd1) = 0;
412 dd1 = dd1 * (DTWO - dd1 * dres1);
413 dd1 = dd1 * (DTWO - dd1 * dres1);
414 dres1 = dd1 * (DTWO - dd1 * dres1);
416 HI(&res1) = HI(&dres1) & 0xffffff00;
417 LO(&res1) = 0;
418 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
419 res1 = sqrt (res1);
421 res1 = scl1 * res1;
423 *pz1 = res1;