4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
34 #define HI(x) *(1+(int*)x)
35 #define LO(x) *(unsigned*)x
37 #define HI(x) *(int*)x
38 #define LO(x) *(1+(unsigned*)x)
42 #define restrict _Restrict
47 /* double rsqrt(double x)
51 * for x = NaN => QNaN;
53 * for x is negative, -Inf => QNaN + invalid;
54 * for x = +0 => +Inf + divide-by-zero;
55 * for x = -0 => -Inf + divide-by-zero.
56 * 2. Computes reciprocal square root from:
60 * n = ((exponent + 1) & ~1).
62 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
63 * 2. Computes 1/sqrt(m) from:
64 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
67 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
68 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
69 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128.
71 * 1/sqrt(m0) is looked up in a table,
72 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
73 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation:
74 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
75 * * z + a2) * z + a1) * z + a0
76 * where z = [-1/128, 1/128].
79 * The maximum relative error for the approximating
80 * polynomial is 2**(-56.26).
81 * Maximum error observed: less than 0.563 ulp after 1.500.000.000
85 extern double sqrt (double);
86 extern const double __vlibm_TBL_rsqrt
[];
89 __vrsqrt_n(int n
, double * restrict px
, int stridex
, double * restrict py
, int stridey
);
91 #pragma no_inline(__vrsqrt_n)
109 K1
= -5.00000000000005209867e-01,
110 K2
= 3.75000000000004884257e-01,
111 K3
= -3.12499999317136886551e-01,
112 K4
= 2.73437499359815081532e-01,
113 K5
= -2.46116125605037803130e-01,
114 K6
= 2.25606914648617522896e-01;
117 __vrsqrt(int n
, double * restrict px
, int stridex
, double * restrict py
, int stridey
)
132 if (hx
>= 0x7ff00000) /* X = NaN or Inf */
134 res
= *(px
- stridex
);
140 if (hx
< 0x00100000) /* X = denormal, zero or negative */
143 ax
= hx
& 0x7fffffff;
144 lx
= LO((px
- stridex
));
145 res
= *(px
- stridex
);
147 if ((ax
| lx
) == 0) /* |X| = zero */
151 else if (hx
>= 0) /* X = denormal */
153 double res_c0
, dsqrt_exp0
;
155 double xx0
, dexp_hi0
, dexp_lo0
;
156 int hx0
, resh0
, res_ch0
;
158 res
= *(long long*)&res
;
161 sqrt_exp0
= (0x817 - (hx0
>> 21)) << 20;
162 ind0
= (((hx0
>> 10) & 0x7f8) + 8) & -16;
164 resh0
= (hx0
& 0x001fffff) | 0x3fe00000;
165 res_ch0
= (resh0
+ 0x00002000) & 0x7fffc000;
167 HI(&res_c0
) = res_ch0
;
170 dexp_hi0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[0];
171 dexp_lo0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[1];
172 xx0
= dexp_hi0
* dexp_hi0
;
173 xx0
= (res
- res_c0
) * xx0
;
174 res
= (((((K6
* xx0
+ K5
) * xx0
+ K4
) * xx0
+ K3
) * xx0
+ K2
) * xx0
+ K1
) * xx0
;
176 res
= dexp_hi0
* res
+ dexp_lo0
+ dexp_hi0
;
178 HI(&dsqrt_exp0
) = sqrt_exp0
;
184 else /* X = negative */
193 __vrsqrt_n(n_n
, spx
, stridex
, spy
, stridey
);
199 if (hx
>= 0x7ff00000) /* X = NaN or Inf */
204 else if (hx
< 0x00100000) /* X = denormal, zero or negative */
206 ax
= hx
& 0x7fffffff;
210 if ((ax
| lx
) == 0) /* |X| = zero */
214 else if (hx
>= 0) /* X = denormal */
216 double res_c0
, dsqrt_exp0
;
218 double xx0
, dexp_hi0
, dexp_lo0
;
219 int hx0
, resh0
, res_ch0
;
221 res
= *(long long*)&res
;
224 sqrt_exp0
= (0x817 - (hx0
>> 21)) << 20;
225 ind0
= (((hx0
>> 10) & 0x7f8) + 8) & -16;
227 resh0
= (hx0
& 0x001fffff) | 0x3fe00000;
228 res_ch0
= (resh0
+ 0x00002000) & 0x7fffc000;
230 HI(&res_c0
) = res_ch0
;
233 dexp_hi0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[0];
234 dexp_lo0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[1];
235 xx0
= dexp_hi0
* dexp_hi0
;
236 xx0
= (res
- res_c0
) * xx0
;
237 res
= (((((K6
* xx0
+ K5
) * xx0
+ K4
) * xx0
+ K3
) * xx0
+ K2
) * xx0
+ K1
) * xx0
;
239 res
= dexp_hi0
* res
+ dexp_lo0
+ dexp_hi0
;
241 HI(&dsqrt_exp0
) = sqrt_exp0
;
247 else /* X = negative */
254 double res_c0
, dsqrt_exp0
;
256 double xx0
, dexp_hi0
, dexp_lo0
;
259 sqrt_exp0
= (0x5fe - (hx
>> 21)) << 20;
260 ind0
= (((hx
>> 10) & 0x7f8) + 8) & -16;
262 resh0
= (hx
& 0x001fffff) | 0x3fe00000;
263 res_ch0
= (resh0
+ 0x00002000) & 0x7fffc000;
266 HI(&res_c0
) = res_ch0
;
269 dexp_hi0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[0];
270 dexp_lo0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[1];
271 xx0
= dexp_hi0
* dexp_hi0
;
272 xx0
= (res
- res_c0
) * xx0
;
273 res
= (((((K6
* xx0
+ K5
) * xx0
+ K4
) * xx0
+ K3
) * xx0
+ K2
) * xx0
+ K1
) * xx0
;
275 res
= dexp_hi0
* res
+ dexp_lo0
+ dexp_hi0
;
277 HI(&dsqrt_exp0
) = sqrt_exp0
;
287 __vrsqrt_n(int n
, double * restrict px
, int stridex
, double * restrict py
, int stridey
)
289 double res0
, res_c0
, dsqrt_exp0
;
290 double res1
, res_c1
, dsqrt_exp1
;
291 double res2
, res_c2
, dsqrt_exp2
;
295 double xx0
, dexp_hi0
, dexp_lo0
;
296 double xx1
, dexp_hi1
, dexp_lo1
;
297 double xx2
, dexp_hi2
, dexp_lo2
;
298 int hx0
, resh0
, res_ch0
;
299 int hx1
, resh1
, res_ch1
;
300 int hx2
, resh2
, res_ch2
;
309 for(; n
> 2 ; n
-= 3)
323 sqrt_exp0
= (0x5fe - (hx0
>> 21)) << 20;
324 sqrt_exp1
= (0x5fe - (hx1
>> 21)) << 20;
325 sqrt_exp2
= (0x5fe - (hx2
>> 21)) << 20;
326 ind0
= (((hx0
>> 10) & 0x7f8) + 8) & -16;
327 ind1
= (((hx1
>> 10) & 0x7f8) + 8) & -16;
328 ind2
= (((hx2
>> 10) & 0x7f8) + 8) & -16;
330 resh0
= (hx0
& 0x001fffff) | 0x3fe00000;
331 resh1
= (hx1
& 0x001fffff) | 0x3fe00000;
332 resh2
= (hx2
& 0x001fffff) | 0x3fe00000;
333 res_ch0
= (resh0
+ 0x00002000) & 0x7fffc000;
334 res_ch1
= (resh1
+ 0x00002000) & 0x7fffc000;
335 res_ch2
= (resh2
+ 0x00002000) & 0x7fffc000;
339 HI(&res_c0
) = res_ch0
;
340 HI(&res_c1
) = res_ch1
;
341 HI(&res_c2
) = res_ch2
;
343 dexp_hi0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[0];
344 dexp_hi1
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind1
))[0];
345 dexp_hi2
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind2
))[0];
346 dexp_lo0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[1];
347 dexp_lo1
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind1
))[1];
348 dexp_lo2
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind2
))[1];
349 xx0
= dexp_hi0
* dexp_hi0
;
350 xx1
= dexp_hi1
* dexp_hi1
;
351 xx2
= dexp_hi2
* dexp_hi2
;
352 xx0
= (res0
- res_c0
) * xx0
;
353 xx1
= (res1
- res_c1
) * xx1
;
354 xx2
= (res2
- res_c2
) * xx2
;
355 res0
= (((((K6
* xx0
+ K5
) * xx0
+ K4
) * xx0
+ K3
) * xx0
+ K2
) * xx0
+ K1
) * xx0
;
356 res1
= (((((K6
* xx1
+ K5
) * xx1
+ K4
) * xx1
+ K3
) * xx1
+ K2
) * xx1
+ K1
) * xx1
;
357 res2
= (((((K6
* xx2
+ K5
) * xx2
+ K4
) * xx2
+ K3
) * xx2
+ K2
) * xx2
+ K1
) * xx2
;
359 res0
= dexp_hi0
* res0
+ dexp_lo0
+ dexp_hi0
;
360 res1
= dexp_hi1
* res1
+ dexp_lo1
+ dexp_hi1
;
361 res2
= dexp_hi2
* res2
+ dexp_lo2
+ dexp_hi2
;
363 HI(&dsqrt_exp0
) = sqrt_exp0
;
364 HI(&dsqrt_exp1
) = sqrt_exp1
;
365 HI(&dsqrt_exp2
) = sqrt_exp2
;
384 sqrt_exp0
= (0x5fe - (hx0
>> 21)) << 20;
385 ind0
= (((hx0
>> 10) & 0x7f8) + 8) & -16;
387 resh0
= (hx0
& 0x001fffff) | 0x3fe00000;
388 res_ch0
= (resh0
+ 0x00002000) & 0x7fffc000;
391 HI(&res_c0
) = res_ch0
;
396 dexp_hi0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[0];
397 dexp_lo0
= ((double*)((char*)__vlibm_TBL_rsqrt
+ ind0
))[1];
398 xx0
= dexp_hi0
* dexp_hi0
;
399 xx0
= (res0
- res_c0
) * xx0
;
400 res0
= (((((K6
* xx0
+ K5
) * xx0
+ K4
) * xx0
+ K3
) * xx0
+ K2
) * xx0
+ K1
) * xx0
;
402 res0
= dexp_hi0
* res0
+ dexp_lo0
+ dexp_hi0
;
404 HI(&dsqrt_exp0
) = sqrt_exp0
;