8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / lib / libmvec / common / __vrem_pio2m.c
blob7a36e944ab8309bc194950b78bb1aabe2cc6a9e6
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
31 * Given X, __vlibm_rem_pio2m finds Y and an integer n such that
32 * Y = X - n*pi/2 and |Y| < pi/2.
34 * On entry, X is represented by x, an array of nx 24-bit integers
35 * stored in double precision format, and e:
37 * X = sum (x[i] * 2^(e - 24*i))
39 * nx must be 1, 2, or 3, and e must be >= -24. For example, a
40 * suitable representation for the double precision number z can
41 * be computed as follows:
43 * e = ilogb(z)-23
44 * z = scalbn(z,-e)
45 * for i = 0,1,2
46 * x[i] = floor(z)
47 * z = (z-x[i])*2**24
49 * On exit, Y is approximated by y[0] if prec is 0 and by the un-
50 * evaluated sum y[0] + y[1] if prec != 0. The approximation is
51 * accurate to 53 bits in the former case and to at least 72 bits
52 * in the latter.
54 * __vlibm_rem_pio2m returns n mod 8.
56 * Notes:
58 * As n is the integer nearest X * 2/pi, we approximate the latter
59 * product to a precision that is determined dynamically so as to
60 * ensure that the final value Y is approximated accurately enough.
61 * We don't bother to compute terms in the product that are multiples
62 * of 8, so the cost of this multiplication is independent of the
63 * magnitude of X. The variable ip determines the offset into the
64 * array ipio2 of the first term we need to use. The variable eq0
65 * is the corresponding exponent of the first partial product.
67 * The partial products are scaled, summed, and split into an array
68 * of non-overlapping 24-bit terms (not necessarily having the same
69 * signs). Each partial product overlaps three elements of the
70 * resulting array:
72 * q[i] xxxxxxxxxxxxxx
73 * q[i+1] xxxxxxxxxxxxxx
74 * q[i+2] xxxxxxxxxxxxxx
75 * ... ...
78 * r[i] xxxxxx
79 * r[i+1] xxxxxx
80 * r[i+2] xxxxxx
81 * ... ...
83 * In order that the last element of the r array have some correct
84 * bits, we compute an extra term in the q array, but we don't bother
85 * to split this last term into 24-bit chunks; thus, the final term
86 * of the r array could have more than 24 bits, but this doesn't
87 * matter.
89 * After we subtract the nearest integer to the product, we multiply
90 * the remaining part of r by pi/2 to obtain Y. Before we compute
91 * this last product, however, we make sure that the remaining part
92 * of r has at least five nonzero terms, computing more if need be.
93 * This ensures that even if the first nonzero term is only a single
94 * bit and the last term is wrong in several trailing bits, we still
95 * have enough accuracy to obtain 72 bits of Y.
97 * IMPORTANT: This code assumes that the rounding mode is round-to-
98 * nearest in several key places. First, after we compute X * 2/pi,
99 * we round to the nearest integer by adding and subtracting a power
100 * of two. This step must be done in round-to-nearest mode to ensure
101 * that the remainder is less than 1/2 in absolute value. (Because
102 * we only take two adjacent terms of r into account when we perform
103 * this rounding, in very rare cases the remainder could be just
104 * barely greater than 1/2, but this shouldn't matter in practice.)
106 * Second, we also split the partial products of X * 2/pi into 24-bit
107 * pieces by adding and subtracting a power of two. In this step,
108 * round-to-nearest mode is important in order to guarantee that
109 * the index of the first nonzero term in the remainder gives an
110 * accurate indication of the number of significant terms. For
111 * example, suppose eq0 = -1, so that r[1] is a multiple of 1/2 and
112 * |r[2]| < 1/2. After we subtract the nearest integer, r[1] could
113 * be -1/2, and r[2] could be very nearly 1/2, so that r[1] != 0,
114 * yet the remainder is much smaller than the least significant bit
115 * corresponding to r[1]. As long as we use round-to-nearest mode,
116 * this can't happen; instead, the absolute value of each r[j] will
117 * be less than 1/2 the least significant bit corresponding to r[j-1],
118 * so that the entire remainder must be at least half as large as
119 * the first nonzero term (or perhaps just barely smaller than this).
122 #include <sys/isa_defs.h>
124 #ifdef _LITTLE_ENDIAN
125 #define HIWORD 1
126 #define LOWORD 0
127 #else
128 #define HIWORD 0
129 #define LOWORD 1
130 #endif
132 /* 396 hex digits of 2/pi, with two leading zeroes to make life easier */
133 static const double ipio2[] = {
134 0, 0,
135 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
136 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
137 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
138 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
139 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
140 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
141 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
142 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
143 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
144 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
145 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
148 /* pi/2 in 24-bit pieces */
149 static const double pio2[] = {
150 1.57079625129699707031e+00,
151 7.54978941586159635335e-08,
152 5.39030252995776476554e-15,
153 3.28200341580791294123e-22,
154 1.27065575308067607349e-29,
157 /* miscellaneous constants */
158 static const double
159 zero = 0.0,
160 two24 = 16777216.0,
161 round1 = 6755399441055744.0, /* 3 * 2^51 */
162 round24 = 113336795588871485128704.0, /* 3 * 2^75 */
163 twon24 = 5.960464477539062500E-8;
166 __vlibm_rem_pio2m(double *x, double *y, int e, int nx, int prec)
168 union {
169 double d;
170 int i[2];
171 } s;
172 double z, t, p, q[20], r[21], *pr;
173 int nq, ip, n, i, j, k, eq0, eqnqm1;
175 /* determine ip and eq0; note that -48 <= eq0 <= 2 */
176 ip = (e - 3) / 24;
177 if (ip < 0)
178 ip = 0;
179 eq0 = e - 24 * (ip + 1);
181 /* compute q[0,...,5] = x * ipio2 and initialize nq and eqnqm1 */
182 if (nx == 3) {
183 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1] + x[2] * ipio2[ip];
184 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2] + x[2] * ipio2[ip+1];
185 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3] + x[2] * ipio2[ip+2];
186 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4] + x[2] * ipio2[ip+3];
187 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5] + x[2] * ipio2[ip+4];
188 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6] + x[2] * ipio2[ip+5];
189 } else if (nx == 2) {
190 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1];
191 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2];
192 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3];
193 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4];
194 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5];
195 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6];
196 } else {
197 q[0] = x[0] * ipio2[ip+2];
198 q[1] = x[0] * ipio2[ip+3];
199 q[2] = x[0] * ipio2[ip+4];
200 q[3] = x[0] * ipio2[ip+5];
201 q[4] = x[0] * ipio2[ip+6];
202 q[5] = x[0] * ipio2[ip+7];
204 nq = 5;
205 eqnqm1 = eq0 - 96;
207 recompute:
208 /* propagate carries and incorporate powers of two */
209 s.i[HIWORD] = (0x3ff + eqnqm1) << 20;
210 s.i[LOWORD] = 0;
211 p = s.d;
212 z = q[nq] * twon24;
213 for (j = nq-1; j >= 1; j--) {
214 z += q[j];
215 t = (z + round24) - round24; /* must be rounded to nearest */
216 r[j+1] = (z - t) * p;
217 z = t * twon24;
218 p *= two24;
220 z += q[0];
221 t = (z + round24) - round24; /* must be rounded to nearest */
222 r[1] = (z - t) * p;
223 r[0] = t * p;
225 /* form n = [r] mod 8 and leave the fractional part of r */
226 if (eq0 > 0) {
227 /* binary point lies within r[2] */
228 z = r[2] + r[3];
229 t = (z + round1) - round1; /* must be rounded to nearest */
230 r[2] -= t;
231 n = (int)(r[1] + t);
232 r[0] = r[1] = zero;
233 } else if (eq0 > -24) {
234 /* binary point lies within or just to the right of r[1] */
235 z = r[1] + r[2];
236 t = (z + round1) - round1; /* must be rounded to nearest */
237 r[1] -= t;
238 z = r[0] + t;
239 /* cut off high part of z so conversion to int doesn't
240 overflow */
241 t = (z + round24) - round24;
242 n = (int)(z - t);
243 r[0] = zero;
244 } else {
245 /* binary point lies within or just to the right of r[0] */
246 z = r[0] + r[1];
247 t = (z + round1) - round1; /* must be rounded to nearest */
248 r[0] -= t;
249 n = (int)t;
252 /* count the number of leading zeroes in r */
253 for (j = 0; j <= nq; j++) {
254 if (r[j] != zero)
255 break;
258 /* if fewer than 5 terms remain, add more */
259 if (nq - j < 4) {
260 k = 4 - (nq - j);
262 * compute q[nq+1] to q[nq+k]
264 * For some reason, writing out the nx loop explicitly
265 * for each of the three possible values (as above) seems
266 * to run a little slower, so we'll leave this code as is.
268 for (i = nq + 1; i <= nq + k; i++) {
269 t = x[0] * ipio2[ip+2+i];
270 for (j = 1; j < nx; j++)
271 t += x[j] * ipio2[ip+2+i-j];
272 q[i] = t;
273 eqnqm1 -= 24;
275 nq += k;
276 goto recompute;
279 /* set pr and nq so that pr[0,...,nq] is the part of r remaining */
280 pr = &r[j];
281 nq = nq - j;
283 /* compute pio2 * pr[0,...,nq]; note that nq >= 4 here */
284 q[0] = pio2[0] * pr[0];
285 q[1] = pio2[0] * pr[1] + pio2[1] * pr[0];
286 q[2] = pio2[0] * pr[2] + pio2[1] * pr[1] + pio2[2] * pr[0];
287 q[3] = pio2[0] * pr[3] + pio2[1] * pr[2] + pio2[2] * pr[1]
288 + pio2[3] * pr[0];
289 for (i = 4; i <= nq; i++) {
290 q[i] = pio2[0] * pr[i] + pio2[1] * pr[i-1] + pio2[2] * pr[i-2]
291 + pio2[3] * pr[i-3] + pio2[4] * pr[i-4];
294 /* sum q in increasing order to obtain the first term of y */
295 t = q[nq];
296 for (i = nq - 1; i >= 0; i--)
297 t += q[i];
298 y[0] = t;
299 if (prec) {
300 /* subtract and sum again in decreasing order
301 to obtain the second term */
302 t = q[0] - t;
303 for (i = 1; i <= nq; i++)
304 t += q[i];
305 y[1] = t;
308 return (n & 7);