Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / lib / libmvec / common / __vatan2f.c
blobbe6c7b2824a349688c29c4bf96b7337e0abc2793
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #ifdef __RESTRICT
31 #define restrict _Restrict
32 #else
33 #define restrict
34 #endif
36 extern const double __vlibm_TBL_atan1[];
38 static const double
39 pio4 = 7.8539816339744827900e-01,
40 pio2 = 1.5707963267948965580e+00,
41 pi = 3.1415926535897931160e+00;
43 static const float
44 zero = 0.0f,
45 one = 1.0f,
46 q1 = -3.3333333333296428046e-01f,
47 q2 = 1.9999999186853752618e-01f,
48 twop24 = 16777216.0f;
50 void
51 __vatan2f(int n, float * restrict y, int stridey, float * restrict x,
52 int stridex, float * restrict z, int stridez)
54 float x0, x1, x2, y0, y1, y2, *pz0 = 0, *pz1, *pz2;
55 double ah0, ah1, ah2;
56 double t0, t1, t2;
57 double sx0, sx1, sx2;
58 double sign0, sign1, sign2;
59 int i, k0 = 0, k1, k2, hx, sx, sy;
60 int hy0, hy1, hy2;
61 float base0 = 0.0, base1, base2;
62 double num0, num1, num2;
63 double den0, den1, den2;
64 double dx0, dx1, dx2;
65 double dy0, dy1, dy2;
66 double db0, db1, db2;
70 loop0:
71 hy0 = *(int*)y;
72 hx = *(int*)x;
73 sign0 = one;
74 sy = hy0 & 0x80000000;
75 hy0 &= ~0x80000000;
77 sx = hx & 0x80000000;
78 hx &= ~0x80000000;
80 if (hy0 > hx)
82 x0 = *y;
83 y0 = *x;
84 i = hx;
85 hx = hy0;
86 hy0 = i;
87 if (sy)
89 x0 = -x0;
90 sign0 = -sign0;
92 if (sx)
94 y0 = -y0;
95 ah0 = pio2;
97 else
99 ah0 = -pio2;
100 sign0 = -sign0;
103 else
105 y0 = *y;
106 x0 = *x;
107 if (sy)
109 y0 = -y0;
110 sign0 = -sign0;
112 if (sx)
114 x0 = -x0;
115 ah0 = -pi;
116 sign0 = -sign0;
118 else
119 ah0 = zero;
122 if (hx >= 0x7f800000 || hx - hy0 >= 0x0c800000)
124 if (hx >= 0x7f800000)
126 if (hx ^ 0x7f800000) /* nan */
127 ah0 = x0 + y0;
128 else if (hy0 >= 0x7f800000)
129 ah0 += pio4;
131 else if ((int) ah0 == 0)
132 ah0 = y0 / x0;
133 *z = (sign0 == one) ? ah0 : -ah0;
134 /* sign0*ah0 would change nan behavior relative to previous release */
135 x += stridex;
136 y += stridey;
137 z += stridez;
138 i = 0;
139 if (--n <= 0)
140 break;
141 goto loop0;
143 if (hy0 < 0x00800000) {
144 if (hy0 == 0)
146 *z = sign0 * (float) ah0;
147 x += stridex;
148 y += stridey;
149 z += stridez;
150 i = 0;
151 if (--n <= 0)
152 break;
153 goto loop0;
155 y0 *= twop24; /* scale subnormal y */
156 x0 *= twop24; /* scale possibly subnormal x */
157 hy0 = *(int*)&y0;
158 hx = *(int*)&x0;
160 pz0 = z;
162 k0 = (hy0 - hx + 0x3f800000) & 0xfff80000;
163 if (k0 >= 0x3C800000) /* if |x| >= (1/64)... */
165 *(int*)&base0 = k0;
166 k0 = (k0 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
167 k0 += 4;
168 /* skip over 0,0,pi/2,pi/2 */
170 else /* |x| < 1/64 */
172 k0 = 0;
173 base0 = zero;
176 x += stridex;
177 y += stridey;
178 z += stridez;
179 i = 1;
180 if (--n <= 0)
181 break;
184 loop1:
185 hy1 = *(int*)y;
186 hx = *(int*)x;
187 sign1 = one;
188 sy = hy1 & 0x80000000;
189 hy1 &= ~0x80000000;
191 sx = hx & 0x80000000;
192 hx &= ~0x80000000;
194 if (hy1 > hx)
196 x1 = *y;
197 y1 = *x;
198 i = hx;
199 hx = hy1;
200 hy1 = i;
201 if (sy)
203 x1 = -x1;
204 sign1 = -sign1;
206 if (sx)
208 y1 = -y1;
209 ah1 = pio2;
211 else
213 ah1 = -pio2;
214 sign1 = -sign1;
217 else
219 y1 = *y;
220 x1 = *x;
221 if (sy)
223 y1 = -y1;
224 sign1 = -sign1;
226 if (sx)
228 x1 = -x1;
229 ah1 = -pi;
230 sign1 = -sign1;
232 else
233 ah1 = zero;
236 if (hx >= 0x7f800000 || hx - hy1 >= 0x0c800000)
238 if (hx >= 0x7f800000)
240 if (hx ^ 0x7f800000) /* nan */
241 ah1 = x1 + y1;
242 else if (hy1 >= 0x7f800000)
243 ah1 += pio4;
245 else if ((int) ah1 == 0)
246 ah1 = y1 / x1;
247 *z = (sign1 == one)? ah1 : -ah1;
248 x += stridex;
249 y += stridey;
250 z += stridez;
251 i = 1;
252 if (--n <= 0)
253 break;
254 goto loop1;
256 if (hy1 < 0x00800000) {
257 if (hy1 == 0)
259 *z = sign1 * (float) ah1;
260 x += stridex;
261 y += stridey;
262 z += stridez;
263 i = 1;
264 if (--n <= 0)
265 break;
266 goto loop1;
268 y1 *= twop24; /* scale subnormal y */
269 x1 *= twop24; /* scale possibly subnormal x */
270 hy1 = *(int*)&y1;
271 hx = *(int*)&x1;
273 pz1 = z;
275 k1 = (hy1 - hx + 0x3f800000) & 0xfff80000;
276 if (k1 >= 0x3C800000) /* if |x| >= (1/64)... */
278 *(int*)&base1 = k1;
279 k1 = (k1 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
280 k1 += 4;
281 /* skip over 0,0,pi/2,pi/2 */
283 else /* |x| < 1/64 */
285 k1 = 0;
286 base1 = zero;
289 x += stridex;
290 y += stridey;
291 z += stridez;
292 i = 2;
293 if (--n <= 0)
294 break;
296 loop2:
297 hy2 = *(int*)y;
298 hx = *(int*)x;
299 sign2 = one;
300 sy = hy2 & 0x80000000;
301 hy2 &= ~0x80000000;
303 sx = hx & 0x80000000;
304 hx &= ~0x80000000;
306 if (hy2 > hx)
308 x2 = *y;
309 y2 = *x;
310 i = hx;
311 hx = hy2;
312 hy2 = i;
313 if (sy)
315 x2 = -x2;
316 sign2 = -sign2;
318 if (sx)
320 y2 = -y2;
321 ah2 = pio2;
323 else
325 ah2 = -pio2;
326 sign2 = -sign2;
329 else
331 y2 = *y;
332 x2 = *x;
333 if (sy)
335 y2 = -y2;
336 sign2 = -sign2;
338 if (sx)
340 x2 = -x2;
341 ah2 = -pi;
342 sign2 = -sign2;
344 else
345 ah2 = zero;
348 if (hx >= 0x7f800000 || hx - hy2 >= 0x0c800000)
350 if (hx >= 0x7f800000)
352 if (hx ^ 0x7f800000) /* nan */
353 ah2 = x2 + y2;
354 else if (hy2 >= 0x7f800000)
355 ah2 += pio4;
357 else if ((int) ah2 == 0)
358 ah2 = y2 / x2;
359 *z = (sign2 == one)? ah2 : -ah2;
360 x += stridex;
361 y += stridey;
362 z += stridez;
363 i = 2;
364 if (--n <= 0)
365 break;
366 goto loop2;
368 if (hy2 < 0x00800000) {
369 if (hy2 == 0)
371 *z = sign2 * (float) ah2;
372 x += stridex;
373 y += stridey;
374 z += stridez;
375 i = 2;
376 if (--n <= 0)
377 break;
378 goto loop2;
380 y2 *= twop24; /* scale subnormal y */
381 x2 *= twop24; /* scale possibly subnormal x */
382 hy2 = *(int*)&y2;
383 hx = *(int*)&x2;
386 pz2 = z;
388 k2 = (hy2 - hx + 0x3f800000) & 0xfff80000;
389 if (k2 >= 0x3C800000) /* if |x| >= (1/64)... */
391 *(int*)&base2 = k2;
392 k2 = (k2 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
393 k2 += 4;
394 /* skip over 0,0,pi/2,pi/2 */
396 else /* |x| < 1/64 */
398 k2 = 0;
399 base2 = zero;
402 goto endloop;
404 endloop:
406 ah2 += __vlibm_TBL_atan1[k2];
407 ah1 += __vlibm_TBL_atan1[k1];
408 ah0 += __vlibm_TBL_atan1[k0];
410 db2 = base2;
411 db1 = base1;
412 db0 = base0;
413 dy2 = y2;
414 dy1 = y1;
415 dy0 = y0;
416 dx2 = x2;
417 dx1 = x1;
418 dx0 = x0;
420 num2 = dy2 - dx2 * db2;
421 den2 = dx2 + dy2 * db2;
423 num1 = dy1 - dx1 * db1;
424 den1 = dx1 + dy1 * db1;
426 num0 = dy0 - dx0 * db0;
427 den0 = dx0 + dy0 * db0;
429 t2 = num2 / den2;
430 t1 = num1 / den1;
431 t0 = num0 / den0;
433 sx2 = t2 * t2;
434 sx1 = t1 * t1;
435 sx0 = t0 * t0;
437 t2 += t2 * sx2 * (q1 + sx2 * q2);
438 t1 += t1 * sx1 * (q1 + sx1 * q2);
439 t0 += t0 * sx0 * (q1 + sx0 * q2);
441 t2 += ah2;
442 t1 += ah1;
443 t0 += ah0;
445 *pz2 = sign2 * t2;
446 *pz1 = sign1 * t1;
447 *pz0 = sign0 * t0;
449 x += stridex;
450 y += stridey;
451 z += stridez;
452 i = 0;
453 } while (--n > 0);
455 if (i > 1)
457 ah1 += __vlibm_TBL_atan1[k1];
458 t1 = (y1 - x1 * (double)base1) /
459 (x1 + y1 * (double)base1);
460 sx1 = t1 * t1;
461 t1 += t1 * sx1 * (q1 + sx1 * q2);
462 t1 += ah1;
463 *pz1 = sign1 * t1;
466 if (i > 0)
468 ah0 += __vlibm_TBL_atan1[k0];
469 t0 = (y0 - x0 * (double)base0) /
470 (x0 + y0 * (double)base0);
471 sx0 = t0 * t0;
472 t0 += t0 * sx0 * (q1 + sx0 * q2);
473 t0 += ah0;
474 *pz0 = sign0 * t0;