4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
! Constant table. Each entry is one 64-bit value emitted as two 32-bit words,
! most-significant word first (compare the words with the value in each comment).
! K1 and K2 are the coefficients of the polynomial evaluated in the main loop as
! res0 = (K2 * xx0 + K1) * xx0 + DC1; the DC*/DA* entries are bit masks and bias
! values applied with the VIS logical/partitioned instructions.
! The FMAX entry packs two 32-bit values: the float FMAX and the fmul8x16 SCALE.
37 .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01
38 .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01
39 .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff
40 .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000
41 .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000
42 .word 0x7fe00000, 0x00000000 ! DA0 = 0x7fe0000000000000
43 .word 0x47efffff, 0xe0000000 ! DFMAX = 3.402823e+38
44 .word 0x7f7fffff, 0x80808080 ! FMAX = 3.402823e+38 , SCALE = 0x80808080
45 .word 0x20000000, 0x00000000 ! DA1 = 0x2000000000000000
! Integer registers that hold constants built once at function entry:
! the sign-clearing mask 0x7fffffff and the range-check threshold 0x7f3504f3.
61 #define _0x7fffffff %o1
62 #define _0x7f3504f3 %o2
! Offsets (relative to %fp) of the scratch slots in the temp area of the frame.
! tmp_px and tmp_py are written with 64-bit stx; tmp_counter is accessed with
! 32-bit st/ld. tmp0..tmp4 receive the upper word of a db0 value via a 32-bit
! st from an FP register so that it can be reloaded into an integer register
! as iexp0.
68 #define tmp_px STACK_BIAS-0x30
69 #define tmp_py STACK_BIAS-0x28
70 #define tmp_counter STACK_BIAS-0x20
71 #define tmp0 STACK_BIAS-0x18
72 #define tmp1 STACK_BIAS-0x10
73 #define tmp2 STACK_BIAS-0x0c
74 #define tmp3 STACK_BIAS-0x08
75 #define tmp4 STACK_BIAS-0x04
77 ! sizeof temp storage - must be a multiple of 16 for V9
80 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
81 ! !!!!! algorithm !!!!!
93 ! if ( hx >= 0x7f3504f3 || hy >= 0x7f3504f3 )
95 ! if ( hx >= 0x7f800000 || hy >= 0x7f800000 )
97 ! if ( hx == 0x7f800000 || hy == 0x7f800000 )
98 ! *(int*)pz = 0x7f800000;
103 ! hyp = sqrt(x * (double)x + y * (double)y);
104 ! if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
105 ! else ftmp0 = FMAX * FMAX;
111 ! if ( (hx | hy) == 0 )
117 ! dx0 = x0 * (double)x0;
118 ! dy0 = y0 * (double)y0;
121 ! iexp0 = ((int*)&db0)[0];
123 ! h0 = vis_fand(db0,DC0);
124 ! h0 = vis_for(h0,DC1);
125 ! h_hi0 = vis_fand(h0,DC2);
127 ! db0 = vis_fand(db0,DA0);
128 ! db0 = vis_fmul8x16(SCALE, db0);
129 ! db0 = vis_fpadd32(db0,DA1);
132 ! di0 = iexp0 & 0x1ff0;
133 ! si0 = (char*)sqrt_arr + di0;
135 ! dtmp0 = ((double*)((char*)div_arr + di0))[0];
139 ! dtmp0 = ((double*)si0)[1];
144 ! res0 = dtmp0 * res0;
146 ! ftmp0 = (float)res0;
149 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Function setup: allocate the register window and a frame of
! SA(MINFRAME)+tmps bytes, then build the integer constants used by the
! range checks.
152 save %sp,-SA(MINFRAME)-tmps,%sp
! NOTE(review): PIC_SET is a macro defined outside this excerpt; presumably it
! loads the address of its second argument into the register named by the
! third, using the first as scratch -- confirm.
154 PIC_SET(l7,.CONST_TBL,o3)
155 PIC_SET(l7,__vlibm_TBL_sqrtf,l1)
! NOTE(review): the next two loads both target stridez from different frame
! offsets (a 64-bit ldx and a 32-bit ld); presumably they are the two arms of
! a preprocessor conditional that is not visible in this excerpt -- confirm.
158 ldx [%fp+STACK_BIAS+176],stridez
160 ld [%fp+STACK_BIAS+92],stridez
162 st %i0,[%fp+tmp_counter] ! park the first argument (%i0) in tmp_counter
169 sethi %hi(0x7ffffc00),%o1 ! upper 22 bits of 0x7fffffff
172 sethi %hi(0x7f350400),%o2 ! upper 22 bits of 0x7f3504f3
175 add %o1,1023,_0x7fffffff ! 0x7ffffc00 + 0x3ff = 0x7fffffff
176 add %o2,0xf3,_0x7f3504f3 ! 0x7f350400 + 0xf3 = 0x7f3504f3
187 sll stridez,2,stridez ! stridez *= 4
199 ld [%fp+tmp_counter],counter ! fetch the pending element count ...
202 st %g0,[%fp+tmp_counter] ! ... and clear the slot
! Pipeline start-up: range-check the first elements and start their
! computation so that the steady-state loop begins with results in flight.
! Every lda uses ASI 0x82 (ASI_PRIMARY_NOFAULT in SPARC V9), i.e. a
! non-faulting load. An element whose |x| or |y| bit pattern is >= 0x7f3504f3
! leaves the fast path through .spec or one of the .updateN labels.
! NOTE(review): the "(n_m)" tag in each comment appears to name the pipeline
! slot and stage of the element the instruction works on -- confirm.
206 lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px;
208 lda [%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
210 lda [%i1]0x82,%f17 ! (3_0) x0 = *px;
211 and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
213 cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
214 bge,pn %icc,.spec ! (3_0) if ( hx >= 0x7f3504f3 )
215 and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
217 cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
218 bge,pn %icc,.spec ! (3_0) if ( hy >= 0x7f3504f3 )
224 add %i1,stridex,%i1 ! px += stridex
225 fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
226 lda [%i2]0x82,%f17 ! (3_0) y0 = *py;
228 lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
230 lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
232 and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
234 fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
235 cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
236 bge,pn %icc,.update0 ! (4_0) if ( hx >= 0x7f3504f3 )
237 and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
241 lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
243 faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
245 fsmuld %f17,%f17,%f40 ! (4_1) dx0 = x0 * (double)x0;
246 cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3
247 lda [stridey+%o7]0x82,%f17 ! (4_1) y0 = *py;
249 add %o7,stridey,%i5 ! py += stridey
250 lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px;
252 bge,pn %icc,.update1 ! (4_1) if ( hy >= 0x7f3504f3 )
253 st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0];
255 and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;
257 fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0;
258 lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px;
260 add %i1,stridex,%i1 ! px += stridex
262 lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py;
263 cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3
264 bge,pn %icc,.update2 ! (0_0) if ( hx >= 0x7f3504f3 )
265 add %i5,stridey,%o4 ! py += stridey
267 faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0;
269 fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0;
270 and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
271 lda [%i5+stridey]0x82,%f17 ! (0_0) y0 = *py;
273 cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3
274 bge,pn %icc,.update3 ! (0_0) if ( hy >= 0x7f3504f3 )
275 st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0];
280 lda [%i1+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px;
282 fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0);
284 and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;
286 fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0;
287 cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3
288 lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py;
290 add %i1,stridex,%i1 ! px += stridex
292 lda [%i1]0x82,%f17 ! (1_0) x0 = *px;
293 bge,pn %icc,.update4 ! (1_0) if ( hx >= 0x7f3504f3 )
294 add %o4,stridey,%i5 ! py += stridey
296 and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
297 for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1);
299 cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3
300 ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0];
301 faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0;
303 fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0;
304 add %i1,stridex,%i1 ! px += stridex
305 lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py;
307 srax %o0,8,%o0 ! (3_1) iexp0 >>= 8;
308 bge,pn %icc,.update5 ! (1_0) if ( hy >= 0x7f3504f3 )
309 fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2);
314 lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px;
316 and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0;
317 st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0];
318 fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0);
320 ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
321 fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0;
323 fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0;
324 add %i5,stridey,%i2 ! py += stridey
325 lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py;
327 and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;
329 lda [%i1]0x82,%f17 ! (2_0) x0 = *px;
330 cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3
332 fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dtmp0;
333 and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
334 for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1);
336 bge,pn %icc,.update6 ! (2_0) if ( hx >= 0x7f3504f3 )
337 ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0];
339 faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
341 fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0;
342 cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3
343 lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py;
345 add %i1,stridex,%i1 ! px += stridex
346 bge,pn %icc,.update7 ! (2_0) if ( hy >= 0x7f3504f3 )
347 fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2);
353 fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0;
354 srax %o3,8,%o3 ! (4_1) iexp0 >>= 8;
355 lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px;
357 and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0;
358 st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0];
359 fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0);
361 ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
362 add %i2,stridey,%o7 ! py += stridey
363 fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0;
365 fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0;
366 lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
367 and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
369 faddd %f56,K1,%f54 ! (3_1) res0 += K1;
370 cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
372 lda [%i1]0x82,%f17 ! (3_0) x0 = *px;
373 add %i1,stridex,%i1 ! px += stridex
374 bge,pn %icc,.update8 ! (3_0) if ( hx >= 0x7f3504f3 )
376 fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dtmp0;
378 and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
379 for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1);
381 cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
382 ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0];
383 faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0;
385 fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
386 bge,pn %icc,.update9 ! (3_0) if ( hy >= 0x7f3504f3 )
387 lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py;
393 fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0;
394 lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
395 fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2);
397 fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0;
398 srax %g1,8,%o5 ! (0_0) iexp0 >>= 8;
399 lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
400 fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0);
402 and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0;
403 st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0];
404 fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0);
406 ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
407 add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0;
408 and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
409 fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0;
411 fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
412 cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
413 bge,pn %icc,.update10 ! (4_0) if ( hx >= 0x7f3504f3 )
414 faddd %f40,DC1,%f40 ! (3_1) res0 += DC1;
416 fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
417 and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
418 ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
419 faddd %f54,K1,%f54 ! (4_1) res0 += K1;
421 lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
423 fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dtmp0;
425 for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1);
427 ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0];
428 fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0;
429 faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
! Steady-state loop (the branch target .main_loop of the bpos at the bottom;
! the label itself is not visible in this excerpt). counter is biased by -5
! first; each pass then stores five results through pz (%l7, %o7, %i0, %i3,
! %o4), subtracts another 5 and repeats while the result is non-negative.
! Range-check failures leave the loop through the .updateN labels.
435 sub counter,5,counter
439 fsmuld %f17,%f17,%f40 ! (4_1) dx0 = x0 * (double)x0;
440 cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3
441 lda [stridey+%o7]0x82,%f17 ! (4_1) y0 = *py;
442 fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1);
444 fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0;
445 add %o7,stridey,%i5 ! py += stridey
446 st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0];
447 fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2);
449 fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0;
450 srax %g1,8,%g5 ! (1_1) iexp0 >>= 8;
451 bge,pn %icc,.update11 ! (4_1) if ( hy >= 0x7f3504f3 )
452 fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0);
459 fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0;
460 and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0;
461 lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px;
462 fand %f30,DC0,%f60 ! (2_1) h0 = vis_fand(db0,DC0);
464 ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
465 add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0;
466 add %i1,stridex,%i0 ! px += stridex
467 fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0;
469 fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0;
471 lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px;
472 faddd %f58,DC1,%f36 ! (4_2) res0 += DC1;
474 faddd %f56,K1,%f58 ! (0_1) res0 += K1;
475 and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;
476 ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1];
477 fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
479 lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py;
480 cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3
481 bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7f3504f3 )
482 fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0;
484 fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dtmp0;
485 add %l7,stridez,%o7 ! pz += stridez
486 st %f14,[%l7] ! (3_2) *pz = ftmp0;
487 for %f60,DC1,%f46 ! (2_1) h0 = vis_for(h0,DC1);
489 fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0;
490 add %i5,stridey,%o4 ! py += stridey
491 ld [%fp+tmp4],%g1 ! (2_1) iexp0 = ((int*)&db0)[0];
492 faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0;
494 fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0;
495 and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
496 lda [%i5+stridey]0x82,%f17 ! (0_0) y0 = *py;
497 fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1);
499 fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0;
500 cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3
501 st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0];
502 fand %f46,DC2,%f58 ! (2_1) h_hi0 = vis_fand(h0,DC2);
504 fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0;
505 srax %g1,8,%g1 ! (2_1) iexp0 >>= 8;
506 bge,pn %icc,.update13 ! (0_0) if ( hy >= 0x7f3504f3 )
507 fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0);
514 fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0;
515 and %g1,_0x1ff0,%g1 ! (2_1) di0 = iexp0 & 0x1ff0;
516 lda [%i0+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px;
517 fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0);
519 ldd [TBL+%g1],%f22 ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
520 add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0;
521 add %i0,stridex,%i1 ! px += stridex
522 fsubd %f46,%f58,%f58 ! (2_1) xx0 = h0 - h_hi0;
524 fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0;
525 add %o7,stridez,%i0 ! pz += stridez
526 lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py;
527 faddd %f38,DC1,%f36 ! (0_1) res0 += DC1;
529 faddd %f56,K1,%f38 ! (1_1) res0 += K1;
530 and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;
531 ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1];
532 fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
534 lda [%i1]0x82,%f17 ! (1_0) x0 = *px;
535 cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3
536 bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7f3504f3 )
537 fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0;
539 fmuld %f58,%f22,%f58 ! (2_1) xx0 *= dtmp0;
540 and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
541 add %o4,stridey,%i5 ! py += stridey
542 for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1);
544 fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0;
545 cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3
546 ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0];
547 faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0;
549 fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0;
550 add %i1,stridex,%i1 ! px += stridex
551 lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py;
552 fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1);
554 fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0;
555 st %f14,[%o7] ! (4_2) *pz = ftmp0;
556 bge,pn %icc,.update15 ! (1_0) if ( hy >= 0x7f3504f3 )
557 fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2);
563 fmuld K2,%f58,%f54 ! (2_1) res0 = K2 * xx0;
564 srax %o0,8,%o0 ! (3_1) iexp0 >>= 8;
565 st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0];
566 fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0);
568 fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0;
569 and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0;
570 lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px;
571 fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0);
573 ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
574 add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0;
575 add %i0,stridez,%i3 ! pz += stridez
576 fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0;
578 fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0;
579 add %i5,stridey,%i2 ! py += stridey
580 lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py;
581 faddd %f44,DC1,%f44 ! (1_1) res0 += DC1;
583 fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
584 and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;
585 ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1];
586 faddd %f54,K1,%f54 ! (2_1) res0 += K1;
588 lda [%i1]0x82,%f17 ! (2_0) x0 = *px;
589 cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3
590 add %i3,stridez,%o4 ! pz += stridez
591 fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0;
593 fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dtmp0;
594 and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
595 st %f14,[%i0] ! (0_1) *pz = ftmp0;
596 for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1);
598 fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0;
599 bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7f3504f3 )
600 ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0];
601 faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
603 fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0;
604 cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3
605 lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py;
606 fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1);
608 fmuld %f54,%f58,%f54 ! (2_1) res0 *= xx0;
609 add %i1,stridex,%l7 ! px += stridex
610 bge,pn %icc,.update17 ! (2_0) if ( hy >= 0x7f3504f3 )
611 fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2);
618 fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0;
619 srax %o3,8,%o3 ! (4_1) iexp0 >>= 8;
620 st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0];
621 fand %f30,DA0,%f40 ! (2_1) db0 = vis_fand(db0,DA0);
623 fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0;
624 and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0;
625 lda [%l7]0x82,%l3 ! (3_0) hx0 = *(int*)px;
626 fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0);
628 ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
629 add %g1,TBL,%g1 ! (2_1) si0 = (char*)sqrt_arr + di0;
630 add %i2,stridey,%o7 ! py += stridey
631 fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0;
633 fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0;
634 lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py;
635 add %l7,stridex,%i1 ! px += stridex
636 faddd %f54,DC1,%f36 ! (2_1) res0 += DC1;
638 faddd %f56,K1,%f54 ! (3_1) res0 += K1;
639 and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
640 ldd [%g1+8],%f56 ! (2_1) dtmp0 = ((double*)si0)[1];
641 fmul8x16 SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0);
643 lda [%l7]0x82,%f17 ! (3_0) x0 = *px;
644 cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3
645 bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7f3504f3 )
646 fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0;
648 fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dtmp0;
649 and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
650 st %f14,[%i3] ! (1_1) *pz = ftmp0;
651 for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1);
653 fmuld %f56,%f36,%f36 ! (2_1) res0 = dtmp0 * res0;
654 cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3
655 ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0];
656 faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0;
658 fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0;
659 bge,pn %icc,.update19 ! (3_0) if ( hy >= 0x7f3504f3 )
660 lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py;
661 fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1);
664 fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0;
666 st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0];
667 fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2);
669 fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0;
670 srax %g1,8,%o5 ! (0_0) iexp0 >>= 8;
671 lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px;
672 fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0);
674 fmuld %f36,%f62,%f62 ! (2_1) res0 *= db0;
675 and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0;
676 bz,pn %icc,.update19a
677 fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0);
679 ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
680 add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0;
681 and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
682 fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0;
684 fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0;
685 cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3
686 lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py;
687 faddd %f40,DC1,%f40 ! (3_1) res0 += DC1;
689 fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
690 bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7f3504f3 )
691 ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
692 faddd %f54,K1,%f54 ! (4_1) res0 += K1;
694 lda [%i1]0x82,%f17 ! (4_0) x0 = *px;
696 subcc counter,5,counter ! counter -= 5
697 add %o4,stridez,%l7 ! pz += stridez
698 fdtos %f62,%f14 ! (2_1) ftmp0 = (float)res0;
700 fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dtmp0;
701 and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
702 st %f14,[%o4] ! (2_1) *pz = ftmp0;
703 for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1);
705 ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0];
706 fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0;
707 bpos,pt %icc,.main_loop
708 faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0;
! Loop tail: undo the -5 bias on counter, then finish the elements that are
! still in flight, storing one result after each group of instructions.
! NOTE(review): each "subcc counter,1,counter" is presumably followed by an
! exit branch that is not visible in this excerpt -- confirm.
710 add counter,5,counter
713 subcc counter,1,counter
717 fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1);
719 fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0;
720 fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2);
722 fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0;
723 srax %g1,8,%g5 ! (1_1) iexp0 >>= 8;
724 fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0);
726 fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0;
727 and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0;
729 ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
730 add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0;
731 fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0;
733 faddd %f58,DC1,%f36 ! (4_2) res0 += DC1;
735 faddd %f56,K1,%f58 ! (0_1) res0 += K1;
736 ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1];
737 fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
739 fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0;
741 fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dtmp0;
742 add %l7,stridez,%o7 ! pz += stridez
743 st %f14,[%l7] ! (3_2) *pz = ftmp0;
745 subcc counter,1,counter
749 fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0;
751 fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1);
753 fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0;
755 fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0;
756 fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0);
758 fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0;
760 add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0;
762 faddd %f38,DC1,%f36 ! (0_1) res0 += DC1;
764 faddd %f56,K1,%f38 ! (1_1) res0 += K1;
765 ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1];
766 fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
768 add %o7,stridez,%i0 ! pz += stridez
769 fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0;
771 fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0;
773 fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1);
775 fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0;
776 add %i0,stridez,%i3 ! pz += stridez
777 st %f14,[%o7] ! (4_2) *pz = ftmp0;
779 subcc counter,1,counter
783 fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0);
785 fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0;
787 add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0;
789 faddd %f44,DC1,%f44 ! (1_1) res0 += DC1;
791 fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
792 ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1];
794 add %i3,stridez,%o4 ! pz += stridez
795 fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0;
797 st %f14,[%i0] ! (0_1) *pz = ftmp0;
799 subcc counter,1,counter
803 fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0;
805 fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1);
807 fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0;
809 fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0;
811 st %f14,[%i3] ! (1_1) *pz = ftmp0;
! Stores an all-zero word (0.0f) to *pz, then advances pz and py and
! decrements counter.
! NOTE(review): presumably the handler for the "(hx | hy) == 0" case of the
! algorithm description; its label and branches are not visible here.
818 st %g0,[%l7] ! *pz = 0;
819 add %l7,stridez,%l7 ! pz += stridez
821 add %i2,stridey,%i2 ! py += stridey
823 sub counter,1,counter ! counter--
! Special-case path for one element (labels and some delay slots are not
! visible in this excerpt; presumably reached from the range checks).
! Finite inputs: hyp is computed with a double-precision square root; if it
! compares greater than (or unordered with) DFMAX the result is FMAX * FMAX
! evaluated in single precision, otherwise (float)hyp.
! Inf/NaN inputs (local label 2): the result is the bit pattern 0x7f800000
! (+Inf) if |x| or |y| is exactly 0x7f800000, otherwise x * y.
827 sethi %hi(0x7f800000),%i0
828 cmp %l3,%i0 ! hx ? 0x7f800000
829 bge,pt %icc,2f ! if ( hx >= 0x7f800000 )
832 cmp %l4,%i0 ! hy ? 0x7f800000
833 bge,pt %icc,2f ! if ( hy >= 0x7f800000 )
836 fsmuld %f17,%f17,%f44 ! x * (double)x
837 fsmuld %f8,%f8,%f24 ! y * (double)y
838 faddd %f44,%f24,%f24 ! x * (double)x + y * (double)y
839 fsqrtd %f24,%f24 ! hyp = sqrt(x * (double)x + y * (double)y);
840 fcmped %f24,DFMAX ! hyp ? DFMAX
! Annulled branch: the fmuls in the delay slot runs only when the branch is taken.
841 fbug,a 1f ! if ( hyp > DFMAX )
842 fmuls FMAX,FMAX,%f20 ! ftmp0 = FMAX * FMAX;
844 fdtos %f24,%f20 ! ftmp0 = (float)hyp;
846 st %f20,[%l7] ! *pz = ftmp0;
847 add %l7,stridez,%l7 ! pz += stridez
848 add %i1,stridex,%i1 ! px += stridex
850 add %i2,stridey,%i2 ! py += stridey
852 sub counter,1,counter ! counter--
854 fcmps %f17,%f8 ! exceptions
855 cmp %l3,%i0 ! hx ? 0x7f800000
! Annulled branches: each st in a delay slot runs only when its branch is taken.
856 be,a %icc,1f ! if ( hx == 0x7f800000 )
857 st %i0,[%l7] ! *(int*)pz = 0x7f800000;
859 cmp %l4,%i0 ! hy ? 0x7f800000
860 be,a %icc,1f ! if ( hy == 0x7f800000 )
861 st %i0,[%l7] ! *(int*)pz = 0x7f800000;
863 fmuls %f17,%f8,%f8 ! x * y
864 st %f8,[%l7] ! *pz = x * y;
867 add %l7,stridez,%l7 ! pz += stridez
868 add %i1,stridex,%i1 ! px += stridex
870 add %i2,stridey,%i2 ! py += stridey
872 sub counter,1,counter ! counter--
! Out-of-line .updateN fragments (their labels, compares and branches are not
! visible in this excerpt). Each visible fragment writes an adjusted element
! count to tmp_counter and, in the later fragments, pointer values to
! tmp_px / tmp_py; a few also repeat an instruction of the main pipeline.
! NOTE(review): presumably this records where processing must resume after an
! out-of-range element is detected while other elements are in flight --
! confirm against the full file.
885 sub counter,1,counter
886 st counter,[%fp+tmp_counter]
900 sub counter,1,counter
901 st counter,[%fp+tmp_counter]
915 sub counter,2,counter
916 st counter,[%fp+tmp_counter]
930 sub counter,2,counter
931 st counter,[%fp+tmp_counter]
945 sub counter,3,counter
946 st counter,[%fp+tmp_counter]
961 sub counter,3,counter
962 st counter,[%fp+tmp_counter]
976 sub counter,4,counter
977 st counter,[%fp+tmp_counter]
992 sub counter,4,counter
993 st counter,[%fp+tmp_counter]
1005 stx %o5,[%fp+tmp_px]
1006 stx %o7,[%fp+tmp_py]
1008 sub counter,5,counter
1009 st counter,[%fp+tmp_counter]
1021 stx %o5,[%fp+tmp_px]
1022 stx %o7,[%fp+tmp_py]
1024 sub counter,5,counter
1025 st counter,[%fp+tmp_counter]
1032 fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
1033 and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
1034 ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1];
1035 faddd %f54,K1,%f54 ! (4_1) res0 += K1;
1041 stx %i1,[%fp+tmp_px]
1043 stx %i5,[%fp+tmp_py]
1045 sub counter,6,counter
1046 st counter,[%fp+tmp_counter]
1057 stx %i1,[%fp+tmp_px]
1058 stx %i5,[%fp+tmp_py]
1060 sub counter,1,counter
1061 st counter,[%fp+tmp_counter]
1072 stx %i0,[%fp+tmp_px]
1074 stx %o4,[%fp+tmp_py]
1076 sub counter,2,counter
1077 st counter,[%fp+tmp_counter]
1088 stx %i0,[%fp+tmp_px]
1089 stx %o4,[%fp+tmp_py]
1091 sub counter,2,counter
1092 st counter,[%fp+tmp_counter]
1103 stx %i1,[%fp+tmp_px]
1105 stx %i5,[%fp+tmp_py]
1107 sub counter,3,counter
1108 st counter,[%fp+tmp_counter]
1120 stx %i2,[%fp+tmp_px]
1121 stx %i5,[%fp+tmp_py]
1123 sub counter,3,counter
1124 st counter,[%fp+tmp_counter]
1131 faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0;
1136 stx %i1,[%fp+tmp_px]
1137 stx %i2,[%fp+tmp_py]
1139 sub counter,4,counter
1140 st counter,[%fp+tmp_counter]
1151 stx %i1,[%fp+tmp_px]
1152 stx %i2,[%fp+tmp_py]
1154 sub counter,4,counter
1155 st counter,[%fp+tmp_counter]
1166 stx %l7,[%fp+tmp_px]
1167 stx %o7,[%fp+tmp_py]
1169 sub counter,5,counter
1170 st counter,[%fp+tmp_counter]
1177 fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1);
1182 stx %l7,[%fp+tmp_px]
1183 stx %o7,[%fp+tmp_py]
1185 sub counter,5,counter
1186 st counter,[%fp+tmp_counter]
1197 stx %l7,[%fp+tmp_px]
1198 stx %o7,[%fp+tmp_py]
1200 sub counter,5,counter
1201 st counter,[%fp+tmp_counter]
1208 faddd %f54,K1,%f54 ! (4_1) res0 += K1;
1213 stx %i1,[%fp+tmp_px]
1215 stx %g1,[%fp+tmp_py]
1217 sub counter,6,counter
1218 st counter,[%fp+tmp_counter]