4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
! Lookup table: 64 .word lines of 8 bytes each = 0x200 bytes, i.e. 32
! entries of 16 bytes.  This matches the index computed in the loop,
! "j = (j >> 11) & 0x1f0".  Per the loop comments "h = n * ln2hi + TBL[j]"
! and "t = n * ln2lo + TBL[j+1]", each entry is a (leading, trailing) pair
! of doubles, written high word first.  Entry 18 (0-based) is exactly zero.
! NOTE(review): the table label and the loads that read it are not visible
! in this excerpt; the values look like split-precision logarithms, to be
! confirmed against whatever generated them.
36 .word 0xbfd522ae, 0x0738a000
37 .word 0xbd2ebe70, 0x8164c759
38 .word 0xbfd3c252, 0x77333000
39 .word 0xbd183b54, 0xb606bd5c
40 .word 0xbfd26962, 0x1134e000
41 .word 0x3d31b61f, 0x10522625
42 .word 0xbfd1178e, 0x8227e000
43 .word 0xbd31ef78, 0xce2d07f2
44 .word 0xbfcf991c, 0x6cb3c000
45 .word 0x3d390d04, 0xcd7cc834
46 .word 0xbfcd1037, 0xf2656000
47 .word 0x3d084a7e, 0x75b6f6e4
48 .word 0xbfca93ed, 0x3c8ae000
49 .word 0x3d287243, 0x50562169
50 .word 0xbfc823c1, 0x6551a000
51 .word 0xbd1e0ddb, 0x9a631e83
52 .word 0xbfc5bf40, 0x6b544000
53 .word 0x3d127023, 0xeb68981c
54 .word 0xbfc365fc, 0xb015a000
55 .word 0x3d3fd3a0, 0xafb9691b
56 .word 0xbfc1178e, 0x8227e000
57 .word 0xbd21ef78, 0xce2d07f2
58 .word 0xbfbda727, 0x63844000
59 .word 0xbd1a8940, 0x1fa71733
60 .word 0xbfb9335e, 0x5d594000
61 .word 0xbd23115c, 0x3abd47da
62 .word 0xbfb4d311, 0x5d208000
63 .word 0x3cf53a25, 0x82f4e1ef
64 .word 0xbfb08598, 0xb59e4000
65 .word 0x3d17e5dd, 0x7009902c
66 .word 0xbfa894aa, 0x149f8000
67 .word 0xbd39a19a, 0x8be97661
68 .word 0xbfa0415d, 0x89e78000
69 .word 0x3d3dddc7, 0xf461c516
70 .word 0xbf902056, 0x58930000
71 .word 0xbd3611d2, 0x7c8e8417
72 .word 0x00000000, 0x00000000
73 .word 0x00000000, 0x00000000
74 .word 0x3f9f829b, 0x0e780000
75 .word 0x3d298026, 0x7c7e09e4
76 .word 0x3faf0a30, 0xc0110000
77 .word 0x3d48a998, 0x5f325c5c
78 .word 0x3fb6f0d2, 0x8ae58000
79 .word 0xbd34b464, 0x1b664613
80 .word 0x3fbe2707, 0x6e2b0000
81 .word 0xbd2a342c, 0x2af0003c
82 .word 0x3fc29552, 0xf8200000
83 .word 0xbd35b967, 0xf4471dfc
84 .word 0x3fc5ff30, 0x70a78000
85 .word 0x3d43d3c8, 0x73e20a07
86 .word 0x3fc9525a, 0x9cf44000
87 .word 0x3d46b476, 0x41307539
88 .word 0x3fcc8ff7, 0xc79a8000
89 .word 0x3d4a21ac, 0x25d81ef3
90 .word 0x3fcfb918, 0x6d5e4000
91 .word 0xbd0d572a, 0xab993c87
92 .word 0x3fd1675c, 0xababa000
93 .word 0x3d38380e, 0x731f55c4
94 .word 0x3fd2e8e2, 0xbae12000
95 .word 0xbd267b1e, 0x99b72bd8
96 .word 0x3fd4618b, 0xc21c6000
97 .word 0xbd13d82f, 0x484c84cc
98 .word 0x3fd5d1bd, 0xbf580000
99 .word 0x3d4394a1, 0x1b1c1ee4
! Seven 8-byte constants (56 = 0x38 bytes), each written high word first.
! Read as IEEE doubles:
!   0x40000000,0x00000000   2.0 (the "two" of the "two - v.d" comments)
!   the next three          about 2/3, 2/5 and 0.2858; presumably the
!                           polynomial coefficients behind "q = ..."
!                           (TODO confirm, their loads are not visible)
!   the two "scaled" ones   ln2 split into high and low parts, each
!                           multiplied by 2**-20; presumably the ln2hi and
!                           ln2lo of "n * ln2hi" / "n * ln2lo", where n
!                           holds the exponent field in bits 20 and up
!   0xffff8000,0x00000000   bit mask matching "v.l &= 0xffff8000..."
101 .word 0x40000000,0x00000000
102 .word 0x3fe55555,0x555571da
103 .word 0x3fd99999,0x8702be3a
104 .word 0x3fd24af7,0x3f4569b1
105 .word 0x3ea62e42,0xfee00000 ! scaled by 2**-20
106 .word 0x3caa39ef,0x35793c76 ! scaled by 2**-20
107 .word 0xffff8000,0x00000000
! Byte offsets, relative to the base held in %g1, of three 32-bit
! constants fetched with "ld [%g1+...]" during setup.  Each macro is named
! after the value it is expected to address.  NOTE(review): the .word
! lines supplying these values, and any other offset macros, are not
! visible in this excerpt.
120 #define ox43200000 0x238
121 #define oxfff00000 0x23c
122 #define oxc0194000 0x240
125 ! local storage indices
! Offsets of four 8-byte scratch slots, applied to %fp (see
! "add %fp,jnk,%o0" in the setup code).  STACK_BIAS is defined outside
! this file excerpt.
127 #define jnk STACK_BIAS-0x8
128 #define tmp2 STACK_BIAS-0x10
129 #define tmp1 STACK_BIAS-0x18
130 #define tmp0 STACK_BIAS-0x20
! NOTE(review): the macro this comment describes (used as "tmps" by the
! save instruction) is not visible in this excerpt.
131 ! sizeof temp storage - must be a multiple of 16 for V9
163 ! f2 v0,(two-v0)-u0,z0
168 ! f12 v1,(two-v1)-u1,z1
173 ! f22 v2,(two-v2)-u2,q2
! Function entry and one-time setup.  NOTE(review): the entry label, the
! code that places the table address in %g1 and several constant loads
! are not visible in this excerpt.
! Open a new register window and reserve the minimum frame plus tmps
! bytes of scratch space.
198 save %sp,-SA(MINFRAME)-tmps,%sp
202 wr %g0,0x82,%asi ! set %asi for non-faulting loads
! Integer constants kept in locals: %l4 = 0x94000 (bias added to ix before
! it is shifted into a table index), %l6 = 0x000fffff and %l7 = 0x7ff00000
! (the bounds named in the range-check comments).
203 sethi %hi(0x94000),%l4
204 sethi %hi(0x000fffff),%l6
205 or %l6,%lo(0x000fffff),%l6
206 sethi %hi(0x7ff00000),%l7
! Single-precision registers holding 32-bit patterns for the partitioned
! add/subtract instructions: %f29, %f28 and %f31 receive the words at the
! ox43200000, oxfff00000 and oxc0194000 offsets respectively.
214 ld [%g1+ox43200000],%f29
215 ld [%g1+oxfff00000],%f28
216 ld [%g1+oxc0194000],%f31
! Multiply the x stride by 8 so it can be added directly to the byte
! pointer in %i1 (each argument is read as two 32-bit words).
218 sll %i2,3,%i2 ! scale strides
! %o0 = address of the jnk scratch slot.  NOTE(review): how %o0 is used
! afterwards is not visible here.
220 add %fp,jnk,%o0 ! precondition loop
! Pipeline stage 0.  Starts a new element in %f0 (u), %f2 (v) and
! %f4 (n, later f), with the integer image of its high word in %l0, and
! preloads the following argument into %l1.  Interleaved with that, it
! advances older elements: "c = h + f" below consumes %f54 and %f24,
! which are written by the stage that works in %f20-%f26, and %f8
! carries the result that is being completed.
! NOTE(review): labels, compares, delay slots and stores belonging to
! this stage are not visible in this excerpt.
241 ld [%i1],%f0 ! u.l[0] = *x
243 ld [%i1+4],%f1 ! u.l[1] = *(1+x)
250 fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000
251 fmuld %f6,%f2,%f8 ! (previous iteration)
254 bge,pn %icc,.range0 ! ix <= 0x000fffff or >= 0x7ff00000
258 add %i1,%i2,%i1 ! x += stridex
259 add %i3,%i4,%i3 ! y += stridey
260 fpsub32s %f0,%f4,%f0 ! u.l[0] -= n
! Load through %asi (set up for non-faulting access) because this address
! may lie beyond the last valid argument.
263 lda [%i1]%asi,%l1 ! preload next argument
264 add %l0,%l4,%l0 ! j = ix + 0x94000
265 fpadd32s %f0,%f30,%f2 ! v.l[0] = u.l[0] + 0x4000
268 srl %l0,11,%l0 ! j = (j >> 11) & 0x1f0
269 fand %f2,%f50,%f2 ! v.l &= 0xffff8000...
273 fitod %f4,%f32 ! (double) n
276 fsubd %f0,%f2,%f4 ! f = u.d - v.d
! Only the denominator u.d + v.d is formed here; the comment names the
! quantity being built up.
278 faddd %f0,%f2,%f6 ! s = f / (u.d + v.d)
280 fsubd %f40,%f2,%f2 ! two - v.d
281 fmuld %f32,%f60,%f34 ! h = n * ln2hi + TBL[j]
283 faddd %f8,%f18,%f8 ! y = c + (t + q)
284 fmuld %f32,%f62,%f32 ! t = n * ln2lo + TBL[j+1]
288 faddd %f54,%f24,%f56 ! c = h + f
289 fmuld %f26,%f26,%f22 ! z = s * s
298 fsubd %f56,%f54,%f54 ! t += f - (c - h)
299 fmuld %f22,%f58,%f20 ! q = ...
301 fsubd %f2,%f0,%f2 ! (two - v.d) - u.d
! Exit to the drain code; the instruction that sets the condition codes
! tested here is not visible in this excerpt.
321 ble,pn %icc,.endloop0
! Pipeline stage 1.  Same sequence as stage 0 with the register roles
! rotated: the new element lives in %f10 (u), %f12 (v) and %f14 (n, later
! f), its integer high word is in %l1, and the following argument is
! preloaded into %l2.  "c = h + f" below consumes %f34 and %f4, which
! stage 0 produced.
! NOTE(review): the argument loads, label, compares and delay slots of
! this stage are not visible in this excerpt.
329 fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000
330 fmuld %f16,%f12,%f8 ! (previous iteration)
333 bge,pn %icc,.range1 ! ix <= 0x000fffff or >= 0x7ff00000
337 add %i1,%i2,%i1 ! x += stridex
338 add %i3,%i4,%i3 ! y += stridey
339 fpsub32s %f10,%f14,%f10 ! u.l[0] -= n
! Load through %asi (set up for non-faulting access) because this address
! may lie beyond the last valid argument.
342 lda [%i1]%asi,%l2 ! preload next argument
343 add %l1,%l4,%l1 ! j = ix + 0x94000
344 fpadd32s %f10,%f30,%f12 ! v.l[0] = u.l[0] + 0x4000
347 srl %l1,11,%l1 ! j = (j >> 11) & 0x1f0
348 fand %f12,%f50,%f12 ! v.l &= 0xffff8000...
352 fitod %f14,%f42 ! (double) n
355 fsubd %f10,%f12,%f14 ! f = u.d - v.d
! Only the denominator u.d + v.d is formed here; the comment names the
! quantity being built up.
357 faddd %f10,%f12,%f16 ! s = f / (u.d + v.d)
359 fsubd %f40,%f12,%f12 ! two - v.d
360 fmuld %f42,%f60,%f44 ! h = n * ln2hi + TBL[j]
362 faddd %f8,%f18,%f8 ! y = c + (t + q)
363 fmuld %f42,%f62,%f42 ! t = n * ln2lo + TBL[j+1]
367 faddd %f34,%f4,%f36 ! c = h + f
368 fmuld %f6,%f6,%f2 ! z = s * s
377 fsubd %f36,%f34,%f34 ! t += f - (c - h)
378 fmuld %f2,%f58,%f0 ! q = ...
380 fsubd %f12,%f10,%f12 ! (two - v.d) - u.d
! Exit to the drain code; the instruction that sets the condition codes
! tested here is not visible in this excerpt.
400 ble,pn %icc,.endloop1
! Pipeline stage 2.  Same sequence again with the roles rotated once
! more: the new element lives in %f20 (u), %f22 (v) and %f24 (n, later f),
! its integer high word is in %l2, and the following argument is
! preloaded into %l0 for stage 0.  "c = h + f" below consumes %f44 and
! %f14, which stage 1 produced.
! NOTE(review): the argument loads, label, compares, delay slots and the
! branch that closes the loop are not visible in this excerpt.
408 fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000
409 fmuld %f26,%f22,%f8 ! (previous iteration)
412 bge,pn %icc,.range2 ! ix <= 0x000fffff or >= 0x7ff00000
416 add %i1,%i2,%i1 ! x += stridex
417 add %i3,%i4,%i3 ! y += stridey
418 fpsub32s %f20,%f24,%f20 ! u.l[0] -= n
! Load through %asi (set up for non-faulting access) because this address
! may lie beyond the last valid argument.
421 lda [%i1]%asi,%l0 ! preload next argument
422 add %l2,%l4,%l2 ! j = ix + 0x94000
423 fpadd32s %f20,%f30,%f22 ! v.l[0] = u.l[0] + 0x4000
426 srl %l2,11,%l2 ! j = (j >> 11) & 0x1f0
427 fand %f22,%f50,%f22 ! v.l &= 0xffff8000...
431 fitod %f24,%f52 ! (double) n
434 fsubd %f20,%f22,%f24 ! f = u.d - v.d
! Only the denominator u.d + v.d is formed here; the comment names the
! quantity being built up.
436 faddd %f20,%f22,%f26 ! s = f / (u.d + v.d)
438 fsubd %f40,%f22,%f22 ! two - v.d
439 fmuld %f52,%f60,%f54 ! h = n * ln2hi + TBL[j]
441 faddd %f8,%f18,%f8 ! y = c + (t + q)
442 fmuld %f52,%f62,%f52 ! t = n * ln2lo + TBL[j+1]
446 faddd %f44,%f14,%f46 ! c = h + f
447 fmuld %f16,%f16,%f12 ! z = s * s
456 fsubd %f46,%f44,%f44 ! t += f - (c - h)
457 fmuld %f12,%f58,%f10 ! q = ...
459 fsubd %f22,%f20,%f22 ! (two - v.d) - u.d
484 ! Once we get to the last element, we loop three more times to finish
485 ! the computations in progress. This means we will load past the end
486 ! of the argument vector, but since we use non-faulting loads and never
487 ! use the data, the only potential problem is cache miss. (Note that
488 ! when the argument is 2, the only exception that occurs in the compu-
489 ! tation is an inexact result in the final addition, and we break out
490 ! of the "extra" iterations before then.)
! Three drain entry points follow, one per pipeline stage.  Each replaces
! the pending "next argument" high word (%l0, %l1 or %l2) with 0x40000000,
! the high word of 2.0, so the extra iterations stay on the in-range path.
! NOTE(review): the entry labels and the branches back into the loop are
! not visible in this excerpt.
492 sethi %hi(0x40000000),%l0 ! "next argument" = two
502 sethi %hi(0x40000000),%l1 ! "next argument" = two
512 sethi %hi(0x40000000),%l2 ! "next argument" = two
! Out-of-range handler for the stage-0 element (%f0/%f1, integer high
! word in %l0); reached from "bge,pn %icc,.range0".  Three cases are
! visible:
!   - ix <= 0x000fffff and x nonzero: rescale x with fxtod, recompute n
!     and u as in the main path, then subtract 0x43200000 (%f29) from n
!     to compensate for the rescaling
!   - x is +0 or -0 (local label 1): fdivs raises division-by-zero and the
!     high word 0xfff00000 of -inf (%f28) is stored to *y
!   - unsigned ix >= 0x7ff00000 (local label 2): the sign bit is shifted
!     out to test for -0; otherwise the result is built as (x + |x|) * inf
! NOTE(review): the labels, compares, delay slots, remaining stores and
! the branches that rejoin the loop are not visible in this excerpt.
524 bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000
527 fxtod %f0,%f0 ! scale by 2**1074 w/o trapping
529 add %i1,%i2,%i1 ! x += stridex
531 be,pn %icc,1f ! if x == 0
533 add %i3,%i4,%i3 ! y += stridey
534 fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000
536 fpsub32s %f0,%f4,%f0 ! u.l[0] -= n
540 fpsub32s %f4,%f29,%f4 ! n -= 0x43200000
542 fdivs %f29,%f1,%f4 ! raise div-by-zero
545 st %f28,[%i3] ! store -inf
547 sll %l0,1,%l0 ! lop off sign bit
548 add %i1,%i2,%i1 ! x += stridex
550 be,pn %icc,1b ! if x == -0
552 add %i3,%i4,%i3 ! y += stridey
553 fabsd %f0,%f4 ! *y = (x + |x|) * inf
! This element never enters the pipeline, so the exit goes to the
! .endloop2 drain entry rather than .endloop0.
561 ble,pn %icc,.endloop2
! An ordinary (faulting) load is used here, unlike the lda preloads in the
! main loop.
564 ld [%i1],%l0 ! get next argument
! Out-of-range handler for the stage-1 element (%f10/%f11, integer high
! word in %l1); reached from "bge,pn %icc,.range1".  Three cases are
! visible:
!   - ix <= 0x000fffff and x nonzero: rescale x with fxtod, recompute n
!     and u as in the main path, then subtract 0x43200000 (%f29) from n
!     to compensate for the rescaling
!   - x is +0 or -0 (local label 1): fdivs raises division-by-zero and the
!     high word 0xfff00000 of -inf (%f28) is stored to *y
!   - unsigned ix >= 0x7ff00000 (local label 2): the sign bit is shifted
!     out to test for -0; otherwise the result is built as (x + |x|) * inf
! NOTE(review): the labels, compares, delay slots, remaining stores and
! the branches that rejoin the loop are not visible in this excerpt.
574 bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000
577 fxtod %f10,%f10 ! scale by 2**1074 w/o trapping
579 add %i1,%i2,%i1 ! x += stridex
581 be,pn %icc,1f ! if x == 0
583 add %i3,%i4,%i3 ! y += stridey
584 fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000
586 fpsub32s %f10,%f14,%f10 ! u.l[0] -= n
590 fpsub32s %f14,%f29,%f14 ! n -= 0x43200000
592 fdivs %f29,%f11,%f14 ! raise div-by-zero
595 st %f28,[%i3] ! store -inf
597 sll %l1,1,%l1 ! lop off sign bit
598 add %i1,%i2,%i1 ! x += stridex
600 be,pn %icc,1b ! if x == -0
602 add %i3,%i4,%i3 ! y += stridey
603 fabsd %f10,%f14 ! *y = (x + |x|) * inf
! This element never enters the pipeline, so the exit goes to the
! .endloop0 drain entry rather than .endloop1.
611 ble,pn %icc,.endloop0
! An ordinary (faulting) load is used here, unlike the lda preloads in the
! main loop.
614 ld [%i1],%l1 ! get next argument
! Out-of-range handler for the stage-2 element (%f20/%f21, integer high
! word in %l2); reached from "bge,pn %icc,.range2".  Three cases are
! visible:
!   - ix <= 0x000fffff and x nonzero: rescale x with fxtod, recompute n
!     and u as in the main path, then subtract 0x43200000 (%f29) from n
!     to compensate for the rescaling
!   - x is +0 or -0 (local label 1): fdivs raises division-by-zero and the
!     high word 0xfff00000 of -inf (%f28) is stored to *y
!   - unsigned ix >= 0x7ff00000 (local label 2): the sign bit is shifted
!     out to test for -0; otherwise the result is built as (x + |x|) * inf
! NOTE(review): the labels, compares, delay slots, remaining stores and
! everything after the final load are not visible in this excerpt.
624 bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000
627 fxtod %f20,%f20 ! scale by 2**1074 w/o trapping
629 add %i1,%i2,%i1 ! x += stridex
631 be,pn %icc,1f ! if x == 0
633 add %i3,%i4,%i3 ! y += stridey
634 fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000
636 fpsub32s %f20,%f24,%f20 ! u.l[0] -= n
640 fpsub32s %f24,%f29,%f24 ! n -= 0x43200000
642 fdivs %f29,%f21,%f24 ! raise div-by-zero
645 st %f28,[%i3] ! store -inf
647 sll %l2,1,%l2 ! lop off sign bit
648 add %i1,%i2,%i1 ! x += stridex
650 be,pn %icc,1b ! if x == -0
652 add %i3,%i4,%i3 ! y += stridey
653 fabsd %f20,%f24 ! *y = (x + |x|) * inf
! This element never enters the pipeline, so the exit goes to the
! .endloop1 drain entry rather than .endloop2.
661 ble,pn %icc,.endloop1
! An ordinary (faulting) load is used here, unlike the lda preloads in the
! main loop.
664 ld [%i1],%l2 ! get next argument