4 | The entry point stan computes the tangent of
6 | stand does the same except for denormalized input.
8 | Input: Double-extended number X in location pointed to
9 | by address register a0.
11 | Output: The value tan(X) returned in floating-point register Fp0.
13 | Accuracy and Monotonicity: The returned result is within 3 ulp in
14 | 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
15 | result is subsequently rounded to double precision. The
16 | result is provably monotonic in double precision.
18 | Speed: The program sTAN takes approximately 170 cycles for
19 | input argument X such that |X| < 15Pi, which is the usual
24 | 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
26 | 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
27 | k = N mod 2, so in particular, k = 0 or 1.
29 | 3. If k is odd, go to 5.
31 | 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
32 | rational function U/V where
33 | U = r + r*s*(P1 + s*(P2 + s*P3)), and
34 | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r.
37 | 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
38 | rational function U/V where
39 | U = r + r*s*(P1 + s*(P2 + s*P3)), and
40 | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
41 | -Cot(r) = -V/U. Exit.
43 | 6. If |X| > 1, go to 8.
45 | 7. (|X|<2**(-40)) Tan(X) = X. Exit.
47 | 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
50 | Copyright (C) Motorola, Inc. 1990
53 | THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
54 | The copyright notice above does not evidence any
55 | actual or intended publication of such source code.
57 |STAN idnt 2,1 | Motorola 040 Floating Point Software Package
63 BOUNDS1: .long 0x3FD78000,0x4004BC7E
64 TWOBYPI: .long 0x3FE45F30,0x6DC9C883
66 TANQ4: .long 0x3EA0B759,0xF50F8688
67 TANP3: .long 0xBEF2BAA5,0xA8924F04
69 TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
71 TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
73 TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
75 TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
77 TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
79 INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
81 TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
82 TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
84 |--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
85 |--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
89 .long 0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
90 .long 0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
91 .long 0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
92 .long 0xC0040000,0xB6365E22,0xEE46F000,0x21480000
93 .long 0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
94 .long 0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
95 .long 0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
96 .long 0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
97 .long 0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
98 .long 0xC0040000,0x90836524,0x88034B96,0x20B00000
99 .long 0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
100 .long 0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
101 .long 0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
102 .long 0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
103 .long 0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
104 .long 0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
105 .long 0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
106 .long 0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
107 .long 0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
108 .long 0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
109 .long 0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
110 .long 0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
111 .long 0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
112 .long 0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
113 .long 0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
114 .long 0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
115 .long 0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
116 .long 0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
117 .long 0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
118 .long 0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
119 .long 0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
120 .long 0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
121 .long 0x00000000,0x00000000,0x00000000,0x00000000
122 .long 0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
123 .long 0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
124 .long 0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
125 .long 0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
126 .long 0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
127 .long 0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
128 .long 0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
129 .long 0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
130 .long 0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
131 .long 0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
132 .long 0x40030000,0x8A3AE64F,0x76F80584,0x21080000
133 .long 0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
134 .long 0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
135 .long 0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
136 .long 0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
137 .long 0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
138 .long 0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
139 .long 0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
140 .long 0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
141 .long 0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
142 .long 0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
143 .long 0x40040000,0x8A3AE64F,0x76F80584,0x21880000
144 .long 0x40040000,0x90836524,0x88034B96,0xA0B00000
145 .long 0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
146 .long 0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
147 .long 0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
148 .long 0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
149 .long 0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
150 .long 0x40040000,0xB6365E22,0xEE46F000,0xA1480000
151 .long 0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
152 .long 0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
153 .long 0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
166 |--TAN(X) = X FOR DENORMALIZED X
172 fmovex (%a0),%fp0 | ...LOAD INPUT
176 andil #0x7FFFFFFF,%d0
178 cmpil #0x3FD78000,%d0 | ...|X| >= 2**(-40)?
182 cmpil #0x4004BC7E,%d0 | ...|X| < 15 PI?
188 |--THIS IS THE USUAL CASE, |X| <= 15 PI.
189 |--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
191 fmuld TWOBYPI,%fp1 | ...X*2/PI
193 |--HIDE THE NEXT TWO INSTRUCTIONS
194 leal PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32
197 fmovel %fp1,%d0 | ...CONVERT TO INTEGER
200 addal %d0,%a1 | ...ADDRESS N*PIBY2 IN Y1, Y2
202 fsubx (%a1)+,%fp0 | ...X-Y1
205 fsubs (%a1),%fp0 | ...FP0 IS R = (X-Y1)-Y2
208 andil #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0
216 fmulx %fp1,%fp1 | ...S = R*R
221 fmulx %fp1,%fp3 | ...SQ4
222 fmulx %fp1,%fp2 | ...SP3
224 faddd TANQ3,%fp3 | ...Q3+SQ4
225 faddx TANP2,%fp2 | ...P2+SP3
227 fmulx %fp1,%fp3 | ...S(Q3+SQ4)
228 fmulx %fp1,%fp2 | ...S(P2+SP3)
230 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
231 faddx TANP1,%fp2 | ...P1+S(P2+SP3)
233 fmulx %fp1,%fp3 | ...S(Q2+S(Q3+SQ4))
234 fmulx %fp1,%fp2 | ...S(P1+S(P2+SP3))
236 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
237 fmulx %fp0,%fp2 | ...RS(P1+S(P2+SP3))
239 fmulx %fp3,%fp1 | ...S(Q1+S(Q2+S(Q3+SQ4)))
242 faddx %fp2,%fp0 | ...R+RS(P1+S(P2+SP3))
245 fadds #0x3F800000,%fp1 | ...1+S(Q1+...)
247 fmovel %d1,%fpcr |restore users exceptions
248 fdivx %fp1,%fp0 |last inst - possible exception set
254 fmulx %fp0,%fp0 | ...S = R*R
259 fmulx %fp0,%fp3 | ...SQ4
260 fmulx %fp0,%fp2 | ...SP3
262 faddd TANQ3,%fp3 | ...Q3+SQ4
263 faddx TANP2,%fp2 | ...P2+SP3
265 fmulx %fp0,%fp3 | ...S(Q3+SQ4)
266 fmulx %fp0,%fp2 | ...S(P2+SP3)
268 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
269 faddx TANP1,%fp2 | ...P1+S(P2+SP3)
271 fmulx %fp0,%fp3 | ...S(Q2+S(Q3+SQ4))
272 fmulx %fp0,%fp2 | ...S(P1+S(P2+SP3))
274 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
275 fmulx %fp1,%fp2 | ...RS(P1+S(P2+SP3))
277 fmulx %fp3,%fp0 | ...S(Q1+S(Q2+S(Q3+SQ4)))
280 faddx %fp2,%fp1 | ...R+RS(P1+S(P2+SP3))
281 fadds #0x3F800000,%fp0 | ...1+S(Q1+...)
285 eoril #0x80000000,(%sp)
287 fmovel %d1,%fpcr |restore users exceptions
288 fdivx (%sp)+,%fp0 |last inst - possible exception set
293 |--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
294 |--IF |X| < 2**(-40), RETURN X OR 1.
295 cmpil #0x3FFF8000,%d0
301 fmovel %d1,%fpcr |restore users exceptions
302 fmovex (%sp)+,%fp0 |last inst - possible exception set
308 |--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
309 |--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
310 |--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
312 fmovemx %fp2-%fp5,-(%a7) | ...save FP2 through FP5
314 fmoves #0x00000000,%fp1
316 |--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
317 |--there is a danger of unwanted overflow in first LOOP iteration. In this
318 |--case, reduce argument by one remainder step to make subsequent reduction
320 cmpil #0x7ffeffff,%d0 |is argument dangerously large?
322 movel #0x7ffe0000,FP_SCR2(%a6) |yes
323 | ;create 2**16383*PI/2
324 movel #0xc90fdaa2,FP_SCR2+4(%a6)
326 ftstx %fp0 |test sign of argument
327 movel #0x7fdc0000,FP_SCR3(%a6) |create low half of 2**16383*
329 movel #0x85a308d3,FP_SCR3+4(%a6)
332 orw #0x8000,FP_SCR2(%a6) |positive arg
333 orw #0x8000,FP_SCR3(%a6)
335 faddx FP_SCR2(%a6),%fp0 |high part of reduction is exact
336 fmovex %fp0,%fp1 |save high result in fp1
337 faddx FP_SCR3(%a6),%fp0 |low part of reduction
338 fsubx %fp0,%fp1 |determine low component of result
339 faddx FP_SCR3(%a6),%fp1 |fp0/fp1 are reduced argument.
341 |--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
342 |--integer quotient will be stored in N
343 |--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)
346 fmovex %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2
348 movel %d0,%a1 | ...save a copy of D0
349 andil #0x00007FFF,%d0
350 subil #0x00003FFF,%d0 | ...D0 IS K
354 subil #27,%d0 | ...D0 IS L := K-27
355 movel #0,ENDFLAG(%a6)
358 clrl %d0 | ...D0 IS L := 0
359 movel #1,ENDFLAG(%a6)
362 |--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
363 |--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
365 |--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
366 |--2**L * (PIby2_1), 2**L * (PIby2_2)
368 movel #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI
369 subl %d0,%d2 | ...BIASED EXPO OF 2**(-L)*(2/PI)
371 movel #0xA2F9836E,FP_SCR1+4(%a6)
372 movel #0x4E44152A,FP_SCR1+8(%a6)
373 movew %d2,FP_SCR1(%a6) | ...FP_SCR1 is 2**(-L)*(2/PI)
376 fmulx FP_SCR1(%a6),%fp2
377 |--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
378 |--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
379 |--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
380 |--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
381 |--US THE DESIRED VALUE IN FLOATING POINT.
383 |--HIDE SIX CYCLES OF INSTRUCTION
386 andil #0x80000000,%d2
387 oril #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL
388 movel %d2,TWOTO63(%a6)
391 addil #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2)
394 fadds TWOTO63(%a6),%fp2 | ...THE FRACTIONAL PART OF FP1 IS ROUNDED
396 |--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2
397 movew %d2,FP_SCR2(%a6)
399 movel #0xC90FDAA2,FP_SCR2+4(%a6)
400 clrl FP_SCR2+8(%a6) | ...FP_SCR2 is 2**(L) * Piby2_1
403 fsubs TWOTO63(%a6),%fp2 | ...FP2 is N
405 addil #0x00003FDD,%d0
406 movew %d0,FP_SCR3(%a6)
408 movel #0x85A308D3,FP_SCR3+4(%a6)
409 clrl FP_SCR3+8(%a6) | ...FP_SCR3 is 2**(L) * Piby2_2
411 movel ENDFLAG(%a6),%d0
413 |--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
414 |--P2 = 2**(L) * Piby2_2
416 fmulx FP_SCR2(%a6),%fp4 | ...W = N*P1
418 fmulx FP_SCR3(%a6),%fp5 | ...w = N*P2
420 |--we want P+p = W+w but |p| <= half ulp of P
421 |--Then, we need to compute A := R-P and a := r-p
422 faddx %fp5,%fp3 | ...FP3 is P
423 fsubx %fp3,%fp4 | ...W-P
425 fsubx %fp3,%fp0 | ...FP0 is A := R - P
426 faddx %fp5,%fp4 | ...FP4 is p = (W-P)+w
428 fmovex %fp0,%fp3 | ...FP3 A
429 fsubx %fp4,%fp1 | ...FP1 is a := r - p
431 |--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
432 |--|r| <= half ulp of R.
433 faddx %fp1,%fp0 | ...FP0 is R := A+a
434 |--No need to calculate r if this is the last loop
438 |--Need to calculate r
439 fsubx %fp0,%fp3 | ...A-R
440 faddx %fp3,%fp1 | ...FP1 is r := (A-R)+a
446 fmovemx (%a7)+,%fp2-%fp5