Automatic merge of rsync://rsync.kernel.org/pub/scm/linux/kernel/git/gregkh/driver...
[linux-2.6/verdex.git] / arch / m68k / fpsp040 / stan.S
blobb5c2a196e617fec38e16e33eb10411a5ab9f2499
2 |       stan.sa 3.3 7/29/91
4 |       The entry point stan computes the tangent of
5 |       an input argument;
6 |       stand does the same except for denormalized input.
8 |       Input: Double-extended number X in location pointed to
9 |               by address register a0.
11 |       Output: The value tan(X) returned in floating-point register Fp0.
13 |       Accuracy and Monotonicity: The returned result is within 3 ulp in
14 |               64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
15 |               result is subsequently rounded to double precision. The
16 |               result is provably monotonic in double precision.
18 |       Speed: The program sTAN takes approximately 170 cycles for
19 |               input argument X such that |X| < 15Pi, which is the usual
20 |               situation.
22 |       Algorithm:
24 |       1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
26 |       2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
27 |               k = N mod 2, so in particular, k = 0 or 1.
29 |       3. If k is odd, go to 5.
31 |       4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
32 |               rational function U/V where
33 |               U = r + r*s*(P1 + s*(P2 + s*P3)), and
34 |               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.
35 |               Exit.
37 |       4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
38 |               rational function U/V where
39 |               U = r + r*s*(P1 + s*(P2 + s*P3)), and
40 |               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
41 |               -Cot(r) = -V/U. Exit.
43 |       6. If |X| > 1, go to 8.
45 |       7. (|X|<2**(-40)) Tan(X) = X. Exit.
47 |       8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
50 |               Copyright (C) Motorola, Inc. 1990
51 |                       All Rights Reserved
53 |       THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
54 |       The copyright notice above does not evidence any
55 |       actual or intended publication of such source code.
57 |STAN   idnt    2,1 | Motorola 040 Floating Point Software Package
59         |section        8
61 #include "fpsp.h"
63 BOUNDS1:        .long 0x3FD78000,0x4004BC7E
64 TWOBYPI:        .long 0x3FE45F30,0x6DC9C883
66 TANQ4:  .long 0x3EA0B759,0xF50F8688
67 TANP3:  .long 0xBEF2BAA5,0xA8924F04
69 TANQ3:  .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
71 TANP2:  .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
73 TANQ2:  .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
75 TANP1:  .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
77 TANQ1:  .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
79 INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
81 TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
82 TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
84 |--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
85 |--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
86 |--MOST 69 BITS LONG.
87         .global PITBL
88 PITBL:
89   .long  0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
90   .long  0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
91   .long  0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
92   .long  0xC0040000,0xB6365E22,0xEE46F000,0x21480000
93   .long  0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
94   .long  0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
95   .long  0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
96   .long  0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
97   .long  0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
98   .long  0xC0040000,0x90836524,0x88034B96,0x20B00000
99   .long  0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
100   .long  0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
101   .long  0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
102   .long  0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
103   .long  0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
104   .long  0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
105   .long  0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
106   .long  0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
107   .long  0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
108   .long  0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
109   .long  0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
110   .long  0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
111   .long  0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
112   .long  0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
113   .long  0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
114   .long  0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
115   .long  0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
116   .long  0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
117   .long  0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
118   .long  0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
119   .long  0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
120   .long  0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
121   .long  0x00000000,0x00000000,0x00000000,0x00000000
122   .long  0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
123   .long  0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
124   .long  0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
125   .long  0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
126   .long  0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
127   .long  0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
128   .long  0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
129   .long  0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
130   .long  0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
131   .long  0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
132   .long  0x40030000,0x8A3AE64F,0x76F80584,0x21080000
133   .long  0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
134   .long  0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
135   .long  0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
136   .long  0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
137   .long  0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
138   .long  0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
139   .long  0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
140   .long  0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
141   .long  0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
142   .long  0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
143   .long  0x40040000,0x8A3AE64F,0x76F80584,0x21880000
144   .long  0x40040000,0x90836524,0x88034B96,0xA0B00000
145   .long  0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
146   .long  0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
147   .long  0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
148   .long  0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
149   .long  0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
150   .long  0x40040000,0xB6365E22,0xEE46F000,0xA1480000
151   .long  0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
152   .long  0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
153   .long  0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
155         .set    INARG,FP_SCR4
157         .set    TWOTO63,L_SCR1
158         .set    ENDFLAG,L_SCR2
159         .set    N,L_SCR3
161         | xref  t_frcinx
162         |xref   t_extdnrm
164         .global stand
165 stand:
166 |--TAN(X) = X FOR DENORMALIZED X
168         bra             t_extdnrm
170         .global stan
171 stan:
172         fmovex          (%a0),%fp0      | ...LOAD INPUT
174         movel           (%a0),%d0
175         movew           4(%a0),%d0
176         andil           #0x7FFFFFFF,%d0
178         cmpil           #0x3FD78000,%d0         | ...|X| >= 2**(-40)?
179         bges            TANOK1
180         bra             TANSM
181 TANOK1:
182         cmpil           #0x4004BC7E,%d0         | ...|X| < 15 PI?
183         blts            TANMAIN
184         bra             REDUCEX
187 TANMAIN:
188 |--THIS IS THE USUAL CASE, |X| <= 15 PI.
189 |--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
190         fmovex          %fp0,%fp1
191         fmuld           TWOBYPI,%fp1    | ...X*2/PI
193 |--HIDE THE NEXT TWO INSTRUCTIONS
194         leal            PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32
196 |--FP1 IS NOW READY
197         fmovel          %fp1,%d0                | ...CONVERT TO INTEGER
199         asll            #4,%d0
200         addal           %d0,%a1         | ...ADDRESS N*PIBY2 IN Y1, Y2
202         fsubx           (%a1)+,%fp0     | ...X-Y1
203 |--HIDE THE NEXT ONE
205         fsubs           (%a1),%fp0      | ...FP0 IS R = (X-Y1)-Y2
207         rorl            #5,%d0
208         andil           #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0
210 TANCONT:
212         cmpil           #0,%d0
213         blt             NODD
215         fmovex          %fp0,%fp1
216         fmulx           %fp1,%fp1               | ...S = R*R
218         fmoved          TANQ4,%fp3
219         fmoved          TANP3,%fp2
221         fmulx           %fp1,%fp3               | ...SQ4
222         fmulx           %fp1,%fp2               | ...SP3
224         faddd           TANQ3,%fp3      | ...Q3+SQ4
225         faddx           TANP2,%fp2      | ...P2+SP3
227         fmulx           %fp1,%fp3               | ...S(Q3+SQ4)
228         fmulx           %fp1,%fp2               | ...S(P2+SP3)
230         faddx           TANQ2,%fp3      | ...Q2+S(Q3+SQ4)
231         faddx           TANP1,%fp2      | ...P1+S(P2+SP3)
233         fmulx           %fp1,%fp3               | ...S(Q2+S(Q3+SQ4))
234         fmulx           %fp1,%fp2               | ...S(P1+S(P2+SP3))
236         faddx           TANQ1,%fp3      | ...Q1+S(Q2+S(Q3+SQ4))
237         fmulx           %fp0,%fp2               | ...RS(P1+S(P2+SP3))
239         fmulx           %fp3,%fp1               | ...S(Q1+S(Q2+S(Q3+SQ4)))
242         faddx           %fp2,%fp0               | ...R+RS(P1+S(P2+SP3))
245         fadds           #0x3F800000,%fp1        | ...1+S(Q1+...)
247         fmovel          %d1,%fpcr               |restore users exceptions
248         fdivx           %fp1,%fp0               |last inst - possible exception set
250         bra             t_frcinx
252 NODD:
253         fmovex          %fp0,%fp1
254         fmulx           %fp0,%fp0               | ...S = R*R
256         fmoved          TANQ4,%fp3
257         fmoved          TANP3,%fp2
259         fmulx           %fp0,%fp3               | ...SQ4
260         fmulx           %fp0,%fp2               | ...SP3
262         faddd           TANQ3,%fp3      | ...Q3+SQ4
263         faddx           TANP2,%fp2      | ...P2+SP3
265         fmulx           %fp0,%fp3               | ...S(Q3+SQ4)
266         fmulx           %fp0,%fp2               | ...S(P2+SP3)
268         faddx           TANQ2,%fp3      | ...Q2+S(Q3+SQ4)
269         faddx           TANP1,%fp2      | ...P1+S(P2+SP3)
271         fmulx           %fp0,%fp3               | ...S(Q2+S(Q3+SQ4))
272         fmulx           %fp0,%fp2               | ...S(P1+S(P2+SP3))
274         faddx           TANQ1,%fp3      | ...Q1+S(Q2+S(Q3+SQ4))
275         fmulx           %fp1,%fp2               | ...RS(P1+S(P2+SP3))
277         fmulx           %fp3,%fp0               | ...S(Q1+S(Q2+S(Q3+SQ4)))
280         faddx           %fp2,%fp1               | ...R+RS(P1+S(P2+SP3))
281         fadds           #0x3F800000,%fp0        | ...1+S(Q1+...)
284         fmovex          %fp1,-(%sp)
285         eoril           #0x80000000,(%sp)
287         fmovel          %d1,%fpcr               |restore users exceptions
288         fdivx           (%sp)+,%fp0     |last inst - possible exception set
290         bra             t_frcinx
292 TANBORS:
293 |--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
294 |--IF |X| < 2**(-40), RETURN X OR 1.
295         cmpil           #0x3FFF8000,%d0
296         bgts            REDUCEX
298 TANSM:
300         fmovex          %fp0,-(%sp)
301         fmovel          %d1,%fpcr                |restore users exceptions
302         fmovex          (%sp)+,%fp0     |last inst - possible exception set
304         bra             t_frcinx
307 REDUCEX:
308 |--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
309 |--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
310 |--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
312         fmovemx %fp2-%fp5,-(%a7)        | ...save FP2 through FP5
313         movel           %d2,-(%a7)
314         fmoves         #0x00000000,%fp1
316 |--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
317 |--there is a danger of unwanted overflow in first LOOP iteration.  In this
318 |--case, reduce argument by one remainder step to make subsequent reduction
319 |--safe.
320         cmpil   #0x7ffeffff,%d0         |is argument dangerously large?
321         bnes    LOOP
322         movel   #0x7ffe0000,FP_SCR2(%a6)        |yes
323 |                                       ;create 2**16383*PI/2
324         movel   #0xc90fdaa2,FP_SCR2+4(%a6)
325         clrl    FP_SCR2+8(%a6)
326         ftstx   %fp0                    |test sign of argument
327         movel   #0x7fdc0000,FP_SCR3(%a6)        |create low half of 2**16383*
328 |                                       ;PI/2 at FP_SCR3
329         movel   #0x85a308d3,FP_SCR3+4(%a6)
330         clrl   FP_SCR3+8(%a6)
331         fblt    red_neg
332         orw     #0x8000,FP_SCR2(%a6)    |positive arg
333         orw     #0x8000,FP_SCR3(%a6)
334 red_neg:
335         faddx  FP_SCR2(%a6),%fp0                |high part of reduction is exact
336         fmovex  %fp0,%fp1               |save high result in fp1
337         faddx  FP_SCR3(%a6),%fp0                |low part of reduction
338         fsubx  %fp0,%fp1                        |determine low component of result
339         faddx  FP_SCR3(%a6),%fp1                |fp0/fp1 are reduced argument.
341 |--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
342 |--integer quotient will be stored in N
343 |--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)
345 LOOP:
346         fmovex          %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2
347         movew           INARG(%a6),%d0
348         movel          %d0,%a1          | ...save a copy of D0
349         andil           #0x00007FFF,%d0
350         subil           #0x00003FFF,%d0 | ...D0 IS K
351         cmpil           #28,%d0
352         bles            LASTLOOP
353 CONTLOOP:
354         subil           #27,%d0  | ...D0 IS L := K-27
355         movel           #0,ENDFLAG(%a6)
356         bras            WORK
357 LASTLOOP:
358         clrl            %d0             | ...D0 IS L := 0
359         movel           #1,ENDFLAG(%a6)
361 WORK:
362 |--FIND THE REMAINDER OF (R,r) W.R.T.   2**L * (PI/2). L IS SO CHOSEN
363 |--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
365 |--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
366 |--2**L * (PIby2_1), 2**L * (PIby2_2)
368         movel           #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI
369         subl            %d0,%d2         | ...BIASED EXPO OF 2**(-L)*(2/PI)
371         movel           #0xA2F9836E,FP_SCR1+4(%a6)
372         movel           #0x4E44152A,FP_SCR1+8(%a6)
373         movew           %d2,FP_SCR1(%a6)        | ...FP_SCR1 is 2**(-L)*(2/PI)
375         fmovex          %fp0,%fp2
376         fmulx           FP_SCR1(%a6),%fp2
377 |--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
378 |--FLOATING POINT FORMAT, THE TWO FMOVE'S       FMOVE.L FP <--> N
379 |--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
380 |--(SIGN(INARG)*2**63   +       FP2) - SIGN(INARG)*2**63 WILL GIVE
381 |--US THE DESIRED VALUE IN FLOATING POINT.
383 |--HIDE SIX CYCLES OF INSTRUCTION
384         movel           %a1,%d2
385         swap            %d2
386         andil           #0x80000000,%d2
387         oril            #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL
388         movel           %d2,TWOTO63(%a6)
390         movel           %d0,%d2
391         addil           #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2)
393 |--FP2 IS READY
394         fadds           TWOTO63(%a6),%fp2       | ...THE FRACTIONAL PART OF FP1 IS ROUNDED
396 |--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1  and  2**(L)*Piby2_2
397         movew           %d2,FP_SCR2(%a6)
398         clrw           FP_SCR2+2(%a6)
399         movel           #0xC90FDAA2,FP_SCR2+4(%a6)
400         clrl            FP_SCR2+8(%a6)          | ...FP_SCR2 is  2**(L) * Piby2_1
402 |--FP2 IS READY
403         fsubs           TWOTO63(%a6),%fp2               | ...FP2 is N
405         addil           #0x00003FDD,%d0
406         movew           %d0,FP_SCR3(%a6)
407         clrw           FP_SCR3+2(%a6)
408         movel           #0x85A308D3,FP_SCR3+4(%a6)
409         clrl            FP_SCR3+8(%a6)          | ...FP_SCR3 is 2**(L) * Piby2_2
411         movel           ENDFLAG(%a6),%d0
413 |--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
414 |--P2 = 2**(L) * Piby2_2
415         fmovex          %fp2,%fp4
416         fmulx           FP_SCR2(%a6),%fp4               | ...W = N*P1
417         fmovex          %fp2,%fp5
418         fmulx           FP_SCR3(%a6),%fp5               | ...w = N*P2
419         fmovex          %fp4,%fp3
420 |--we want P+p = W+w  but  |p| <= half ulp of P
421 |--Then, we need to compute  A := R-P   and  a := r-p
422         faddx           %fp5,%fp3                       | ...FP3 is P
423         fsubx           %fp3,%fp4                       | ...W-P
425         fsubx           %fp3,%fp0                       | ...FP0 is A := R - P
426         faddx           %fp5,%fp4                       | ...FP4 is p = (W-P)+w
428         fmovex          %fp0,%fp3                       | ...FP3 A
429         fsubx           %fp4,%fp1                       | ...FP1 is a := r - p
431 |--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
432 |--|r| <= half ulp of R.
433         faddx           %fp1,%fp0                       | ...FP0 is R := A+a
434 |--No need to calculate r if this is the last loop
435         cmpil           #0,%d0
436         bgt             RESTORE
438 |--Need to calculate r
439         fsubx           %fp0,%fp3                       | ...A-R
440         faddx           %fp3,%fp1                       | ...FP1 is r := (A-R)+a
441         bra             LOOP
443 RESTORE:
444         fmovel          %fp2,N(%a6)
445         movel           (%a7)+,%d2
446         fmovemx (%a7)+,%fp2-%fp5
449         movel           N(%a6),%d0
450         rorl            #1,%d0
453         bra             TANCONT
455         |end