Automatic merge of rsync://rsync.kernel.org/pub/scm/linux/kernel/git/gregkh/driver...
[linux-2.6/verdex.git] / arch / m68k / fpsp040 / setox.S
blob0aa75f9bf7d1872f0b47ffa2b64efe585f26cb98
2 |       setox.sa 3.1 12/10/90
4 |       The entry point setox computes the exponential of a value.
5 |       setoxd does the same except the input value is a denormalized
6 |       number. setoxm1 computes exp(X)-1, and setoxm1d computes
7 |       exp(X)-1 for denormalized X.
9 |       INPUT
10 |       -----
11 |       Double-extended value in memory location pointed to by address
12 |       register a0.
14 |       OUTPUT
15 |       ------
16 |       exp(X) or exp(X)-1 returned in floating-point register fp0.
18 |       ACCURACY and MONOTONICITY
19 |       -------------------------
20 |       The returned result is within 0.85 ulps in 64 significant bit, i.e.
21 |       within 0.5001 ulp to 53 bits if the result is subsequently rounded
22 |       to double precision. The result is provably monotonic in double
23 |       precision.
25 |       SPEED
26 |       -----
27 |       Two timings are measured, both in the copy-back mode. The
28 |       first one is measured when the function is invoked the first time
29 |       (so the instructions and data are not in cache), and the
30 |       second one is measured when the function is reinvoked at the same
31 |       input argument.
33 |       The program setox takes approximately 210/190 cycles for input
34 |       argument X whose magnitude is less than 16380 log2, which
35 |       is the usual situation. For the less common arguments,
36 |       depending on their values, the program may run faster or slower --
37 |       but no worse than 10% slower even in the extreme cases.
39 |       The program setoxm1 takes approximately ???/??? cycles for input
40 |       argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
41 |       approximately ???/??? cycles. For the less common arguments,
42 |       depending on their values, the program may run faster or slower --
43 |       but no worse than 10% slower even in the extreme cases.
45 |       ALGORITHM and IMPLEMENTATION NOTES
46 |       ----------------------------------
48 |       setoxd
49 |       ------
50 |       Step 1. Set ans := 1.0
52 |       Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
53 |       Notes:  This will always generate one exception -- inexact.
56 |       setox
57 |       -----
59 |       Step 1. Filter out extreme cases of input argument.
60 |               1.1     If |X| >= 2^(-65), go to Step 1.3.
61 |               1.2     Go to Step 7.
62 |               1.3     If |X| < 16380 log(2), go to Step 2.
63 |               1.4     Go to Step 8.
64 |       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
65 |                To avoid the use of floating-point comparisons, a
66 |                compact representation of |X| is used. This format is a
67 |                32-bit integer, the upper (more significant) 16 bits are
68 |                the sign and biased exponent field of |X|; the lower 16
69 |                bits are the 16 most significant fraction (including the
70 |                explicit bit) bits of |X|. Consequently, the comparisons
71 |                in Steps 1.1 and 1.3 can be performed by integer comparison.
72 |                Note also that the constant 16380 log(2) used in Step 1.3
73 |                is also in the compact form. Thus taking the branch
74 |                to Step 2 guarantees |X| < 16380 log(2). There is no harm
75 |                to have a small number of cases where |X| is less than,
76 |                but close to, 16380 log(2) and the branch to Step 9 is
77 |                taken.
79 |       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
80 |               2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
81 |               2.2     N := round-to-nearest-integer( X * 64/log2 ).
82 |               2.3     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
83 |               2.4     Calculate       M = (N - J)/64; so N = 64M + J.
84 |               2.5     Calculate the address of the stored value of 2^(J/64).
85 |               2.6     Create the value Scale = 2^M.
86 |       Notes:  The calculation in 2.2 is really performed by
88 |                       Z := X * constant
89 |                       N := round-to-nearest-integer(Z)
91 |                where
93 |                       constant := single-precision( 64/log 2 ).
95 |                Using a single-precision constant avoids memory access.
96 |                Another effect of using a single-precision "constant" is
97 |                that the calculated value Z is
99 |                       Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
101 |                This error has to be considered later in Steps 3 and 4.
103 |       Step 3. Calculate X - N*log2/64.
104 |               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
105 |               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
106 |       Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
107 |                the value      -log2/64        to 88 bits of accuracy.
108 |                b) N*L1 is exact because N is no longer than 22 bits and
109 |                L1 is no longer than 24 bits.
110 |                c) The calculation X+N*L1 is also exact due to cancellation.
111 |                Thus, R is practically X+N(L1+L2) to full 64 bits.
112 |                d) It is important to estimate how large can |R| be after
113 |                Step 3.2.
115 |                       N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
116 |                       X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5
117 |                       X*64/log2 - N   =       f - eps*X 64/log2
118 |                       X - N*log2/64   =       f*log2/64 - eps*X
121 |                Now |X| <= 16446 log2, thus
123 |                       |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
124 |                                       <= 0.57 log2/64.
125 |                This bound will be used in Step 4.
127 |       Step 4. Approximate exp(R)-1 by a polynomial
128 |                       p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
129 |       Notes:  a) In order to reduce memory access, the coefficients are
130 |                made as "short" as possible: A1 (which is 1/2), A4 and A5
131 |                are single precision; A2 and A3 are double precision.
132 |                b) Even with the restrictions above,
133 |                       |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
134 |                Note that 0.0062 is slightly bigger than 0.57 log2/64.
135 |                c) To fully utilize the pipeline, p is separated into
136 |                two independent pieces of roughly equal complexities
137 |                       p = [ R + R*S*(A2 + S*A4) ]     +
138 |                               [ S*(A1 + S*(A3 + S*A5)) ]
139 |                where S = R*R.
141 |       Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
142 |                               ans := T + ( T*p + t)
143 |                where T and t are the stored values for 2^(J/64).
144 |       Notes:  2^(J/64) is stored as T and t where T+t approximates
145 |                2^(J/64) to roughly 85 bits; T is in extended precision
146 |                and t is in single precision. Note also that T is rounded
147 |                to 62 bits so that the last two bits of T are zero. The
148 |                reason for such a special form is that T-1, T-2, and T-8
149 |                will all be exact --- a property that will give much
150 |                more accurate computation of the function EXPM1.
152 |       Step 6. Reconstruction of exp(X)
153 |                       exp(X) = 2^M * 2^(J/64) * exp(R).
154 |               6.1     If AdjFlag = 0, go to 6.3
155 |               6.2     ans := ans * AdjScale
156 |               6.3     Restore the user FPCR
157 |               6.4     Return ans := ans * Scale. Exit.
158 |       Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
159 |                |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
160 |                neither overflow nor underflow. If AdjFlag = 1, that
161 |                means that
162 |                       X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
163 |                Hence, exp(X) may overflow or underflow or neither.
164 |                When that is the case, AdjScale = 2^(M1) where M1 is
165 |                approximately M. Thus 6.2 will never cause over/underflow.
166 |                Possible exception in 6.4 is overflow or underflow.
167 |                The inexact exception is not generated in 6.4. Although
168 |                one can argue that the inexact flag should always be
169 |                raised, to simulate that exception cost to much than the
170 |                flag is worth in practical uses.
172 |       Step 7. Return 1 + X.
173 |               7.1     ans := X
174 |               7.2     Restore user FPCR.
175 |               7.3     Return ans := 1 + ans. Exit
176 |       Notes:  For non-zero X, the inexact exception will always be
177 |                raised by 7.3. That is the only exception raised by 7.3.
178 |                Note also that we use the FMOVEM instruction to move X
179 |                in Step 7.1 to avoid unnecessary trapping. (Although
180 |                the FMOVEM may not seem relevant since X is normalized,
181 |                the precaution will be useful in the library version of
182 |                this code where the separate entry for denormalized inputs
183 |                will be done away with.)
185 |       Step 8. Handle exp(X) where |X| >= 16380log2.
186 |               8.1     If |X| > 16480 log2, go to Step 9.
187 |               (mimic 2.2 - 2.6)
188 |               8.2     N := round-to-integer( X * 64/log2 )
189 |               8.3     Calculate J = N mod 64, J = 0,1,...,63
190 |               8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
191 |               8.5     Calculate the address of the stored value 2^(J/64).
192 |               8.6     Create the values Scale = 2^M, AdjScale = 2^M1.
193 |               8.7     Go to Step 3.
194 |       Notes:  Refer to notes for 2.2 - 2.6.
196 |       Step 9. Handle exp(X), |X| > 16480 log2.
197 |               9.1     If X < 0, go to 9.3
198 |               9.2     ans := Huge, go to 9.4
199 |               9.3     ans := Tiny.
200 |               9.4     Restore user FPCR.
201 |               9.5     Return ans := ans * ans. Exit.
202 |       Notes:  Exp(X) will surely overflow or underflow, depending on
203 |                X's sign. "Huge" and "Tiny" are respectively large/tiny
204 |                extended-precision numbers whose square over/underflow
205 |                with an inexact result. Thus, 9.5 always raises the
206 |                inexact together with either overflow or underflow.
209 |       setoxm1d
210 |       --------
212 |       Step 1. Set ans := 0
214 |       Step 2. Return  ans := X + ans. Exit.
215 |       Notes:  This will return X with the appropriate rounding
216 |                precision prescribed by the user FPCR.
218 |       setoxm1
219 |       -------
221 |       Step 1. Check |X|
222 |               1.1     If |X| >= 1/4, go to Step 1.3.
223 |               1.2     Go to Step 7.
224 |               1.3     If |X| < 70 log(2), go to Step 2.
225 |               1.4     Go to Step 10.
226 |       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
227 |                However, it is conceivable |X| can be small very often
228 |                because EXPM1 is intended to evaluate exp(X)-1 accurately
229 |                when |X| is small. For further details on the comparisons,
230 |                see the notes on Step 1 of setox.
232 |       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
233 |               2.1     N := round-to-nearest-integer( X * 64/log2 ).
234 |               2.2     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
235 |               2.3     Calculate       M = (N - J)/64; so N = 64M + J.
236 |               2.4     Calculate the address of the stored value of 2^(J/64).
237 |               2.5     Create the values Sc = 2^M and OnebySc := -2^(-M).
238 |       Notes:  See the notes on Step 2 of setox.
240 |       Step 3. Calculate X - N*log2/64.
241 |               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
242 |               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
243 |       Notes:  Applying the analysis of Step 3 of setox in this case
244 |                shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
245 |                this case).
247 |       Step 4. Approximate exp(R)-1 by a polynomial
248 |                       p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
249 |       Notes:  a) In order to reduce memory access, the coefficients are
250 |                made as "short" as possible: A1 (which is 1/2), A5 and A6
251 |                are single precision; A2, A3 and A4 are double precision.
252 |                b) Even with the restriction above,
253 |                       |p - (exp(R)-1)| <      |R| * 2^(-72.7)
254 |                for all |R| <= 0.0055.
255 |                c) To fully utilize the pipeline, p is separated into
256 |                two independent pieces of roughly equal complexity
257 |                       p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +
258 |                               [ R + S*(A1 + S*(A3 + S*A5)) ]
259 |                where S = R*R.
261 |       Step 5. Compute 2^(J/64)*p by
262 |                               p := T*p
263 |                where T and t are the stored values for 2^(J/64).
264 |       Notes:  2^(J/64) is stored as T and t where T+t approximates
265 |                2^(J/64) to roughly 85 bits; T is in extended precision
266 |                and t is in single precision. Note also that T is rounded
267 |                to 62 bits so that the last two bits of T are zero. The
268 |                reason for such a special form is that T-1, T-2, and T-8
269 |                will all be exact --- a property that will be exploited
270 |                in Step 6 below. The total relative error in p is no
271 |                bigger than 2^(-67.7) compared to the final result.
273 |       Step 6. Reconstruction of exp(X)-1
274 |                       exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
275 |               6.1     If M <= 63, go to Step 6.3.
276 |               6.2     ans := T + (p + (t + OnebySc)). Go to 6.6
277 |               6.3     If M >= -3, go to 6.5.
278 |               6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6
279 |               6.5     ans := (T + OnebySc) + (p + t).
280 |               6.6     Restore user FPCR.
281 |               6.7     Return ans := Sc * ans. Exit.
282 |       Notes:  The various arrangements of the expressions give accurate
283 |                evaluations.
285 |       Step 7. exp(X)-1 for |X| < 1/4.
286 |               7.1     If |X| >= 2^(-65), go to Step 9.
287 |               7.2     Go to Step 8.
289 |       Step 8. Calculate exp(X)-1, |X| < 2^(-65).
290 |               8.1     If |X| < 2^(-16312), goto 8.3
291 |               8.2     Restore FPCR; return ans := X - 2^(-16382). Exit.
292 |               8.3     X := X * 2^(140).
293 |               8.4     Restore FPCR; ans := ans - 2^(-16382).
294 |                Return ans := ans*2^(140). Exit
295 |       Notes:  The idea is to return "X - tiny" under the user
296 |                precision and rounding modes. To avoid unnecessary
297 |                inefficiency, we stay away from denormalized numbers the
298 |                best we can. For |X| >= 2^(-16312), the straightforward
299 |                8.2 generates the inexact exception as the case warrants.
301 |       Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
302 |                       p = X + X*X*(B1 + X*(B2 + ... + X*B12))
303 |       Notes:  a) In order to reduce memory access, the coefficients are
304 |                made as "short" as possible: B1 (which is 1/2), B9 to B12
305 |                are single precision; B3 to B8 are double precision; and
306 |                B2 is double extended.
307 |                b) Even with the restriction above,
308 |                       |p - (exp(X)-1)| < |X| 2^(-70.6)
309 |                for all |X| <= 0.251.
310 |                Note that 0.251 is slightly bigger than 1/4.
311 |                c) To fully preserve accuracy, the polynomial is computed
312 |                as     X + ( S*B1 +    Q ) where S = X*X and
313 |                       Q       =       X*S*(B2 + X*(B3 + ... + X*B12))
314 |                d) To fully utilize the pipeline, Q is separated into
315 |                two independent pieces of roughly equal complexity
316 |                       Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
317 |                               [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
319 |       Step 10.        Calculate exp(X)-1 for |X| >= 70 log 2.
320 |               10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
321 |                purposes. Therefore, go to Step 1 of setox.
322 |               10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
323 |                ans := -1
324 |                Restore user FPCR
325 |                Return ans := ans + 2^(-126). Exit.
326 |       Notes:  10.2 will always create an inexact and return -1 + tiny
327 |                in the user rounding precision and mode.
331 |               Copyright (C) Motorola, Inc. 1990
332 |                       All Rights Reserved
334 |       THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
335 |       The copyright notice above does not evidence any
336 |       actual or intended publication of such source code.
338 |setox  idnt    2,1 | Motorola 040 Floating Point Software Package
340         |section        8
342 #include "fpsp.h"
344 L2:     .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
346 EXPA3:  .long   0x3FA55555,0x55554431
347 EXPA2:  .long   0x3FC55555,0x55554018
349 HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
350 TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
352 EM1A4:  .long   0x3F811111,0x11174385
353 EM1A3:  .long   0x3FA55555,0x55554F5A
355 EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
357 EM1B8:  .long   0x3EC71DE3,0xA5774682
358 EM1B7:  .long   0x3EFA01A0,0x19D7CB68
360 EM1B6:  .long   0x3F2A01A0,0x1A019DF3
361 EM1B5:  .long   0x3F56C16C,0x16C170E2
363 EM1B4:  .long   0x3F811111,0x11111111
364 EM1B3:  .long   0x3FA55555,0x55555555
366 EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
367         .long   0x00000000
369 TWO140: .long   0x48B00000,0x00000000
370 TWON140:        .long   0x37300000,0x00000000
372 EXPTBL:
373         .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
374         .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
375         .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
376         .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
377         .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
378         .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
379         .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
380         .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
381         .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
382         .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
383         .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
384         .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
385         .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
386         .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
387         .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
388         .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
389         .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
390         .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
391         .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
392         .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
393         .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
394         .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
395         .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
396         .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
397         .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
398         .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
399         .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
400         .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
401         .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
402         .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
403         .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
404         .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
405         .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
406         .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
407         .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
408         .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
409         .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
410         .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
411         .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
412         .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
413         .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
414         .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
415         .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
416         .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
417         .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
418         .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
419         .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
420         .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
421         .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
422         .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
423         .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
424         .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
425         .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
426         .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
427         .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
428         .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
429         .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
430         .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
431         .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
432         .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
433         .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
434         .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
435         .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
436         .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
438         .set    ADJFLAG,L_SCR2
439         .set    SCALE,FP_SCR1
440         .set    ADJSCALE,FP_SCR2
441         .set    SC,FP_SCR3
442         .set    ONEBYSC,FP_SCR4
444         | xref  t_frcinx
445         |xref   t_extdnrm
446         |xref   t_unfl
447         |xref   t_ovfl
449         .global setoxd
450 setoxd:
451 |--entry point for EXP(X), X is denormalized
452         movel           (%a0),%d0
453         andil           #0x80000000,%d0
454         oril            #0x00800000,%d0         | ...sign(X)*2^(-126)
455         movel           %d0,-(%sp)
456         fmoves          #0x3F800000,%fp0
457         fmovel          %d1,%fpcr
458         fadds           (%sp)+,%fp0
459         bra             t_frcinx
461         .global setox
462 setox:
463 |--entry point for EXP(X), here X is finite, non-zero, and not NaN's
465 |--Step 1.
466         movel           (%a0),%d0        | ...load part of input X
467         andil           #0x7FFF0000,%d0 | ...biased expo. of X
468         cmpil           #0x3FBE0000,%d0 | ...2^(-65)
469         bges            EXPC1           | ...normal case
470         bra             EXPSM
472 EXPC1:
473 |--The case |X| >= 2^(-65)
474         movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
475         cmpil           #0x400CB167,%d0 | ...16380 log2 trunc. 16 bits
476         blts            EXPMAIN  | ...normal case
477         bra             EXPBIG
479 EXPMAIN:
480 |--Step 2.
481 |--This is the normal branch:   2^(-65) <= |X| < 16380 log2.
482         fmovex          (%a0),%fp0      | ...load input from (a0)
484         fmovex          %fp0,%fp1
485         fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
486         fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
487         movel           #0,ADJFLAG(%a6)
488         fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
489         lea             EXPTBL,%a1
490         fmovel          %d0,%fp0                | ...convert to floating-format
492         movel           %d0,L_SCR1(%a6) | ...save N temporarily
493         andil           #0x3F,%d0               | ...D0 is J = N mod 64
494         lsll            #4,%d0
495         addal           %d0,%a1         | ...address of 2^(J/64)
496         movel           L_SCR1(%a6),%d0
497         asrl            #6,%d0          | ...D0 is M
498         addiw           #0x3FFF,%d0     | ...biased expo. of 2^(M)
499         movew           L2,L_SCR1(%a6)  | ...prefetch L2, no need in CB
501 EXPCONT1:
502 |--Step 3.
503 |--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
504 |--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
505         fmovex          %fp0,%fp2
506         fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
507         fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
508         faddx           %fp1,%fp0               | ...X + N*L1
509         faddx           %fp2,%fp0               | ...fp0 is R, reduced arg.
510 |       MOVE.W          #$3FA5,EXPA3    ...load EXPA3 in cache
512 |--Step 4.
513 |--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
514 |-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
515 |--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
516 |--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
518         fmovex          %fp0,%fp1
519         fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
521         fmoves          #0x3AB60B70,%fp2        | ...fp2 IS A5
522 |       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
524         fmulx           %fp1,%fp2               | ...fp2 IS S*A5
525         fmovex          %fp1,%fp3
526         fmuls           #0x3C088895,%fp3        | ...fp3 IS S*A4
528         faddd           EXPA3,%fp2      | ...fp2 IS A3+S*A5
529         faddd           EXPA2,%fp3      | ...fp3 IS A2+S*A4
531         fmulx           %fp1,%fp2               | ...fp2 IS S*(A3+S*A5)
532         movew           %d0,SCALE(%a6)  | ...SCALE is 2^(M) in extended
533         clrw            SCALE+2(%a6)
534         movel           #0x80000000,SCALE+4(%a6)
535         clrl            SCALE+8(%a6)
537         fmulx           %fp1,%fp3               | ...fp3 IS S*(A2+S*A4)
539         fadds           #0x3F000000,%fp2        | ...fp2 IS A1+S*(A3+S*A5)
540         fmulx           %fp0,%fp3               | ...fp3 IS R*S*(A2+S*A4)
542         fmulx           %fp1,%fp2               | ...fp2 IS S*(A1+S*(A3+S*A5))
543         faddx           %fp3,%fp0               | ...fp0 IS R+R*S*(A2+S*A4),
544 |                                       ...fp3 released
546         fmovex          (%a1)+,%fp1     | ...fp1 is lead. pt. of 2^(J/64)
547         faddx           %fp2,%fp0               | ...fp0 is EXP(R) - 1
548 |                                       ...fp2 released
550 |--Step 5
551 |--final reconstruction process
552 |--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
554         fmulx           %fp1,%fp0               | ...2^(J/64)*(Exp(R)-1)
555         fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
556         fadds           (%a1),%fp0      | ...accurate 2^(J/64)
558         faddx           %fp1,%fp0               | ...2^(J/64) + 2^(J/64)*...
559         movel           ADJFLAG(%a6),%d0
561 |--Step 6
562         tstl            %d0
563         beqs            NORMAL
564 ADJUST:
565         fmulx           ADJSCALE(%a6),%fp0
566 NORMAL:
567         fmovel          %d1,%FPCR               | ...restore user FPCR
568         fmulx           SCALE(%a6),%fp0 | ...multiply 2^(M)
569         bra             t_frcinx
571 EXPSM:
572 |--Step 7
573         fmovemx (%a0),%fp0-%fp0 | ...in case X is denormalized
574         fmovel          %d1,%FPCR
575         fadds           #0x3F800000,%fp0        | ...1+X in user mode
576         bra             t_frcinx
578 EXPBIG:
579 |--Step 8
580         cmpil           #0x400CB27C,%d0 | ...16480 log2
581         bgts            EXP2BIG
582 |--Steps 8.2 -- 8.6
583         fmovex          (%a0),%fp0      | ...load input from (a0)
585         fmovex          %fp0,%fp1
586         fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
587         fmovemx  %fp2-%fp2/%fp3,-(%a7)          | ...save fp2
588         movel           #1,ADJFLAG(%a6)
589         fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
590         lea             EXPTBL,%a1
591         fmovel          %d0,%fp0                | ...convert to floating-format
592         movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
593         andil           #0x3F,%d0                | ...D0 is J = N mod 64
594         lsll            #4,%d0
595         addal           %d0,%a1                 | ...address of 2^(J/64)
596         movel           L_SCR1(%a6),%d0
597         asrl            #6,%d0                  | ...D0 is K
598         movel           %d0,L_SCR1(%a6)                 | ...save K temporarily
599         asrl            #1,%d0                  | ...D0 is M1
600         subl            %d0,L_SCR1(%a6)                 | ...a1 is M
601         addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M1)
602         movew           %d0,ADJSCALE(%a6)               | ...ADJSCALE := 2^(M1)
603         clrw            ADJSCALE+2(%a6)
604         movel           #0x80000000,ADJSCALE+4(%a6)
605         clrl            ADJSCALE+8(%a6)
606         movel           L_SCR1(%a6),%d0                 | ...D0 is M
607         addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M)
608         bra             EXPCONT1                | ...go back to Step 3
610 EXP2BIG:
611 |--Step 9
612         fmovel          %d1,%FPCR
613         movel           (%a0),%d0
614         bclrb           #sign_bit,(%a0)         | ...setox always returns positive
615         cmpil           #0,%d0
616         blt             t_unfl
617         bra             t_ovfl
619         .global setoxm1d
620 setoxm1d:
621 |--entry point for EXPM1(X), here X is denormalized
622 |--Step 0.
623         bra             t_extdnrm
626         .global setoxm1
627 setoxm1:
628 |--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
630 |--Step 1.
631 |--Step 1.1
632         movel           (%a0),%d0        | ...load part of input X
633         andil           #0x7FFF0000,%d0 | ...biased expo. of X
634         cmpil           #0x3FFD0000,%d0 | ...1/4
635         bges            EM1CON1  | ...|X| >= 1/4
636         bra             EM1SM
638 EM1CON1:
639 |--Step 1.3
640 |--The case |X| >= 1/4
641         movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
642         cmpil           #0x4004C215,%d0 | ...70log2 rounded up to 16 bits
643         bles            EM1MAIN  | ...1/4 <= |X| <= 70log2
644         bra             EM1BIG
646 EM1MAIN:
647 |--Step 2.
648 |--This is the case:    1/4 <= |X| <= 70 log2.
649         fmovex          (%a0),%fp0      | ...load input from (a0)
651         fmovex          %fp0,%fp1
652         fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
653         fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
654 |       MOVE.W          #$3F81,EM1A4            ...prefetch in CB mode
655         fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
656         lea             EXPTBL,%a1
657         fmovel          %d0,%fp0                | ...convert to floating-format
659         movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
660         andil           #0x3F,%d0                | ...D0 is J = N mod 64
661         lsll            #4,%d0
662         addal           %d0,%a1                 | ...address of 2^(J/64)
663         movel           L_SCR1(%a6),%d0
664         asrl            #6,%d0                  | ...D0 is M
665         movel           %d0,L_SCR1(%a6)                 | ...save a copy of M
666 |       MOVE.W          #$3FDC,L2               ...prefetch L2 in CB mode
668 |--Step 3.
669 |--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
670 |--a0 points to 2^(J/64), D0 and a1 both contain M
671         fmovex          %fp0,%fp2
672         fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
673         fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
674         faddx           %fp1,%fp0        | ...X + N*L1
675         faddx           %fp2,%fp0        | ...fp0 is R, reduced arg.
676 |       MOVE.W          #$3FC5,EM1A2            ...load EM1A2 in cache
677         addiw           #0x3FFF,%d0             | ...D0 is biased expo. of 2^M
679 |--Step 4.
680 |--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
681 |-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
682 |--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
683 |--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
685         fmovex          %fp0,%fp1
686         fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
688         fmoves          #0x3950097B,%fp2        | ...fp2 IS a6
689 |       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
691         fmulx           %fp1,%fp2               | ...fp2 IS S*A6
692         fmovex          %fp1,%fp3
693         fmuls           #0x3AB60B6A,%fp3        | ...fp3 IS S*A5
695         faddd           EM1A4,%fp2      | ...fp2 IS A4+S*A6
696         faddd           EM1A3,%fp3      | ...fp3 IS A3+S*A5
697         movew           %d0,SC(%a6)             | ...SC is 2^(M) in extended
698         clrw            SC+2(%a6)
699         movel           #0x80000000,SC+4(%a6)
700         clrl            SC+8(%a6)
702         fmulx           %fp1,%fp2               | ...fp2 IS S*(A4+S*A6)
703         movel           L_SCR1(%a6),%d0         | ...D0 is      M
704         negw            %d0             | ...D0 is -M
705         fmulx           %fp1,%fp3               | ...fp3 IS S*(A3+S*A5)
706         addiw           #0x3FFF,%d0     | ...biased expo. of 2^(-M)
707         faddd           EM1A2,%fp2      | ...fp2 IS A2+S*(A4+S*A6)
708         fadds           #0x3F000000,%fp3        | ...fp3 IS A1+S*(A3+S*A5)
710         fmulx           %fp1,%fp2               | ...fp2 IS S*(A2+S*(A4+S*A6))
711         oriw            #0x8000,%d0     | ...signed/expo. of -2^(-M)
712         movew           %d0,ONEBYSC(%a6)        | ...OnebySc is -2^(-M)
713         clrw            ONEBYSC+2(%a6)
714         movel           #0x80000000,ONEBYSC+4(%a6)
715         clrl            ONEBYSC+8(%a6)
716         fmulx           %fp3,%fp1               | ...fp1 IS S*(A1+S*(A3+S*A5))
717 |                                       ...fp3 released
719         fmulx           %fp0,%fp2               | ...fp2 IS R*S*(A2+S*(A4+S*A6))
720         faddx           %fp1,%fp0               | ...fp0 IS R+S*(A1+S*(A3+S*A5))
721 |                                       ...fp1 released
723         faddx           %fp2,%fp0               | ...fp0 IS EXP(R)-1
724 |                                       ...fp2 released
725         fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
727 |--Step 5
728 |--Compute 2^(J/64)*p
730         fmulx           (%a1),%fp0      | ...2^(J/64)*(Exp(R)-1)
732 |--Step 6
733 |--Step 6.1
734         movel           L_SCR1(%a6),%d0         | ...retrieve M
735         cmpil           #63,%d0
736         bles            MLE63
737 |--Step 6.2     M >= 64
738         fmoves          12(%a1),%fp1    | ...fp1 is t
739         faddx           ONEBYSC(%a6),%fp1       | ...fp1 is t+OnebySc
740         faddx           %fp1,%fp0               | ...p+(t+OnebySc), fp1 released
741         faddx           (%a1),%fp0      | ...T+(p+(t+OnebySc))
742         bras            EM1SCALE
743 MLE63:
744 |--Step 6.3     M <= 63
745         cmpil           #-3,%d0
746         bges            MGEN3
747 MLTN3:
748 |--Step 6.4     M <= -4
749         fadds           12(%a1),%fp0    | ...p+t
750         faddx           (%a1),%fp0      | ...T+(p+t)
751         faddx           ONEBYSC(%a6),%fp0       | ...OnebySc + (T+(p+t))
752         bras            EM1SCALE
753 MGEN3:
754 |--Step 6.5     -3 <= M <= 63
755         fmovex          (%a1)+,%fp1     | ...fp1 is T
756         fadds           (%a1),%fp0      | ...fp0 is p+t
757         faddx           ONEBYSC(%a6),%fp1       | ...fp1 is T+OnebySc
758         faddx           %fp1,%fp0               | ...(T+OnebySc)+(p+t)
760 EM1SCALE:
761 |--Step 6.6
762         fmovel          %d1,%FPCR
763         fmulx           SC(%a6),%fp0
765         bra             t_frcinx
767 EM1SM:
768 |--Step 7       |X| < 1/4.
769         cmpil           #0x3FBE0000,%d0 | ...2^(-65)
770         bges            EM1POLY
772 EM1TINY:
773 |--Step 8       |X| < 2^(-65)
774         cmpil           #0x00330000,%d0 | ...2^(-16312)
775         blts            EM12TINY
776 |--Step 8.2
777         movel           #0x80010000,SC(%a6)     | ...SC is -2^(-16382)
778         movel           #0x80000000,SC+4(%a6)
779         clrl            SC+8(%a6)
780         fmovex          (%a0),%fp0
781         fmovel          %d1,%FPCR
782         faddx           SC(%a6),%fp0
784         bra             t_frcinx
786 EM12TINY:
787 |--Step 8.3
788         fmovex          (%a0),%fp0
789         fmuld           TWO140,%fp0
790         movel           #0x80010000,SC(%a6)
791         movel           #0x80000000,SC+4(%a6)
792         clrl            SC+8(%a6)
793         faddx           SC(%a6),%fp0
794         fmovel          %d1,%FPCR
795         fmuld           TWON140,%fp0
797         bra             t_frcinx
799 EM1POLY:
800 |--Step 9       exp(X)-1 by a simple polynomial
801         fmovex          (%a0),%fp0      | ...fp0 is X
802         fmulx           %fp0,%fp0               | ...fp0 is S := X*X
803         fmovemx %fp2-%fp2/%fp3,-(%a7)   | ...save fp2
804         fmoves          #0x2F30CAA8,%fp1        | ...fp1 is B12
805         fmulx           %fp0,%fp1               | ...fp1 is S*B12
806         fmoves          #0x310F8290,%fp2        | ...fp2 is B11
807         fadds           #0x32D73220,%fp1        | ...fp1 is B10+S*B12
809         fmulx           %fp0,%fp2               | ...fp2 is S*B11
810         fmulx           %fp0,%fp1               | ...fp1 is S*(B10 + ...
812         fadds           #0x3493F281,%fp2        | ...fp2 is B9+S*...
813         faddd           EM1B8,%fp1      | ...fp1 is B8+S*...
815         fmulx           %fp0,%fp2               | ...fp2 is S*(B9+...
816         fmulx           %fp0,%fp1               | ...fp1 is S*(B8+...
818         faddd           EM1B7,%fp2      | ...fp2 is B7+S*...
819         faddd           EM1B6,%fp1      | ...fp1 is B6+S*...
821         fmulx           %fp0,%fp2               | ...fp2 is S*(B7+...
822         fmulx           %fp0,%fp1               | ...fp1 is S*(B6+...
824         faddd           EM1B5,%fp2      | ...fp2 is B5+S*...
825         faddd           EM1B4,%fp1      | ...fp1 is B4+S*...
827         fmulx           %fp0,%fp2               | ...fp2 is S*(B5+...
828         fmulx           %fp0,%fp1               | ...fp1 is S*(B4+...
830         faddd           EM1B3,%fp2      | ...fp2 is B3+S*...
831         faddx           EM1B2,%fp1      | ...fp1 is B2+S*...
833         fmulx           %fp0,%fp2               | ...fp2 is S*(B3+...
834         fmulx           %fp0,%fp1               | ...fp1 is S*(B2+...
836         fmulx           %fp0,%fp2               | ...fp2 is S*S*(B3+...)
837         fmulx           (%a0),%fp1      | ...fp1 is X*S*(B2...
839         fmuls           #0x3F000000,%fp0        | ...fp0 is S*B1
840         faddx           %fp2,%fp1               | ...fp1 is Q
841 |                                       ...fp2 released
843         fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
845         faddx           %fp1,%fp0               | ...fp0 is S*B1+Q
846 |                                       ...fp1 released
848         fmovel          %d1,%FPCR
849         faddx           (%a0),%fp0
851         bra             t_frcinx
853 EM1BIG:
854 |--Step 10      |X| > 70 log2
855         movel           (%a0),%d0
856         cmpil           #0,%d0
857         bgt             EXPC1
858 |--Step 10.2
859         fmoves          #0xBF800000,%fp0        | ...fp0 is -1
860         fmovel          %d1,%FPCR
861         fadds           #0x00800000,%fp0        | ...-1 + 2^(-126)
863         bra             t_frcinx
865         |end