2 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
4 * SPDX-License-Identifier: BSD-3-Clause
8 #include "pico/asm_helper.S"
10 pico_default_asm_setup
12 .macro float_section name
14 .section RAM_SECTION_NAME(\name), "ax"
16 .section SECTION_NAME(\name), "ax"
20 .macro float_wrapper_section func
21 float_section WRAPPER_FUNC_NAME(\func)
24 @ load a 32-bit constant n into register rx
27 movt \rx,#((\n)>>16)&0xffff
30 float_section frrcore_v
32 // 1/2π to plenty of accuracy
33 .long 0 @ this allows values of e down to -32
36 .long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410
40 @ r1 exponent e>=-32, typically offset by +9
43 @ r6 range reduced result in revolutions Q32
48 asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32
49 add r2,r2,r3,lsl#2 @ p
50 and r3,r1,#31 @ s=e%32
55 @ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
56 ldr r5,[r2,#12] @ a0=p[3]
57 umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo
59 ldr r5,[r2,#8] @ a1=p[2]
60 mla r6,r5,r4,r6 @ a1*u1 lo, discard hi
61 umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo
63 ldr r5,[r2,#4] @ a2=p[1]
64 mla r6,r5,r3,r6 @ r0+=a2*u0
67 float_wrapper_section expf
70 @ soft float version, via 2^x
82 movlong r3,0x5c551d95 @ 1/log(2) Q30
83 smull r0,r1,r0,r3 @ Q54
87 bfi r2,r1,#10,#17 @ ε Q32
89 vcvt.f32.u32 s0,s0,#32
108 @ adc r0,#0 @ rounding not needed
111 3: @ risk of overflow, Inf,NaN
112 movlong r2,0x42B17218
114 blo 10b @ in range after all
121 movlong r0,0x7f800000 @ return +Inf
124 1: @ x<0, r1=0xffffffXX where XX is biased exponent
126 bge 5f @ risk of underflow, -Inf, -NaN?
134 adc r0,r0,#0 @ rounding
144 movlong r2,0xC2AEAC4F
157 .float 0.693147181 @ log2
158 .float 0.240226507 @ log²2/2
159 .float 0.055504109 @ log³2/6
161 exptab3: @ pow(2,[0..31]/32)
195 float_wrapper_section logf
198 cmp r0,#0x7f800000 @ catch Inf, NaN, -ve
200 asrs r1,r0,#23 @ get exponent; C from here is preserved...
203 bfi r0,r2,#23,#9 @ fix implied 1
204 it cc @ 50% ... to here...
205 lslcc r0,#1 @ this plus sbc below means we work relative to nearby power of 2
208 @ 0x00c00000 ≤ r0 < 0x017fffff
209 adr r3,logtab3-24*8+4
210 add r3,r3,r0,lsr#16 @ look up r0>>19 rounded, preserving flags
215 vmov s0,s1,r3,r0 @ s0=-log u, s1=ε
217 vcvt.f32.s32 s1,s1,#32
218 vmul.f32 s2,s1,s1 @ power series in ε
219 sbc r1,r1,#0x7e @ ... and here
222 vmul.f32 s4,s2,s2 @ to ε⁴
223 @ movlong r2,0x58b90bfc @ log 2 Q31, more accurate than we deserve
228 smmulr r1,r1,r2 @ Q22
232 vcvt.f32.s32 s7,s7,#22
235 vadd.f32 s0,s0,s1 @ log ε - log u
236 vadd.f32 s0,s0,s7 @ e log 2 + log ε - log u
244 cmp r0,#0x80800000 @ -0?
246 cmp r0,#0xff800000 @ -NaN/-Inf?
250 movlong r0,0xffc00000
254 movlong r0,0xff800000
260 .float 0.333333333333333
265 @ u=64/[48:2:96]; u Q8, -log u F32
266 .word 0x0155,0xbe92cb01 @ 00003e9b..00004145
267 .word 0x0148,0xbe7dc8c3 @ 00003ec8..00004158
268 .word 0x013b,0xbe545f68 @ 00003ec1..00004137
269 .word 0x012f,0xbe2c99c7 @ 00003ebb..00004119
270 .word 0x0125,0xbe0a3c2c @ 00003ef3..0000413d
271 .word 0x011a,0xbdc61a2f @ 00003eca..000040fe
272 .word 0x0111,0xbd83acc2 @ 00003eeb..0000410d
273 .word 0x0108,0xbcfc14d8 @ 00003ee8..000040f8
274 .word 0x0100,0x00000000 @ 00003f00..00004100
275 .word 0x00f8,0x3d020aec @ 00003ef8..000040e8
276 .word 0x00f1,0x3d77518e @ 00003f13..000040f5
277 .word 0x00ea,0x3db80698 @ 00003f12..000040e6
278 .word 0x00e4,0x3ded393b @ 00003f3c..00004104
279 .word 0x00dd,0x3e168b08 @ 00003f05..000040bf
280 .word 0x00d8,0x3e2dfa03 @ 00003f48..000040f8
281 .word 0x00d2,0x3e4ad2d7 @ 00003f2a..000040ce
282 .word 0x00cd,0x3e637fde @ 00003f43..000040dd
283 .word 0x00c8,0x3e7cc8e3 @ 00003f48..000040d8
284 .word 0x00c3,0x3e8b5ae6 @ 00003f39..000040bf
285 .word 0x00bf,0x3e95f784 @ 00003f6b..000040e9
286 .word 0x00ba,0x3ea38c6e @ 00003f36..000040aa
287 .word 0x00b6,0x3eaeadef @ 00003f46..000040b2
288 .word 0x00b2,0x3eba0ec4 @ 00003f46..000040aa
289 .word 0x00ae,0x3ec5b1cd @ 00003f36..00004092
290 .word 0x00ab,0x3ece995f @ 00003f75..000040cb
292 float_wrapper_section fsin_fcos
296 bne 1f @ NaN? return it
297 orrs r0,r0,#0x80000000 @ Inf: make a NaN
299 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
302 @ heavy-duty range reduction
303 @ here x≥256, -e in r1
307 bfi r0,r3,#23,#9 @ insert implied 1 in mantissa, clear sign
309 mov r7,#0x7e @ this will be the exponent of the reduced angle - 1
312 @ here r6 is revolutions Q32
313 lsrs r3,r6,#30 @ quadrant count
314 adcs r3,r3,#0 @ rounded
316 subs r6,r6,r3,lsl#30 @ reduced angle/2π Q32 -.125≤x<+.125
317 @ comment out from here...
320 rsbcs r2,r2,#0 @ absolute value
321 cmp r2,#1<<28 @ big enough for accuracy?
323 @ ... to here for slightly better accuracy
325 adds r1,r1,#2 @ try again with increased exponent
327 eors r2,r6,r6,asr#32 @ absolute value
329 cmp r2,#1<<28 @ big enough yet?
336 ldr r4,=0xC90FDAA2 @ 2π Q29
337 umull r2,r4,r2,r4 @ r4 has reduced angle Q34+Q29-Q32=Q31
338 @ add r4,r4,r2,lsr#31
339 clz r2,r4 @ normalise
343 adc r0,r4,r2,lsl#23 @ with rounding
344 lsrs r1,r0,#23 @ re-extract exponent as there may have been a carry into it
345 rsbs r1,r1,#0x7f @ prepare exponent for re-entry
347 add r3,r0,r6,lsl#31 @ apply sign of reduced angle
349 b 5f @ re-enter with no risk of looping
353 @ light-duty range reduction
357 @ r12: quadrant count
358 @ required result is sin(r0+r12*π/2)
362 bics r2,r0,r12,lsl#31 @ negative argument,doing sin -> +2 quadrants
365 bic r0,r0,#0x80000000 @ make positive: original sign is now captured in quadrant count in r12
367 @ this may not actually be faster than doing it in integer registers
371 @ vmul.f32 s4,s4,s0 @ this accurate calculation of the quadrant count does not seem necessary
373 vmul.f32 s4,s5,s0 @ this is BALGE
375 vrintn.f32.f32 s4,s4 @ round to quadrant count: x<256 so count≤163
376 ble 40b @ then do heavy-duty range reduction
379 vmov r3,s0 @ reduced angle
381 ubfx r2,r3,#23,#8 @ get exponent
383 blo 40b @ very small result? use heavy-duty reduction to get a more accurate answer
384 rsbs r1,r2,#0x7f @ ready for re-entry
385 vmov r2,s3 @ integer quadrant count
387 @ prepare to re-enter with no risk of looping
391 @ 2/π=0.A2F9836E4E441529FC...
392 .word 0x3f22f983 @ 2/π
393 @ π/2=1.921FB54442D1846989...
394 .word 0xb695777a,0x3fc91000 @ these two add up to π/2 with error ~1.6e-13
400 cmp r1,#0xff @ Inf/NaN?
402 bl cosf_entry @ this will exit via 1f or 2f...
405 @ here C is still set from lsrs r12,r12,#1
408 eor r12,r12,r1,lsr#31
409 @ this is fsc_costail:
410 @ here calculate cos φ+ε = cosθ
411 vmul.f32 s5,s7,s1 @ sinφ sinε
412 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
413 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
420 eor r12,r12,r1,lsr#31
421 @ this is fsc_sintail:
422 @ here calculate sin φ+ε = sinθ
423 vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
424 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
425 eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
426 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
429 str r0,[r2] @ save cos result
432 @ sincos of Inf or NaN
436 bne 1f @ NaN? return it
437 orrs r0,r0,#0x80000000 @ Inf: make a NaN
439 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
440 str r0,[r2] @ both sin and cos results
445 @ r12b1..0: quadrant count
452 movs r12,#1 @ cos -> +1 quadrant
454 ubfx r1,r0,#23,#8 @ get exponent
455 cbz r1,20f @ 0/denormal?
458 bls 10b @ argument ≥1? needs reduction; also Inf/NaN handling
459 bic r3,r0,r12,lsl#31 @ this would mess up NaNs so do it here
461 @ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2
462 bic r0,r3,#0x80000000 @ this would mess up NaNs so do it here
464 ubfx r0,r0,#18,#5 @ extract top of mantissa
465 adds r0,r0,#32 @ insert implied 1
466 lsrs r1,r0,r1 @ to fixed point Q5
468 adcs r1,r1,#0 @ rounding
470 add r2,r2,r1,lsl#2 @ 12 bytes per entry
473 vldmia r2,{s5-s7} @ φ, cosφ, sinφ
474 vsub.f32 s1,s0,s5 @ ε
475 vmul.f32 s2,s1,s1 @ ε²
476 lsrs r12,r12,#1 @ computing cosine?
477 vmul.f32 s3,s2,s1 @ ε³
480 vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
481 vmul.f32 s3,s3,s9 @ ε³/3!
482 vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
489 @ r12: quadrant count
491 @ here calculate sin φ+ε = sinθ
492 vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
493 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
494 eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
495 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
501 and r0,r0,#0x80000000 @ make signed zero
506 vmul.f32 s3,s3,s9 @ ε³/3!
507 vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
508 vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
510 @ here calculate cos φ+ε = cosθ
511 vmul.f32 s5,s7,s1 @ sinφ sinε
512 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
513 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
520 .word 0x3EFFFEC1 @ ~ 1/2! with PMC
521 .word 0x3e2aaa25 @ ~ 1/3! with PMC
525 .word 0x00000000,0x3f800000,0x00000000
526 .word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b
527 .word 0x3d810576,0x3f7f7dfe,0x3d80ef9e @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b
528 .word 0x3dbf0c09,0x3f7ee30f,0x3dbec522 @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b
529 .word 0x3dff24b6,0x3f7e0414,0x3dfe7be2 @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b
530 .word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0 @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b
531 .word 0x3e40306d,0x3f7b811d,0x3e3f1015 @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b
532 .word 0x3e60ada2,0x3f79dccf,0x3e5ee13e @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b
533 .word 0x3e800d7b,0x3f7808fa,0x3e7d7196 @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b
534 .word 0x3e8f986e,0x3f75ff65,0x3e8db868 @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b
535 .word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b
536 .word 0x3eb054c6,0x3f70f7ae,0x3eacddb3 @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b
537 .word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b
538 .word 0x3ecfc426,0x3f6b35ca,0x3eca1c63 @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b
539 .word 0x3ee054f2,0x3f67d166,0x3ed93907 @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b
540 .word 0x3eeff0dd,0x3f64664b,0x3ee74116 @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b
541 .word 0x3f002b31,0x3f609488,0x3ef5c30f @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b
542 .word 0x3f07e407,0x3f5cc5a2,0x3f01992b @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b
543 .word 0x3f101fc5,0x3f587d8f,0x3f08a165 @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b
544 .word 0x3f17f68a,0x3f5434b5,0x3f0f31ca @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b
545 .word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7 @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b
546 .word 0x3f280646,0x3f4acf6b,0x3f1c37c4 @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b
547 .word 0x3f303041,0x3f45b9e0,0x3f229ae4 @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b
548 .word 0x3f381da7,0x3f4098ca,0x3f28a620 @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b
549 .word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b
550 .word 0x3f4813db,0x3f35b6cc,0x3f34526b @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b
551 .word 0x3f4fc779,0x3f30352f,0x3f39b4d0 @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b
552 .word 0x3f57dd52,0x3f2a4170,0x3f3f2d91 @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b
553 .word 0x3f5fce26,0x3f243e9f,0x3f445dc3 @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b
554 .word 0x3f6825f1,0x3f1dc250,0x3f499d1c @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b
555 .word 0x3f703be1,0x3f175041,0x3f4e7ebf @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b
556 .word 0x3f781078,0x3f10ed71,0x3f5306af @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b
557 .word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b
559 float_wrapper_section tanf
564 cmp r1,#0xff @ Inf/NaN?
566 bl cosf_entry @ this will exit via sintail or costail...
568 @ here C is still set from lsrs r12,r12,#1
570 @ we exited via sintail
571 @ this is fsc_costail:
572 @ here calculate cos φ+ε = cosθ
573 vmul.f32 s5,s7,s1 @ sinφ sinε
574 vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
576 vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
580 eorpl r0,r0,#0x80000000
584 @ we exited via costail
585 @ this is fsc_sintail:
586 @ here calculate sin φ+ε = sinθ
587 vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
588 vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
590 vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
594 eormi r0,r0,#0x80000000
600 bne 1f @ NaN? return it
601 orrs r0,r0,#0x80000000 @ Inf: make a NaN
603 orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
607 float_wrapper_section atan2f
611 orrs r0,r1,#0x00400000
616 cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
617 bne 55f @ no: carry on
618 @ here x and y are both infinities
623 orrs r0,r0,#0x00400000
628 cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
629 bne 65f @ no: carry on
631 @ here x and y are both infinities
632 subs r0,r0,#1 @ make both finite (and equal) with same sign and retry
639 bhs 72f @ y 0 or denormal?
640 @ here both x and y are zeros
646 vdiv.f32 s2,s0,s1 @ restart the division
647 b 73f @ and go back and check for NaNs
652 bhs 82f @ y 0 or denormal?
654 @ here both x and y are zeros
655 orr r1,r1,0x3f800000 @ retry with x replaced by ~1 with appropriate sign
662 vdiv.f32 s2,s1,s0 @ restart the division
663 b 83f @ and go back and check for NaNs
667 bic r2,r0,#0x80000000
668 bic r3,r1,#0x80000000
670 cmp r2,r3 @ |y| vs. |x|
672 @ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+-
673 vdiv.f32 s2,s0,s1 @ get this division started; result ≤1
675 blo 70b @ x 0 or denormal?
677 blo 71b @ y 0 or denormal?
682 bhs 51b @ y Inf or NaN?
686 ldrmi r12,pi @ if x<0, need two extra quadrants
688 @ inner negation is the sign of x
692 @ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+-
693 vdiv.f32 s2,s1,s0 @ result <1
695 blo 80b @ x 0 or denormal?
697 blo 81b @ y 0 or denormal?
702 bhs 61b @ y Inf or NaN?
704 ldr r12,piover2 @ always one extra quadrant in this path
705 eors r1,r1,#0x80000000 @ inner negation is the complement of the sign of x
714 @ s2=s0/s1 or s1/s0, 0≤s2≤1
715 @ r12=quadrant count * π/2
716 @ where the final result is
717 @ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31
721 vcvt.u32.f32 s3,s3,#6
724 adcs r3,r3,#0 @ rounding; set Z if in φ==0 case
726 vldr s5,[r2,#4] @ t=tanφ
727 vmul.f32 s0,s5,s2 @ ty
728 vsub.f32 s1,s2,s5 @ y-t
730 vadd.f32 s0,s5,s0 @ 1+ty
731 beq 9f @ did we look up zeroth table entry?
733 @ now (s0,s1) = (x,y)
734 vdiv.f32 s0,s1,s0 @ ε
736 @ result is now ±(r12±(r2+atn(s0))
737 cmp r1,#0 @ inner negation
741 cmp r0,#0 @ outer negation
758 lsrs r2,r2,#8 @ rounding bit to carry
759 adc r2,r2,r3,lsl#23 @ with rounding
761 vmul.f32 s2,s0,s0 @ ε²
763 vmul.f32 s2,s2,s0 @ ε³
765 vmul.f32 s2,s2,s3 @ ε³/3
767 vsub.f32 s0,s0,s2 @ ~atn(ε)
774 9: @ we looked up the zeroth table entry; we could generate slightly more accurate results here
775 @ now (s0,s1) = (x,y)
776 vdiv.f32 s0,s1,s0 @ ε
777 @ result is now ±(r12±(0+atn(s0))
778 mov r2,r12 @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this
779 cmp r0,#0 @ outer negation
796 lsrs r2,r2,#8 @ rounding bit to carry
797 adc r2,r2,r3,lsl#23 @ with rounding
799 vmul.f32 s2,s0,s0 @ ε²
801 vmul.f32 s2,s2,s0 @ ε³
803 vmul.f32 s2,s2,s3 @ ε³/3
805 vsub.f32 s0,s0,s2 @ ~atn(ε)
810 tst r0,#0x7f800000 @ about to return a denormal?
813 and r0,r0,#0x80000000 @ make it zero
816 piover2: .word 0x3243f6a9 @ Q29
817 pi: .word 0x6487ed51 @ Q29
818 onethird: .float 0.33333333
822 .word 0x00000000,0x00000000
823 .word 0x00ffee23,0x3d0001bb @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b
824 .word 0x01fe88dc,0x3d7f992a @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b
825 .word 0x02fe0a70,0x3dc01203 @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b
826 .word 0x03fad228,0x3e000368 @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b
827 .word 0x04f5ab70,0x3e1ffdea @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b
828 .word 0x05ed56f8,0x3e3fdddc @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b
829 .word 0x06e4cfa0,0x3e601425 @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b
830 .word 0x07d8d3e0,0x3e80215d @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b
831 .word 0x08c60460,0x3e9000a5 @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b
832 .word 0x09b26770,0x3ea01492 @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b
833 .word 0x0a996d50,0x3eb01377 @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b
834 .word 0x0b7a6d10,0x3ebff4a0 @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b
835 .word 0x0c593ce0,0x3ed0019f @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b
836 .word 0x0d33ebd0,0x3ee01bbc @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b
837 .word 0x0e087ab0,0x3ef01fbd @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b
838 .word 0x0ed56180,0x3effef98 @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b
839 .word 0x0fa1de80,0x3f080ebf @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b
840 .word 0x10639d00,0x3f0fec31 @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b
841 .word 0x112690e0,0x3f180cfd @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b
842 .word 0x11e014c0,0x3f200065 @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b
843 .word 0x129651e0,0x3f2808be @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b
844 .word 0x1346d400,0x3f300a7d @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b
845 .word 0x13efc7c0,0x3f37ee2f @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b
846 .word 0x14988960,0x3f400c32 @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b
847 .word 0x1537a8c0,0x3f47ef42 @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b
848 .word 0x15d4cc60,0x3f4ff630 @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b
849 .word 0x166ef280,0x3f581534 @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b
850 .word 0x16ff75c0,0x3f5fef1e @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b
851 .word 0x179116a0,0x3f68184d @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b
852 .word 0x181b5aa0,0x3f701722 @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b
853 .word 0x18a10560,0x3f781071 @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b
854 .word 0x19214060,0x3f7ff451 @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b