lib/main/pico-sdk/rp2_common/pico_float/float_sci_m33_vfp.S

   1 /*
   2  * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
   3  *
   4  * SPDX-License-Identifier: BSD-3-Clause
   5  */
   6
   7 #if !PICO_RP2040
   8 #include "pico/asm_helper.S"
   9
  10 pico_default_asm_setup
  11
  12 .macro float_section name
  13 #if PICO_FLOAT_IN_RAM
  14 .section RAM_SECTION_NAME(\name), "ax"
  15 #else
  16 .section SECTION_NAME(\name), "ax"
  17 #endif
  18 .endm
  19
  20 .macro float_wrapper_section func
  21 float_section WRAPPER_FUNC_NAME(\func)
  22 .endm
  23
  24 @ load a 32-bit constant n into register rx
  25 .macro movlong rx,n
  26  movw \rx,#(\n)&0xffff
  27  movt \rx,#((\n)>>16)&0xffff
  28 .endm
  29
  30 float_section frrcore_v
  31
  32 // 1/2π to plenty of accuracy
  33 .long 0                      @ this allows values of e down to -32
  34 rtwopi:
  35 .long 0,0
  36 .long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410
  37
  38 @ input:
  39 @ r0 mantissa m Q23
  40 @ r1 exponent e>=-32, typically offset by +9
  41 @ output:
  42 @ r0..r1 preserved
  43 @ r6 range reduced result in revolutions Q32
  44 @ r2,r3,r4,r5 trashed
  45 .thumb_func
  46 frr_core:
  47  adr r2,rtwopi
  48  asrs r3,r1,#5               @ k=e/32, k<=5 for e offsets up to 9+32
  49  add r2,r2,r3,lsl#2          @ p
  50  and r3,r1,#31               @ s=e%32
  51  mov r4,#1
  52  lsls r4,r4,r3               @ 1<<s
  53  umull r3,r4,r4,r0
  54 @ r2    p
  55 @ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
  56  ldr r5,[r2,#12]             @ a0=p[3]
  57  umull r5,r6,r5,r4           @ r0=a0*u1 hi, discard lo
  58 @ r6  r0
  59  ldr r5,[r2,#8]              @ a1=p[2]
  60  mla r6,r5,r4,r6             @ a1*u1 lo, discard hi
  61  umlal r5,r6,r5,r3           @ a1*u0 hi, discard lo
  62 @ r6  r0
  63  ldr r5,[r2,#4]              @ a2=p[1]
  64  mla r6,r5,r3,r6             @ r0+=a2*u0
  65  bx r14
  66
  67 float_wrapper_section expf
  68
  69 wrapper_func expf
  70 @ soft float version, via 2^x
  71  asrs r1,r0,#23
  72  bmi 1f
  73  cmp r1,#0x85
  74  bge 3f
  75 10:
  76  movs r2,#1
  77  bfi r0,r2,#23,#9
  78  subs r1,#0x7e
  79  bmi 2f
  80  lsl r0,r1                   @ x Q24
  81 11:
  82  movlong r3,0x5c551d95       @ 1/log(2) Q30
  83  smull r0,r1,r0,r3           @ Q54
  84  adr r2,k_exp2
  85  vldmia r2,{s8-s10}
  86  lsrs r2,r0,#22
  87  bfi r2,r1,#10,#17           @ ε Q32
  88  vmov s0,r2
  89  vcvt.f32.u32 s0,s0,#32
  90  vmul.f32 s1,s0,s0
  91  ubfx r2,r1,#17,#5
  92  vmul.f32 s4,s0,s8
  93  adr r3,exptab3
  94  vmul.f32 s2,s0,s1
  95  ldr r2,[r3,r2,lsl#2]
  96  vmla.f32 s4,s1,s9
  97  asrs r1,#22
  98  vmla.f32 s4,s2,s10
  99  add r2,r2,r1,lsl#23
 100  vmov s0,r2
 101  vmla.f32 s0,s0,s4
 102  vmov r0,s0
 103  bx r14
 104
 105 2:                           @ x≤0.5
 106  rsbs r1,#0
 107  lsrs r0,r1
 108 @ adc r0,#0                  @ rounding not needed
 109  b 11b
 110
 111 3:                           @ risk of overflow, Inf,NaN
 112  movlong r2,0x42B17218
 113  cmp r0,r2
 114  blo 10b                     @ in range after all
 115  cmp r0,#0x7f800000
 116  bls 4f                      @ not NaN?
 117  orrs r0,#0x00400000
 118  bx r14
 119
 120 4:
 121  movlong r0,0x7f800000       @ return +Inf
 122  bx r14
 123
 124 1:                           @ x<0, r1=0xffffffXX where XX is biased exponent
 125  cmn r1,#0x7b
 126  bge 5f                      @ risk of underflow, -Inf, -NaN?
 127 13:
 128  movs r2,#1
 129  bfi r0,r2,#23,#9
 130  adds r1,#0x82
 131  bpl 6f
 132  rsbs r1,#0
 133  lsrs r0,r1
 134  adc r0,r0,#0                @ rounding
 135 12:
 136  rsbs r0,#0
 137  b 11b
 138
 139 6:
 140  lsls r0,r1
 141  b 12b
 142
 143 5:
 144  movlong r2,0xC2AEAC4F
 145  cmp r0,r2
 146  bls 13b
 147  cmp r0,#0xff800000
 148  bls 14f
 149  orrs r0,#0x00400000
 150  bx r14
 151 14:
 152  mov r0,#0
 153  bx r14
 154
 155 .align 3
 156 k_exp2:
 157 .float 0.693147181           @ log2
 158 .float 0.240226507           @ log²2/2
 159 .float 0.055504109           @ log³2/6
 160
 161 exptab3:                     @ pow(2,[0..31]/32)
 162 .word 0x3f800000
 163 .word 0x3f82cd87
 164 .word 0x3f85aac3
 165 .word 0x3f88980f
 166 .word 0x3f8b95c2
 167 .word 0x3f8ea43a
 168 .word 0x3f91c3d3
 169 .word 0x3f94f4f0
 170 .word 0x3f9837f0
 171 .word 0x3f9b8d3a
 172 .word 0x3f9ef532
 173 .word 0x3fa27043
 174 .word 0x3fa5fed7
 175 .word 0x3fa9a15b
 176 .word 0x3fad583f
 177 .word 0x3fb123f6
 178 .word 0x3fb504f3
 179 .word 0x3fb8fbaf
 180 .word 0x3fbd08a4
 181 .word 0x3fc12c4d
 182 .word 0x3fc5672a
 183 .word 0x3fc9b9be
 184 .word 0x3fce248c
 185 .word 0x3fd2a81e
 186 .word 0x3fd744fd
 187 .word 0x3fdbfbb8
 188 .word 0x3fe0ccdf
 189 .word 0x3fe5b907
 190 .word 0x3feac0c7
 191 .word 0x3fefe4ba
 192 .word 0x3ff5257d
 193 .word 0x3ffa83b3
 194
 195 float_wrapper_section logf
 196
 197 wrapper_func logf
 198  cmp r0,#0x7f800000          @ catch Inf, NaN, -ve
 199  bhs 1f
 200  asrs r1,r0,#23              @ get exponent; C from here is preserved...
 201  beq 2f                      @ ±0?
 202  mov r2,#1
 203  bfi r0,r2,#23,#9            @ fix implied 1
 204  it cc                       @ 50% ... to here...
 205  lslcc r0,#1                 @ this plus sbc below means we work relative to nearby power of 2
 206  adr r3,#k_log3
 207  vldmia r3,{s8-s10}
 208 @ 0x00c00000 ≤ r0 < 0x017fffff
 209  adr r3,logtab3-24*8+4
 210  add r3,r3,r0,lsr#16         @ look up r0>>19 rounded, preserving flags
 211  bic r3,#7
 212
 213  ldrd r2,r3,[r3]
 214  mul r0,r0,r2                @ ε
 215  vmov s0,s1,r3,r0            @ s0=-log u, s1=ε
 216
 217  vcvt.f32.s32 s1,s1,#32
 218  vmul.f32 s2,s1,s1           @ power series in ε
 219  sbc r1,r1,#0x7e             @ ... and here
 220  vmul.f32 s3,s1,s2
 221  lsls r1,#23                 @ e Q23
 222  vmul.f32 s4,s2,s2           @ to ε⁴
 223 @ movlong r2,0x58b90bfc      @ log 2 Q31, more accurate than we deserve
 224  movw r2,0x0bfc
 225  vmul.f32 s2,s2,s8
 226  movt r2,0x58b9
 227  vmul.f32 s3,s3,s9
 228  smmulr r1,r1,r2             @ Q22
 229  vmul.f32 s4,s4,s10
 230  vmov s7,r1
 231  vsub.f32 s3,s3,s4
 232  vcvt.f32.s32 s7,s7,#22
 233  vsub.f32 s2,s2,s3
 234  vsub.f32 s1,s1,s2
 235  vadd.f32 s0,s0,s1           @ log ε - log u
 236  vadd.f32 s0,s0,s7           @ e log 2 + log ε - log u
 237  vmov r0,s0
 238  bx r14
 239
 240 1:
 241  bgt 3f                      @ +NaN?
 242  beq 10f                     @ +Inf?
 243 2:
 244  cmp r0,#0x80800000          @ -0?
 245  blo 11f
 246  cmp r0,#0xff800000          @ -NaN/-Inf?
 247 3:
 248  orr r0,#0x00400000
 249  bhi 10f
 250  movlong r0,0xffc00000
 251 10:
 252  bx r14
 253 11:
 254  movlong r0,0xff800000
 255  bx r14
 256
 257 .align 3
 258 k_log3:
 259 .float 0.5
 260 .float 0.333333333333333
 261 .float 0.25
 262 .float 0                     @ alignment
 263
 264 logtab3:
 265 @ u=64/[48:2:96]; u Q8, -log u F32
 266 .word 0x0155,0xbe92cb01      @ 00003e9b..00004145
 267 .word 0x0148,0xbe7dc8c3      @ 00003ec8..00004158
 268 .word 0x013b,0xbe545f68      @ 00003ec1..00004137
 269 .word 0x012f,0xbe2c99c7      @ 00003ebb..00004119
 270 .word 0x0125,0xbe0a3c2c      @ 00003ef3..0000413d
 271 .word 0x011a,0xbdc61a2f      @ 00003eca..000040fe
 272 .word 0x0111,0xbd83acc2      @ 00003eeb..0000410d
 273 .word 0x0108,0xbcfc14d8      @ 00003ee8..000040f8
 274 .word 0x0100,0x00000000      @ 00003f00..00004100
 275 .word 0x00f8,0x3d020aec      @ 00003ef8..000040e8
 276 .word 0x00f1,0x3d77518e      @ 00003f13..000040f5
 277 .word 0x00ea,0x3db80698      @ 00003f12..000040e6
 278 .word 0x00e4,0x3ded393b      @ 00003f3c..00004104
 279 .word 0x00dd,0x3e168b08      @ 00003f05..000040bf
 280 .word 0x00d8,0x3e2dfa03      @ 00003f48..000040f8
 281 .word 0x00d2,0x3e4ad2d7      @ 00003f2a..000040ce
 282 .word 0x00cd,0x3e637fde      @ 00003f43..000040dd
 283 .word 0x00c8,0x3e7cc8e3      @ 00003f48..000040d8
 284 .word 0x00c3,0x3e8b5ae6      @ 00003f39..000040bf
 285 .word 0x00bf,0x3e95f784      @ 00003f6b..000040e9
 286 .word 0x00ba,0x3ea38c6e      @ 00003f36..000040aa
 287 .word 0x00b6,0x3eaeadef      @ 00003f46..000040b2
 288 .word 0x00b2,0x3eba0ec4      @ 00003f46..000040aa
 289 .word 0x00ae,0x3ec5b1cd      @ 00003f36..00004092
 290 .word 0x00ab,0x3ece995f      @ 00003f75..000040cb
 291
 292 float_wrapper_section fsin_fcos
 293
 294 30:
 295  lsls r1,r0,#9
 296  bne 1f                      @ NaN? return it
 297  orrs r0,r0,#0x80000000      @ Inf: make a NaN
 298 1:
 299  orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
 300  bx r14
 301
 302 @ heavy-duty range reduction
 303 @ here x≥256, -e in r1
 304 40:
 305  push {r4-r7,r14}
 306  movs r3,#1
 307  bfi r0,r3,#23,#9            @ insert implied 1 in mantissa, clear sign
 308  rsb r1,#9                   @ e+9
 309  mov r7,#0x7e                @ this will be the exponent of the reduced angle - 1
 310 42:
 311  bl frr_core
 312 @ here r6 is revolutions Q32
 313  lsrs r3,r6,#30              @ quadrant count
 314  adcs r3,r3,#0               @ rounded
 315  add r12,r12,r3
 316  subs r6,r6,r3,lsl#30        @ reduced angle/2π Q32 -.125≤x<+.125
 317 @ comment out from here...
 318  lsls r2,r6,#2               @ Q34
 319  it cs
 320  rsbcs r2,r2,#0              @ absolute value
 321  cmp r2,#1<<28               @ big enough for accuracy?
 322  bhs 41f
 323 @ ... to here for slightly better accuracy
 324 43:
 325  adds r1,r1,#2               @ try again with increased exponent
 326  bl frr_core
 327  eors r2,r6,r6,asr#32        @ absolute value
 328  adc r2,r2,#0
 329  cmp r2,#1<<28               @ big enough yet?
 330  bhs 44f
 331  subs r7,r7,#2
 332  bpl 43b                     @ safety net
 333 44:
 334
 335 41:
 336  ldr r4,=0xC90FDAA2          @ 2π Q29
 337  umull r2,r4,r2,r4           @ r4 has reduced angle Q34+Q29-Q32=Q31
 338 @ add r4,r4,r2,lsr#31
 339  clz r2,r4                   @ normalise
 340  lsls r4,r4,r2
 341  lsrs r4,r4,#8
 342  sub r2,r7,r2
 343  adc r0,r4,r2,lsl#23         @ with rounding
 344  lsrs r1,r0,#23              @ re-extract exponent as there may have been a carry into it
 345  rsbs r1,r1,#0x7f            @ prepare exponent for re-entry
 346  lsrs r6,r6,#31
 347  add r3,r0,r6,lsl#31         @ apply sign of reduced angle
 348  pop {r4-r7,r14}
 349  b 5f                        @ re-enter with no risk of looping
 350
 351 .ltorg
 352
 353 @ light-duty range reduction
 354 @ here argument ≥1
 355 @ r0: argument
 356 @ r1: -e
 357 @ r12: quadrant count
 358 @ required result is sin(r0+r12*π/2)
 359 10:
 360  cmn r1,#0x80
 361  beq 30b                     @ Inf/NaN
 362  bics r2,r0,r12,lsl#31       @ negative argument,doing sin -> +2 quadrants
 363  it mi
 364  addmi r12,r12,#2
 365  bic r0,r0,#0x80000000       @ make positive: original sign is now captured in quadrant count in r12
 366
 367 @ this may not actually be faster than doing it in integer registers
 368  vmov s0,r0
 369  adr r2,k_sc4
 370  vldmia r2!,{s5-s7}
 371 @ vmul.f32 s4,s4,s0          @ this accurate calculation of the quadrant count does not seem necessary
 372 @ vfma.f32 s4,s5,s0
 373  vmul.f32 s4,s5,s0           @ this is BALGE
 374  cmn r1,#8                   @ ≥256?
 375  vrintn.f32.f32 s4,s4        @ round to quadrant count: x<256 so count≤163
 376  ble 40b                     @ then do heavy-duty range reduction
 377  vfms.f32 s0,s4,s7
 378  vfms.f32 s0,s4,s6
 379  vmov r3,s0                  @ reduced angle
 380  vcvt.s32.f32 s3,s4
 381  ubfx r2,r3,#23,#8           @ get exponent
 382  cmp r2,#0x78
 383  blo 40b                     @ very small result? use heavy-duty reduction to get a more accurate answer
 384  rsbs r1,r2,#0x7f            @ ready for re-entry
 385  vmov r2,s3                  @ integer quadrant count
 386  add r12,r12,r2
 387 @ prepare to re-enter with no risk of looping
 388  b 5f
 389
 390 k_sc4:
 391 @ 2/π=0.A2F9836E4E441529FC...
 392 .word 0x3f22f983             @ 2/π
 393 @ π/2=1.921FB54442D1846989...
 394 .word 0xb695777a,0x3fc91000  @ these two add up to π/2 with error ~1.6e-13
 395
 396 wrapper_func sincosf
 397
 398  push {r0-r2,r14}
 399  ubfx r1,r0,#23,#8
 400  cmp r1,#0xff                @ Inf/NaN?
 401  beq 2f
 402  bl cosf_entry               @ this will exit via 1f or 2f...
 403  pop {r1-r2,r14}
 404  str r0,[r14]
 405 @ here C is still set from lsrs r12,r12,#1
 406  bcs 1f
 407  mvns r1,r1
 408  eor r12,r12,r1,lsr#31
 409 @ this is fsc_costail:
 410 @ here calculate cos φ+ε = cosθ
 411  vmul.f32 s5,s7,s1           @ sinφ sinε
 412  vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
 413  vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
 414  vmov.f32 r0,s5
 415  eor r0,r0,r12,lsl#31
 416  str r0,[r2]
 417  pop {r15}
 418
 419 1:
 420  eor r12,r12,r1,lsr#31
 421 @ this is fsc_sintail:
 422 @ here calculate sin φ+ε = sinθ
 423  vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
 424  vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
 425  eor r1,r12,r3,lsr#31        @ flip sign if (reduced) argument was negative
 426  vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
 427  vmov.f32 r0,s4
 428  eor r0,r0,r1,lsl#31
 429  str r0,[r2]                 @ save cos result
 430  pop {r15}
 431
 432 @ sincos of Inf or NaN
 433 2:
 434  lsls r1,r0,#9
 435  pop {r1-r3,r14}
 436  bne 1f                      @ NaN? return it
 437  orrs r0,r0,#0x80000000      @ Inf: make a NaN
 438 1:
 439  orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
 440  str r0,[r2]                 @ both sin and cos results
 441  str r0,[r3]
 442  bx r14
 443
 444 wrapper_func sinf
 445 @ r12b1..0: quadrant count
 446  movs r12,#0
 447  b 1f
 448
 449 wrapper_func cosf
 450 .thumb_func
 451 cosf_entry:
 452  movs r12,#1                 @ cos -> +1 quadrant
 453 1:
 454  ubfx r1,r0,#23,#8           @ get exponent
 455  cbz r1,20f                  @ 0/denormal?
 456 22:
 457  rsbs r1,r1,#0x7f
 458  bls 10b                     @ argument ≥1? needs reduction; also Inf/NaN handling
 459  bic r3,r0,r12,lsl#31        @ this would mess up NaNs so do it here
 460 5:
 461 @ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2
 462  bic r0,r3,#0x80000000       @ this would mess up NaNs so do it here
 463  vmov s0,r0
 464  ubfx r0,r0,#18,#5           @ extract top of mantissa
 465  adds r0,r0,#32              @ insert implied 1
 466  lsrs r1,r0,r1               @ to fixed point Q5
 467  ldr r2,=k_sc3
 468  adcs r1,r1,#0               @ rounding
 469  vldmia r2!,{s8-s9}
 470  add r2,r2,r1,lsl#2          @ 12 bytes per entry
 471  add r2,r2,r1,lsl#3
 472
 473  vldmia r2,{s5-s7}           @ φ, cosφ, sinφ
 474  vsub.f32 s1,s0,s5           @ ε
 475  vmul.f32 s2,s1,s1           @ ε²
 476  lsrs r12,r12,#1             @ computing cosine?
 477  vmul.f32 s3,s2,s1           @ ε³
 478  bcs 2f
 479
 480  vmul.f32 s2,s2,s8           @ ε²/2! ~ 1-cosε
 481  vmul.f32 s3,s3,s9           @ ε³/3!
 482  vsub.f32 s1,s1,s3           @ ε-ε³/3! ~ sinε
 483
 484 @ here:
 485 @ s1: sinε
 486 @ s2: 1-cosε
 487 @ s6: cosφ
 488 @ s7: sinφ
 489 @ r12: quadrant count
 490 fsc_sintail:
 491 @ here calculate sin φ+ε = sinθ
 492  vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
 493  vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
 494  eor r1,r12,r3,lsr#31        @ flip sign if (reduced) argument was negative
 495  vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
 496  vmov.f32 r0,s4
 497  eor r0,r0,r1,lsl#31
 498  bx r14
 499
 500 20:
 501  and r0,r0,#0x80000000       @ make signed zero
 502  b 22b
 503
 504 .align 2
 505 2:
 506  vmul.f32 s3,s3,s9           @ ε³/3!
 507  vsub.f32 s1,s1,s3           @ ε-ε³/3! ~ sinε
 508  vmul.f32 s2,s2,s8           @ ε²/2! ~ 1-cosε
 509 fsc_costail:
 510 @ here calculate cos φ+ε = cosθ
 511  vmul.f32 s5,s7,s1           @ sinφ sinε
 512  vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
 513  vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
 514  vmov.f32 r0,s5
 515  eor r0,r0,r12,lsl#31
 516  bx r14
 517
 518 .align 3
 519 k_sc3:
 520 .word 0x3EFFFEC1             @ ~ 1/2! with PMC
 521 .word 0x3e2aaa25             @ ~ 1/3! with PMC
 522
 523 trigtab2:
 524 //      φ          cos φ      sin φ
 525 .word 0x00000000,0x3f800000,0x00000000
 526 .word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c           @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b
 527 .word 0x3d810576,0x3f7f7dfe,0x3d80ef9e           @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b
 528 .word 0x3dbf0c09,0x3f7ee30f,0x3dbec522           @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b
 529 .word 0x3dff24b6,0x3f7e0414,0x3dfe7be2           @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b
 530 .word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0           @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b
 531 .word 0x3e40306d,0x3f7b811d,0x3e3f1015           @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b
 532 .word 0x3e60ada2,0x3f79dccf,0x3e5ee13e           @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b
 533 .word 0x3e800d7b,0x3f7808fa,0x3e7d7196           @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b
 534 .word 0x3e8f986e,0x3f75ff65,0x3e8db868           @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b
 535 .word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd           @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b
 536 .word 0x3eb054c6,0x3f70f7ae,0x3eacddb3           @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b
 537 .word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a           @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b
 538 .word 0x3ecfc426,0x3f6b35ca,0x3eca1c63           @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b
 539 .word 0x3ee054f2,0x3f67d166,0x3ed93907           @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b
 540 .word 0x3eeff0dd,0x3f64664b,0x3ee74116           @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b
 541 .word 0x3f002b31,0x3f609488,0x3ef5c30f           @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b
 542 .word 0x3f07e407,0x3f5cc5a2,0x3f01992b           @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b
 543 .word 0x3f101fc5,0x3f587d8f,0x3f08a165           @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b
 544 .word 0x3f17f68a,0x3f5434b5,0x3f0f31ca           @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b
 545 .word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7           @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b
 546 .word 0x3f280646,0x3f4acf6b,0x3f1c37c4           @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b
 547 .word 0x3f303041,0x3f45b9e0,0x3f229ae4           @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b
 548 .word 0x3f381da7,0x3f4098ca,0x3f28a620           @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b
 549 .word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a           @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b
 550 .word 0x3f4813db,0x3f35b6cc,0x3f34526b           @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b
 551 .word 0x3f4fc779,0x3f30352f,0x3f39b4d0           @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b
 552 .word 0x3f57dd52,0x3f2a4170,0x3f3f2d91           @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b
 553 .word 0x3f5fce26,0x3f243e9f,0x3f445dc3           @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b
 554 .word 0x3f6825f1,0x3f1dc250,0x3f499d1c           @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b
 555 .word 0x3f703be1,0x3f175041,0x3f4e7ebf           @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b
 556 .word 0x3f781078,0x3f10ed71,0x3f5306af           @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b
 557 .word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b           @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b
 558
 559 float_wrapper_section tanf
 560
 561 wrapper_func tanf
 562  push {r0,r14}
 563  ubfx r1,r0,#23,#8
 564  cmp r1,#0xff                @ Inf/NaN?
 565  beq 2f
 566  bl cosf_entry               @ this will exit via sintail or costail...
 567  ldr r1,[sp,#0]
 568 @ here C is still set from lsrs r12,r12,#1
 569  bcs 1f
 570 @ we exited via sintail
 571 @ this is fsc_costail:
 572 @ here calculate cos φ+ε = cosθ
 573  vmul.f32 s5,s7,s1           @ sinφ sinε
 574  vfma.f32 s5,s2,s6           @ sinφ sinε + cosφ(1-cosε)
 575  eors r1,r1,r3
 576  vsub.f32 s5,s6,s5           @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
 577  vdiv.f32 s0,s5,s4
 578  vmov.f32 r0,s0
 579  it pl
 580  eorpl r0,r0,#0x80000000
 581  pop {r1,r15}
 582
 583 1:
 584 @ we exited via costail
 585 @ this is fsc_sintail:
 586 @ here calculate sin φ+ε = sinθ
 587  vmul.f32 s4,s2,s7           @ sinφ(1-cosε)
 588  vfms.f32 s4,s6,s1           @ sinφ(1-cosε) - cosφ sinε
 589  eors r1,r1,r3
 590  vsub.f32 s4,s7,s4           @ cosφ sinε + sinφ cosε
 591  vdiv.f32 s0,s4,s5
 592  vmov.f32 r0,s0
 593  it mi
 594  eormi r0,r0,#0x80000000
 595  pop {r1,r15}
 596
 597 @ tan of Inf or NaN
 598 2:
 599  lsls r1,r0,#9
 600  bne 1f                      @ NaN? return it
 601  orrs r0,r0,#0x80000000      @ Inf: make a NaN
 602 1:
 603  orrs r0,r0,#0x00400000      @ set top mantissa bit of NaN
 604  pop {r3,r15}
 605
 606
 607 float_wrapper_section atan2f
 608
 609 50:
 610 60:
 611  orrs r0,r1,#0x00400000
 612  bx r14
 613
 614 51:
 615  bne 52f                     @ NaN?
 616  cmp r3,#0x7f800000          @ y an infinity; x an infinity too?
 617  bne 55f                     @ no: carry on
 618 @ here x and y are both infinities
 619  b 66f
 620
 621 52:
 622 62:
 623  orrs r0,r0,#0x00400000
 624  bx r14
 625
 626 61:
 627  bne 62b                     @ NaN?
 628  cmp r3,#0x7f800000          @ y an infinity; x an infinity too?
 629  bne 65f                     @ no: carry on
 630 66:
 631 @ here x and y are both infinities
 632  subs r0,r0,#1               @ make both finite (and equal) with same sign and retry
 633  subs r1,r1,#1
 634  b 86f
 635
 636 70:
 637  and r3,#0x80000000
 638  cmp r2,#0x00800000
 639  bhs 72f                     @ y 0 or denormal?
 640 @ here both x and y are zeros
 641  b 85f
 642 71:
 643  and r2,#0x80000000
 644 72:
 645  vmov s0,s1,r2,r3
 646  vdiv.f32 s2,s0,s1           @ restart the division
 647  b 73f                       @ and go back and check for NaNs
 648
 649 80:
 650  and r3,#0x80000000
 651  cmp r2,#0x00800000
 652  bhs 82f                     @ y 0 or denormal?
 653 85:
 654 @ here both x and y are zeros
 655  orr r1,r1,0x3f800000        @ retry with x replaced by ~1 with appropriate sign
 656  b 86f
 657
 658 81:
 659  and r2,#0x80000000
 660 82:
 661  vmov s0,s1,r2,r3
 662  vdiv.f32 s2,s1,s0           @ restart the division
 663  b 83f                       @ and go back and check for NaNs
 664
 665 wrapper_func atan2f
 666 86:
 667  bic r2,r0,#0x80000000
 668  bic r3,r1,#0x80000000
 669  vmov s0,s1,r2,r3
 670  cmp r2,r3                   @ |y| vs. |x|
 671  bhi 1f
 672 @ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+-
 673  vdiv.f32 s2,s0,s1           @ get this division started; result ≤1
 674  cmp r3,#0x00800000
 675  blo 70b                     @ x 0 or denormal?
 676  cmp r2,#0x00800000
 677  blo 71b                     @ y 0 or denormal?
 678 73:
 679  cmp r3,#0x7f800000
 680  bhi 50b                     @ x NaN?
 681  cmp r2,#0x7f800000
 682  bhs 51b                     @ y Inf or NaN?
 683 55:
 684  cmp r1,#0
 685  ite mi
 686  ldrmi r12,pi                @ if x<0, need two extra quadrants
 687  movpl r12,#0
 688                              @ inner negation is the sign of x
 689  b 2f
 690
 691 1:
 692 @ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+-
 693  vdiv.f32 s2,s1,s0           @ result <1
 694  cmp r3,#0x00800000
 695  blo 80b                     @ x 0 or denormal?
 696  cmp r2,#0x00800000
 697  blo 81b                     @ y 0 or denormal?
 698 83:
 699  cmp r3,#0x7f800000
 700  bhi 60b                     @ x NaN?
 701  cmp r2,#0x7f800000
 702  bhs 61b                     @ y Inf or NaN?
 703 65:
 704  ldr r12,piover2             @ always one extra quadrant in this path
 705  eors r1,r1,#0x80000000      @ inner negation is the complement of the sign of x
 706
 707 2:
 708 @ here
 709 @ r0 y
 710 @ r1 ±x
 711 @ r2 |y|
 712 @ r3 |x|
 713 @ s0,s1 = |x|,|y|
 714 @ s2=s0/s1 or s1/s0, 0≤s2≤1
 715 @ r12=quadrant count * π/2
 716 @ where the final result is
 717 @ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31
 718
 719  adr r2,trigtab3
 720  vmov.f32 s3,s2
 721  vcvt.u32.f32 s3,s3,#6
 722  vmov.f32 r3,s3
 723  lsrs r3,r3,#1
 724  adcs r3,r3,#0               @ rounding; set Z if in φ==0 case
 725  add r2,r2,r3,lsl#3
 726  vldr s5,[r2,#4]             @ t=tanφ
 727  vmul.f32 s0,s5,s2           @ ty
 728  vsub.f32 s1,s2,s5           @ y-t
 729  vmov.f32 s5,#1.0
 730  vadd.f32 s0,s5,s0           @ 1+ty
 731  beq 9f                      @ did we look up zeroth table entry?
 732
 733 @ now (s0,s1) = (x,y)
 734  vdiv.f32 s0,s1,s0           @ ε
 735  ldr r2,[r2]                 @ φ Q29
 736 @ result is now ±(r12±(r2+atn(s0))
 737  cmp r1,#0                   @ inner negation
 738  it mi
 739  rsbmi r2,r2,#0
 740  add r2,r12,r2               @ Q29
 741  cmp r0,#0                   @ outer negation
 742  it mi
 743  rsbmi r2,r2,#0
 744  cmp r2,#0
 745  bpl 1f
 746  rsbs r2,r2,#0
 747  clz r3,r2
 748  lsls r2,r2,r3
 749  beq 3f
 750  rsb r3,#0x180
 751  b 2f
 752 1:
 753  clz r3,r2
 754  lsls r2,r2,r3
 755  beq 3f
 756  rsb r3,#0x80
 757 2:
 758  lsrs r2,r2,#8               @ rounding bit to carry
 759  adc r2,r2,r3,lsl#23         @ with rounding
 760 3:
 761  vmul.f32 s2,s0,s0           @ ε²
 762  vldr.f32 s3,onethird
 763  vmul.f32 s2,s2,s0           @ ε³
 764  teq r0,r1
 765  vmul.f32 s2,s2,s3           @ ε³/3
 766  vmov.f32 s4,r2
 767  vsub.f32 s0,s0,s2           @ ~atn(ε)
 768  ite pl
 769  vaddpl.f32 s0,s4,s0
 770  vsubmi.f32 s0,s4,s0
 771  vmov.f32 r0,s0
 772  bx r14
 773
 774 9:                           @ we looked up the zeroth table entry; we could generate slightly more accurate results here
 775 @ now (s0,s1) = (x,y)
 776  vdiv.f32 s0,s1,s0           @ ε
 777 @ result is now ±(r12±(0+atn(s0))
 778  mov r2,r12                  @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this
 779  cmp r0,#0                   @ outer negation
 780  it mi
 781  rsbmi r2,r2,#0
 782  cmp r2,#0
 783  bpl 1f
 784  rsbs r2,r2,#0
 785  clz r3,r2
 786  lsls r2,r2,r3
 787  beq 3f
 788  rsb r3,#0x180
 789  b 2f
 790 1:
 791  clz r3,r2
 792  lsls r2,r2,r3
 793  beq 3f
 794  rsb r3,#0x80
 795 2:
 796  lsrs r2,r2,#8               @ rounding bit to carry
 797  adc r2,r2,r3,lsl#23         @ with rounding
 798 3:
 799  vmul.f32 s2,s0,s0           @ ε²
 800  vldr.f32 s3,onethird
 801  vmul.f32 s2,s2,s0           @ ε³
 802  teq r0,r1
 803  vmul.f32 s2,s2,s3           @ ε³/3
 804  vmov.f32 s4,r2
 805  vsub.f32 s0,s0,s2           @ ~atn(ε)
 806  ite pl
 807  vaddpl.f32 s0,s4,s0
 808  vsubmi.f32 s0,s4,s0
 809  vmov.f32 r0,s0
 810  tst r0,#0x7f800000          @ about to return a denormal?
 811  it ne
 812  bxne r14
 813  and r0,r0,#0x80000000       @ make it zero
 814  bx r14
 815
 816 piover2:  .word 0x3243f6a9   @ Q29
 817 pi:       .word 0x6487ed51   @ Q29
 818 onethird: .float 0.33333333
 819
 820 trigtab3:
 821 //      φ Q29      tan φ SP
 822 .word 0x00000000,0x00000000
 823 .word 0x00ffee23,0x3d0001bb  @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b
 824 .word 0x01fe88dc,0x3d7f992a  @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b
 825 .word 0x02fe0a70,0x3dc01203  @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b
 826 .word 0x03fad228,0x3e000368  @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b
 827 .word 0x04f5ab70,0x3e1ffdea  @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b
 828 .word 0x05ed56f8,0x3e3fdddc  @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b
 829 .word 0x06e4cfa0,0x3e601425  @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b
 830 .word 0x07d8d3e0,0x3e80215d  @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b
 831 .word 0x08c60460,0x3e9000a5  @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b
 832 .word 0x09b26770,0x3ea01492  @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b
 833 .word 0x0a996d50,0x3eb01377  @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b
 834 .word 0x0b7a6d10,0x3ebff4a0  @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b
 835 .word 0x0c593ce0,0x3ed0019f  @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b
 836 .word 0x0d33ebd0,0x3ee01bbc  @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b
 837 .word 0x0e087ab0,0x3ef01fbd  @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b
 838 .word 0x0ed56180,0x3effef98  @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b
 839 .word 0x0fa1de80,0x3f080ebf  @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b
 840 .word 0x10639d00,0x3f0fec31  @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b
 841 .word 0x112690e0,0x3f180cfd  @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b
 842 .word 0x11e014c0,0x3f200065  @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b
 843 .word 0x129651e0,0x3f2808be  @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b
 844 .word 0x1346d400,0x3f300a7d  @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b
 845 .word 0x13efc7c0,0x3f37ee2f  @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b
 846 .word 0x14988960,0x3f400c32  @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b
 847 .word 0x1537a8c0,0x3f47ef42  @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b
 848 .word 0x15d4cc60,0x3f4ff630  @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b
 849 .word 0x166ef280,0x3f581534  @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b
 850 .word 0x16ff75c0,0x3f5fef1e  @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b
 851 .word 0x179116a0,0x3f68184d  @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b
 852 .word 0x181b5aa0,0x3f701722  @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b
 853 .word 0x18a10560,0x3f781071  @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b
 854 .word 0x19214060,0x3f7ff451  @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b
 855
 856 #endif