arch/sh/lib/memcpy-sh4.S

   1 /*
   2  * "memcpy" implementation of SuperH
   3  *
   4  * Copyright (C) 1999  Niibe Yutaka
   5  * Copyright (c) 2002  STMicroelectronics Ltd
   6  *   Modified from memcpy.S and micro-optimised for SH4
   7  *   Stuart Menefy (stuart.menefy@st.com)
   8  *
   9  */
  10 #include <linux/linkage.h>
  11
  12 /*
  13  * void *memcpy(void *dst, const void *src, size_t n);
  14  *
  15  * It is assumed that there is no overlap between src and dst.
  16  * If there is an overlap, then the results are undefined.
  17  */
  18
  19         ! .Lcase1: bulk copy for (src - dst) & 3 == 1, working from the end down
  20         !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
  21         ! In: r0 = long aligned dst end, r0+r5 = matching src, r4 = dst start
  22
  23         ! Size is 16 or greater, and may have trailing bytes
  24
  25         .balign 32
  26 .Lcase1:
  27         ! Read a long word and write a long word at once
  28         ! At the start of each iteration, r7 contains last long load
  29         add     #-1,r5          !  79 EX  r0+r5 is now long aligned
  30         mov     r4,r2           !   5 MT (0 cycles latency)
  31
  32         mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
  33         add     #-4,r5          !  50 EX  r0+r5 = src long below the one in r7
  34
  35         add     #7,r2           !  79 EX  r2 = dst + 7, limit of the long loop
  36         !
  37 #ifdef CONFIG_CPU_LITTLE_ENDIAN
  38         ! 6 cycles, 4 bytes per iteration
  39 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
  40         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
  41
  42         cmp/hi  r2,r0           !  57 MT  T = r0 > dst + 7, used by bt/s below
  43         shll16  r3              ! 103 EX
  44
  45         mov     r1,r6           !   5 MT (latency=0)
  46         shll8   r3              ! 102 EX                ! Oxxx
  47
  48         shlr8   r6              ! 106 EX                ! xNML
  49         mov     r1, r7          !   5 MT (latency=0)  keep load for next pass
  50
  51         or      r6,r3           !  82 EX                ! ONML
  52         bt/s    3b              ! 109 BR
  53
  54          mov.l  r3,@-r0         !  30 LS  delay slot: store the merged long
  55 #else
  56 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
  57         mov     r7,r3           !   5 MT (latency=0)    ! OPQR
  58
  59         cmp/hi  r2,r0           !  57 MT  T = r0 > dst + 7, used by bt/s below
  60         shlr16  r3              ! 107 EX
  61
  62         shlr8   r3              ! 106 EX                ! xxxO
  63         mov     r1,r6           !   5 MT (latency=0)
  64
  65         shll8   r6              ! 102 EX                ! LMNx
  66         mov     r1,r7           !   5 MT (latency=0)  keep load for next pass
  67
  68         or      r6,r3           !  82 EX                ! LMNO
  69         bt/s    3b              ! 109 BR
  70
  71          mov.l  r3,@-r0         !  30 LS  delay slot: store the merged long
  72 #endif
  73         ! Finally, copy a byte at once, if necessary
  74
  75         add     #4,r5           !  50 EX  r5 = (src - dst) - 1 for byte copies
  76         cmp/eq  r4,r0           !  54 MT  T = nothing left to copy
  77
  78         add     #-6,r2          !  50 EX  r2 = dst + 1, limit of the byte loop
  79         bt      9f              ! 109 BR
  80
  81 8:      cmp/hi  r2,r0           !  57 MT  T = more bytes remain after this one
  82         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
  83
  84         bt/s    8b              ! 109 BR
  85
  86          mov.b  r1,@-r0         !  29 LS  delay slot: store the byte
  87
  88 9:      rts
  89          nop
  90
  91
  92         ! .Lcase3: bulk copy for (src - dst) & 3 == 3, working from the end down
  93         !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
  94         ! In: r0 = long aligned dst end, r0+r5 = matching src, r4 = dst start
  95
  96         ! Size is 16 or greater, and may have trailing bytes
  97
  98         .balign 32
  99 .Lcase3:
 100         ! Read a long word and write a long word at once
 101         ! At the start of each iteration, r7 contains last long load
 102         add     #-3,r5          ! 79 EX  r0+r5 is now long aligned
 103         mov     r4,r2           !  5 MT (0 cycles latency)
 104
 105         mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
 106         add     #-4,r5          ! 50 EX  r0+r5 = src long below the one in r7
 107
 108         add     #7,r2           !  79 EX  r2 = dst + 7, limit of the long loop
 109         !
 110 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 111         ! 6 cycles, 4 bytes per iteration
 112 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
 113         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
 114
 115         cmp/hi  r2,r0           !  57 MT  T = r0 > dst + 7, used by bt/s below
 116         shll8   r3              ! 102 EX                ! QPOx
 117
 118         mov     r1,r6           !   5 MT (latency=0)
 119         shlr16  r6              ! 107 EX
 120
 121         shlr8   r6              ! 106 EX                ! xxxN
 122         mov     r1, r7          !   5 MT (latency=0)  keep load for next pass
 123
 124         or      r6,r3           !  82 EX                ! QPON
 125         bt/s    3b              ! 109 BR
 126
 127          mov.l  r3,@-r0         !  30 LS  delay slot: store the merged long
 128 #else
 129 3:      mov     r7,r3           ! OPQR
 130         shlr8   r3              ! xOPQ
 131         mov.l   @(r0,r5),r7     ! KLMN
 132         mov     r7,r6           ! r7 itself keeps the load for the next pass
 133         shll16  r6
 134         shll8   r6              ! Nxxx
 135         or      r6,r3           ! NOPQ
 136         cmp/hi  r2,r0           ! T = r0 > dst + 7, used by bt/s below
 137         bt/s    3b
 138          mov.l  r3,@-r0         ! delay slot: store the merged long
 139 #endif
 140
 141         ! Finally, copy a byte at once, if necessary
 142
 143         add     #6,r5           !  50 EX  r5 = (src - dst) - 1 for byte copies
 144         cmp/eq  r4,r0           !  54 MT  T = nothing left to copy
 145
 146         add     #-6,r2          !  50 EX  r2 = dst + 1, limit of the byte loop
 147         bt      9f              ! 109 BR
 148
 149 8:      cmp/hi  r2,r0           !  57 MT  T = more bytes remain after this one
 150         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 151
 152         bt/s    8b              ! 109 BR
 153
 154          mov.b  r1,@-r0         !  29 LS  delay slot: store the byte
 155
 156 9:      rts
 157          nop
 158
 159 ENTRY(memcpy)
 160         ! In: r4 = dst, r5 = src, r6 = len.  A zero length returns r0 = dst.
 161         ! Calculate the invariants which will be used in the remainder
 162         ! of the code:
 163         !
 164         !      r4   -->  [ ...  ] DST             [ ...  ] SRC
 165         !                [ ...  ]                 [ ...  ]
 166         !                  :                        :
 167         !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
 168         ! r0 starts at dst + len and every store pre-decrements it.  r5 holds
 169         ! src - dst (plus a small per-path bias).  The copy runs end to start.
 170
 171         ! Short circuit the common case of src, dst and len being 32 bit aligned
 172         ! and test for zero length move
 173
 174         mov     r6, r0          !   5 MT (0 cycle latency)
 175         or      r4, r0          !  82 EX
 176
 177         or      r5, r0          !  82 EX  r0 = len | dst | src
 178         tst     r6, r6          !  86 MT  T = len is zero
 179
 180         bt/s    99f             ! 111 BR                (zero len)
 181          tst    #3, r0          !  87 MT  T = low two bits clear in all three
 182
 183         mov     r4, r0          !   5 MT (0 cycle latency)
 184         add     r6, r0          !  49 EX  r0 = dst + len
 185
 186         mov     #16, r1         !   6 EX
 187         bt/s    .Lcase00        ! 111 BR                (aligned)
 188
 189          sub    r4, r5          !  75 EX  r5 = src - dst
 190
 191         ! Arguments are not nicely long word aligned or zero len.
 192         ! Check for small copies, and if so do a simple byte at a time copy.
 193         !
 194         ! Deciding on an exact value of 'small' is not easy, as the point at which
 195         ! using the optimised routines becomes worthwhile varies (these are the
 196         ! cycle counts for different sizes using byte-at-a-time vs. optimised):
 197         !       size    byte-at-time    long    word    byte
 198         !       16      42              39-40   46-50   50-55
 199         !       24      58              43-44   54-58   62-67
 200         !       36      82              49-50   66-70   80-85
 201         ! However the penalty for getting it 'wrong' is much higher for long word
 202         ! aligned data (and this is more common), so use a value of 16.
 203
 204         cmp/gt  r6,r1           !  56 MT  T = 16 > len (signed compare)
 205
 206         add     #-1,r5          !  50 EX  r0+r5 = last source byte
 207         bf/s    6f              ! 108 BR                (not small)
 208
 209          mov    r5, r3          !   5 MT (latency=0)
 210         shlr    r6              ! 104 EX  r6 = len / 2, T = len was odd
 211
 212         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 213         bf/s    4f              ! 111 BR  even length: go copy byte pairs
 214
 215          add    #-1,r3          !  50 EX  r0+r3 = byte below r0+r5
 216         tst     r6, r6          !  86 MT  T = len was exactly 1
 217
 218         bt/s    98f             ! 110 BR
 219          mov.b  r1,@-r0         !  29 LS  delay slot: store the odd byte
 220
 221         ! 4 cycles, 2 bytes per iteration
 222 3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 223
 224 4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
 225         dt      r6              !  67 EX  one pair fewer, T = none left
 226
 227         mov.b   r1,@-r0         !  29 LS
 228         bf/s    3b              ! 111 BR
 229
 230          mov.b  r2,@-r0         !  29 LS
 231 98:
 232         rts
 233          nop
 234
 235 99:     rts
 236          mov    r4, r0          ! delay slot: return value = dst
 237
 238         ! Size is not small, so it is worthwhile looking for optimisations.
 239         ! First align destination to a long word boundary.
 240         !
 241         ! r5 = normal value -1
 242
 243 6:      tst     #3, r0          !  87 MT  T = dst end already long aligned
 244         mov     #3, r3          !   6 EX
 245
 246         bt/s    2f              ! 111 BR
 247          and    r0,r3           !  78 EX  r3 = bytes needed to align r0
 248
 249         ! 3 cycles, 1 byte per iteration
 250 1:      dt      r3              !  67 EX
 251         mov.b   @(r0,r5),r1     !  19 LS (latency=2)
 252
 253         add     #-1, r6         !  79 EX  keep r6 = bytes still to copy
 254         bf/s    1b              ! 109 BR
 255
 256          mov.b  r1,@-r0         !  28 LS
 257
 258 2:      add     #1, r5          !  79 EX  back to r5 = src - dst
 259
 260         ! Now select the appropriate bulk transfer code based on relative
 261         ! alignment of src and dst.
 262
 263         mov     r0, r3          !   5 MT (latency=0)  park r0, tst #imm uses r0
 264
 265         mov     r5, r0          !   5 MT (latency=0)
 266         tst     #1, r0          !  87 MT  T = bit 0 of (src - dst) is clear
 267
 268         bf/s    1f              ! 111 BR
 269          mov    #64, r7         !   6 EX
 270
 271         ! bit 0 clear
 272
 273         cmp/ge  r7, r6          !  55 MT  T = remaining len >= 64
 274
 275         bt/s    2f              ! 111 BR
 276          tst    #2, r0          !  87 MT  T = bit 1 of (src - dst) is clear
 277
 278         ! small
 279         bt/s    .Lcase0
 280          mov    r3, r0          ! delay slot, both outcomes: r0 = dst pointer
 281
 282         bra     .Lcase2
 283          nop
 284
 285         ! big
 286 2:      bt/s    .Lcase0b
 287          mov    r3, r0          ! delay slot, both outcomes: r0 = dst pointer
 288
 289         bra     .Lcase2b
 290          nop
 291
 292         ! bit 0 set
 293 1:      tst     #2, r0          ! 87 MT  T = bit 1 of (src - dst) is clear
 294
 295         bt/s    .Lcase1
 296          mov    r3, r0          ! delay slot, both outcomes: r0 = dst pointer
 297
 298         bra     .Lcase3
 299          nop
 300
 301
 302         ! .Lcase00: reached straight from the memcpy entry test
 303         !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
 304         ! In: r0 = dst + len, r5 = src - dst, r6 = len, r4 = dst
 305
 306         ! src, dst and size are all long word aligned
 307         ! size is non-zero
 308
 309         .balign 32
 310 .Lcase00:
 311         mov     #64, r1         !   6 EX
 312         mov     r5, r3          !   5 MT (latency=0)
 313
 314         cmp/gt  r6, r1          !  56 MT  T = 64 > len
 315         add     #-4, r5         !  50 EX  r0+r5 = source long for dst r0 - 4
 316
 317         bf      .Lcase00b       ! 108 BR                (big loop)
 318         shlr2   r6              ! 105 EX  r6 = number of long words
 319
 320         shlr    r6              ! 104 EX  r6 = number of pairs, T = odd count
 321         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 322
 323         bf/s    4f              ! 111 BR  even count: straight to pair loop
 324          add    #-8, r3         !  50 EX  r0+r3 = source long for dst r0 - 8
 325
 326         tst     r6, r6          !  86 MT  T = only a single long to copy
 327         bt/s    5f              ! 110 BR
 328
 329          mov.l  r1,@-r0         !  30 LS  delay slot: store the odd long
 330
 331         ! 4 cycles, 2 long words per iteration
 332 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 333
 334 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 335         dt      r6              !  67 EX  one pair fewer, T = none left
 336
 337         mov.l   r1, @-r0        !  30 LS
 338         bf/s    3b              ! 109 BR
 339
 340          mov.l  r2, @-r0        !  30 LS
 341
 342 5:      rts
 343          nop
 344
 345
 346         ! Size is 16 or greater and less than 64, but may have trailing bytes
 347         ! .Lcase0: (src - dst) & 3 == 0 and r0 is long aligned on entry
 348         .balign 32
 349 .Lcase0:
 350         add     #-4, r5         !  50 EX  r0+r5 = source long for dst r0 - 4
 351         mov     r4, r7          !   5 MT (latency=0)
 352
 353         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 354         mov     #4, r2          !   6 EX
 355
 356         add     #11, r7         !  50 EX  r7 = dst + 11, limit of the pair loop
 357         tst     r2, r6          !  86 MT  T = bit 2 of remaining len is clear
 358
 359         mov     r5, r3          !   5 MT (latency=0)
 360         bt/s    4f              ! 111 BR
 361
 362          add    #-4, r3         !  50 EX  r0+r3 = source long for dst r0 - 8
 363         mov.l   r1,@-r0         !  30 LS  bit 2 set: one long first, then pairs
 364
 365         ! 4 cycles, 2 long words per iteration
 366 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 367
 368 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 369         cmp/hi  r7, r0          ! T = r0 > dst + 11, used by bt/s below
 370
 371         mov.l   r1, @-r0        !  30 LS
 372         bt/s    3b              ! 109 BR
 373
 374          mov.l  r2, @-r0        !  30 LS
 375
 376         ! Copy the final 0-3 bytes
 377
 378         add     #3,r5           !  50 EX  r5 = (src - dst) - 1 for byte copies
 379
 380         cmp/eq  r0, r4          !  54 MT  T = nothing left to copy
 381         add     #-10, r7        !  50 EX  r7 = dst + 1, limit of the byte loop
 382
 383         bt      9f              ! 110 BR
 384
 385         ! 3 cycles, 1 byte per iteration
 386 1:      mov.b   @(r0,r5),r1     !  19 LS
 387         cmp/hi  r7,r0           !  57 MT  T = more bytes remain after this one
 388
 389         bt/s    1b              ! 111 BR
 390          mov.b  r1,@-r0         !  28 LS
 391
 392 9:      rts
 393          nop
 394
 395         ! Size is at least 64 bytes, so will be going round the big loop at least once.
 396         !
 397         !   r2 = rounded up r4
 398         !   r3 = rounded down r0
 399         ! (both to a multiple of 32, using the ~0x1f mask below)
 400         .balign 32
 401 .Lcase0b:
 402         add     #-4, r5         !  50 EX  .Lcase00 did this before jumping in
 403
 404 .Lcase00b:
 405         mov     r0, r3          !   5 MT (latency=0)
 406         mov     #(~0x1f), r1    !   6 EX
 407
 408         and     r1, r3          !  78 EX
 409         mov     r4, r2          !   5 MT (latency=0)
 410
 411         cmp/eq  r3, r0          !  54 MT  T = r0 already a multiple of 32
 412         add     #0x1f, r2       !  50 EX
 413
 414         bt/s    1f              ! 110 BR
 415          and    r1, r2          !  78 EX
 416
 417         ! copy initial words until cache line aligned
 418
 419         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 420         tst     #4, r0          !  87 MT  T = bit 2 of r0 is clear
 421
 422         mov     r5, r6          !   5 MT (latency=0)
 423         add     #-4, r6         !  50 EX  r0+r6 = source long for dst r0 - 8
 424
 425         bt/s    4f              ! 111 BR
 426          add    #8, r3          !  50 EX  pair loop exits on r0 == r3 pre-store
 427
 428         tst     #0x18, r0       !  87 MT  T = bits 3 and 4 of r0 are clear
 429
 430         bt/s    1f              ! 109 BR  that one long reaches the 32 boundary
 431          mov.l  r1,@-r0         !  30 LS  delay slot: store the single long
 432
 433         ! 4 cycles, 2 long words per iteration
 434 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 435
 436 4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
 437         cmp/eq  r3, r0          !  54 MT
 438
 439         mov.l   r1, @-r0        !  30 LS
 440         bf/s    3b              ! 109 BR
 441
 442          mov.l  r7, @-r0        !  30 LS
 443
 444         ! Copy the cache line aligned blocks
 445         !
 446         ! In use: r0, r2, r4, r5
 447         ! Scratch: r1, r3, r6, r7
 448         !
 449         ! We could do this with the four scratch registers, but if src
 450         ! and dest hit the same cache line, this will thrash, so make
 451         ! use of additional registers.
 452         !
 453         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
 454         !   r5:  src (was r0+r5)
 455         !   r1:  dest (was r0)
 456         ! this can be reversed at the end, so we don't need to save any extra
 457         ! state.
 458         !
 459 1:      mov.l   r8, @-r15       !  30 LS  push r8-r11, popped after the loop
 460         add     r0, r5          !  49 EX
 461
 462         mov.l   r9, @-r15       !  30 LS
 463         mov     r0, r1          !   5 MT (latency=0)
 464
 465         mov.l   r10, @-r15      !  30 LS
 466         add     #-0x1c, r5      !  50 EX  r5 = source of the 32 bytes below r1
 467
 468         mov.l   r11, @-r15      !  30 LS
 469
 470         ! 16 cycles, 32 bytes per iteration
 471 2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
 472         add     #-0x20, r1      ! 50 EX
 473         mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
 474         mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
 475         mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
 476         mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
 477         mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
 478         mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
 479         mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
 480         movca.l r0,@r1          ! 40 LS (latency=3-7)
 481         mov.l   r3,@(0x04,r1)   ! 33 LS
 482         mov.l   r6,@(0x08,r1)   ! 33 LS
 483         mov.l   r7,@(0x0c,r1)   ! 33 LS
 484
 485         mov.l   r8,@(0x10,r1)   ! 33 LS
 486         add     #-0x20, r5      ! 50 EX
 487
 488         mov.l   r9,@(0x14,r1)   ! 33 LS
 489         cmp/eq  r2,r1           ! 54 MT  T = reached the rounded up dst start
 490
 491         mov.l   r10,@(0x18,r1)  !  33 LS
 492         bf/s    2b              ! 109 BR
 493
 494          mov.l  r11,@(0x1c,r1)  !  33 LS
 495
 496         mov     r1, r0          !   5 MT (latency=0)
 497
 498         mov.l   @r15+, r11      !  15 LS
 499         sub     r1, r5          !  75 EX  r5 is an offset from r0 again
 500
 501         mov.l   @r15+, r10      !  15 LS
 502         cmp/eq  r4, r0          !  54 MT  T = whole buffer copied
 503
 504         bf/s    1f              ! 109 BR
 505          mov.l   @r15+, r9      !  15 LS
 506
 507         rts
 508 1:       mov.l  @r15+, r8       !  15 LS  rts delay slot and also bf/s target
 509         sub     r4, r1          !  75 EX                (len remaining)
 510
 511         ! number of trailing bytes is non-zero
 512         !
 513         ! invariants restored (r5 already decremented by 4)
 514         ! also r1=num bytes remaining
 515
 516         mov     #4, r2          !   6 EX
 517         mov     r4, r7          !   5 MT (latency=0)
 518
 519         add     #0x1c, r5       !  50 EX                (back to -4)
 520         cmp/hs  r2, r1          !  58 MT  T = at least 4 bytes remain
 521
 522         bf/s    5f              ! 108 BR
 523          add     #11, r7        !  50 EX  r7 = dst + 11, limit of the pair loop
 524
 525         mov.l   @(r0, r5), r6   !  21 LS (latency=2)
 526         tst     r2, r1          !  86 MT  T = bit 2 of remaining len is clear
 527
 528         mov     r5, r3          !   5 MT (latency=0)
 529         bt/s    4f              ! 111 BR
 530
 531          add    #-4, r3         !  50 EX  r0+r3 = source long for dst r0 - 8
 532         cmp/hs  r2, r1          !  58 MT  NOTE(review): r1, r2 unchanged since
 533         ! the cmp/hs above, so T looks always set on this path - confirm intent
 534         bt/s    5f              ! 111 BR  (byte loop at 5: then copies the rest)
 535          mov.l  r6,@-r0         !  30 LS  delay slot: store one long
 536
 537         ! 4 cycles, 2 long words per iteration
 538 3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
 539
 540 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 541         cmp/hi  r7, r0          ! T = r0 > dst + 11, used by bt/s below
 542
 543         mov.l   r6, @-r0        !  30 LS
 544         bt/s    3b              ! 109 BR
 545
 546          mov.l  r2, @-r0        !  30 LS
 547
 548         ! Copy the final 0-3 bytes
 549
 550 5:      cmp/eq  r0, r4          !  54 MT  T = nothing left to copy
 551         add     #-10, r7        !  50 EX  r7 = dst + 1, limit of the byte loop
 552
 553         bt      9f              ! 110 BR
 554         add     #3,r5           !  50 EX  r5 = (src - dst) - 1 for byte copies
 555
 556         ! 3 cycles, 1 byte per iteration
 557 1:      mov.b   @(r0,r5),r1     !  19 LS
 558         cmp/hi  r7,r0           !  57 MT  T = more bytes remain after this one
 559
 560         bt/s    1b              ! 111 BR
 561          mov.b  r1,@-r0         !  28 LS
 562
 563 9:      rts
 564          nop
 565
 566         ! .Lcase2: bulk copy for (src - dst) & 3 == 2, using 16 bit accesses
 567         !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
 568         ! In: r0 = long aligned dst end, r0+r5 = matching src, r4 = dst start
 569
 570         .balign 32
 571 .Lcase2:
 572         ! Size is 16 or greater and less than 64, but may have trailing bytes
 573
 574 2:      mov     r5, r6          !   5 MT (latency=0)
 575         add     #-2,r5          !  50 EX  r0+r5 = source word for dst r0 - 2
 576
 577         mov     r4,r2           !   5 MT (latency=0)
 578         add     #-4,r6          !  50 EX  r0+r6 = source word for dst r0 - 4
 579
 580         add     #7,r2           !  50 EX  r2 = dst + 7, limit of this loop
 581 3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
 582
 583         mov.w   @(r0,r6),r3     !  20 LS (latency=2)
 584         cmp/hi  r2,r0           !  57 MT  T = r0 > dst + 7, used by bt/s below
 585
 586         mov.w   r1,@-r0         !  29 LS
 587         bt/s    3b              ! 111 BR
 588
 589          mov.w  r3,@-r0         !  29 LS
 590
 591         bra     10f             ! finish in the word and byte tail at 10:
 592          nop
 593
 594
 595         .balign 32
 596 .Lcase2b:
 597         ! Size is at least 64 bytes, so will be going round the big loop at least once.
 598         ! Reached for (src - dst) & 3 == 2 with r0 long aligned.
 599         !   r2 = rounded up r4
 600         !   r3 = rounded down r0
 601
 602         mov     r0, r3          !   5 MT (latency=0)
 603         mov     #(~0x1f), r1    !   6 EX
 604
 605         and     r1, r3          !  78 EX
 606         mov     r4, r2          !   5 MT (latency=0)
 607
 608         cmp/eq  r3, r0          !  54 MT  T = r0 already a multiple of 32
 609         add     #0x1f, r2       !  50 EX
 610
 611         add     #-2, r5         !  50 EX  r0+r5 = source word for dst r0 - 2
 612         bt/s    1f              ! 110 BR
 613          and    r1, r2          !  78 EX
 614
 615         ! Copy a short word one at a time until we are cache line aligned
 616         !   Normal values: r0, r2, r3, r4
 617         !   Unused: r1, r6, r7
 618         !   Mod: r5 (=r5-2)
 619         !
 620         add     #2, r3          !  50 EX  loop exits on r0 == r3 pre-store
 621
 622 2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
 623         cmp/eq  r3,r0           !  54 MT
 624
 625         bf/s    2b              ! 111 BR
 626
 627          mov.w  r1,@-r0         !  29 LS
 628
 629         ! Copy the cache line aligned blocks
 630         !
 631         ! In use: r0, r2, r4, r5 (=r5-2)
 632         ! Scratch: r1, r3, r6, r7
 633         !
 634         ! We could do this with the four scratch registers, but if src
 635         ! and dest hit the same cache line, this will thrash, so make
 636         ! use of additional registers.
 637         !
 638         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
 639         !   r5:  src (was r0+r5)
 640         !   r1:  dest (was r0)
 641         ! this can be reversed at the end, so we don't need to save any extra
 642         ! state.
 643         !
 644 1:      mov.l   r8, @-r15       !  30 LS  push r8-r12, popped after the loop
 645         add     r0, r5          !  49 EX
 646
 647         mov.l   r9, @-r15       !  30 LS
 648         mov     r0, r1          !   5 MT (latency=0)
 649
 650         mov.l   r10, @-r15      !  30 LS
 651         add     #-0x1e, r5      !  50 EX  r5 = source of the 32 bytes below r1
 652
 653         mov.l   r11, @-r15      !  30 LS
 654
 655         mov.l   r12, @-r15      !  30 LS
 656
 657         ! 17 cycles, 32 bytes per iteration
 658 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 659 2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
 660         add     #-0x20, r1      !  50 EX
 661
 662         mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
 663
 664         mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
 665         shll16  r0              ! 103 EX                        JI..
 666
 667         mov.l   @r5+, r7        !  15 LS (latency=2)
 668         xtrct   r3, r0          !  48 EX                        LKJI
 669
 670         mov.l   @r5+, r8        !  15 LS (latency=2)
 671         xtrct   r6, r3          !  48 EX                        PONM
 672
 673         mov.l   @r5+, r9        !  15 LS (latency=2)
 674         xtrct   r7, r6          !  48 EX
 675
 676         mov.l   @r5+, r10       !  15 LS (latency=2)
 677         xtrct   r8, r7          !  48 EX
 678
 679         mov.l   @r5+, r11       !  15 LS (latency=2)
 680         xtrct   r9, r8          !  48 EX
 681
 682         mov.w   @r5+, r12       !  15 LS (latency=2)
 683         xtrct   r10, r9         !  48 EX
 684
 685         movca.l r0,@r1          !  40 LS (latency=3-7)
 686         xtrct   r11, r10        !  48 EX
 687
 688         mov.l   r3, @(0x04,r1)  !  33 LS
 689         xtrct   r12, r11        !  48 EX
 690
 691         mov.l   r6, @(0x08,r1)  !  33 LS
 692
 693         mov.l   r7, @(0x0c,r1)  !  33 LS
 694
 695         mov.l   r8, @(0x10,r1)  !  33 LS
 696         add     #-0x40, r5      !  50 EX  loads advanced r5 by 0x20, net -0x20
 697
 698         mov.l   r9, @(0x14,r1)  !  33 LS
 699         cmp/eq  r2,r1           !  54 MT  T = reached the rounded up dst start
 700
 701         mov.l   r10, @(0x18,r1) !  33 LS
 702         bf/s    2b              ! 109 BR
 703
 704          mov.l  r11, @(0x1c,r1) !  33 LS
 705 #else
 706 2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
 707         add     #-2, r5         !  50 EX
 708
 709         mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
 710         add     #-4, r1         !  50 EX
 711
 712         mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
 713         shll16  r0              ! 103 EX
 714
 715         mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
 716         xtrct   r3, r0          !  48 EX
 717
 718         mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
 719         xtrct   r6, r3          !  48 EX
 720
 721         mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
 722         xtrct   r7, r6          !  48 EX
 723
 724         mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
 725         xtrct   r8, r7          !  48 EX
 726
 727         mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
 728         xtrct   r9, r8          !  48 EX
 729
 730         mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
 731         xtrct   r10, r9         !  48 EX
 732
 733         movca.l r0,@r1          !  40 LS (latency=3-7)
 734         add     #-0x1c, r1      !  50 EX  with the -4 above: r1 net -0x20
 735
 736         mov.l   r3, @(0x18,r1)  !  33 LS
 737         xtrct   r11, r10        !  48 EX
 738
 739         mov.l   r6, @(0x14,r1)  !  33 LS
 740         xtrct   r12, r11        !  48 EX
 741
 742         mov.l   r7, @(0x10,r1)  !  33 LS
 743
 744         mov.l   r8, @(0x0c,r1)  !  33 LS
 745         add     #-0x1e, r5      !  50 EX  with the -2 above: r5 net -0x20
 746
 747         mov.l   r9, @(0x08,r1)  !  33 LS
 748         cmp/eq  r2,r1           !  54 MT  T = reached the rounded up dst start
 749
 750         mov.l   r10, @(0x04,r1) !  33 LS
 751         bf/s    2b              ! 109 BR
 752
 753          mov.l  r11, @(0x00,r1) !  33 LS
 754 #endif
 755
 756         mov.l   @r15+, r12      ! pops mirror the pushes before the loop
 757         mov     r1, r0          !   5 MT (latency=0)
 758
 759         mov.l   @r15+, r11      !  15 LS
 760         sub     r1, r5          !  75 EX  r5 is an offset from r0 again
 761
 762         mov.l   @r15+, r10      !  15 LS
 763         cmp/eq  r4, r0          !  54 MT  T = whole buffer copied
 764
 765         bf/s    1f              ! 109 BR
 766          mov.l   @r15+, r9      !  15 LS
 767
 768         rts
 769 1:       mov.l  @r15+, r8       !  15 LS  rts delay slot and also bf/s target
 770
 771         add     #0x1e, r5       !  50 EX  r5 = (src - dst) - 2 again
 772
 773         ! Finish off a short word at a time
 774         ! r5 must be invariant - 2
 775 10:     mov     r4,r2           !   5 MT (latency=0)
 776         add     #1,r2           !  50 EX
 777
 778         cmp/hi  r2, r0          !  57 MT  T = at least 2 bytes remain
 779         bf/s    1f              ! 109 BR
 780
 781          add    #2, r2          !  50 EX  r2 = dst + 3, limit of the word loop
 782
 783 3:      mov.w   @(r0,r5),r1     !  20 LS
 784         cmp/hi  r2,r0           !  57 MT
 785
 786         bt/s    3b              ! 109 BR
 787
 788          mov.w  r1,@-r0         !  29 LS
 789 1:
 790
 791         !
 792         ! Finally, copy the last byte if necessary
 793         cmp/eq  r4,r0           !  54 MT  T = nothing left to copy
 794         bt/s    9b              ! return through the rts at the earlier 9:
 795          add    #1,r5           ! delay slot: r5 = (src - dst) - 1
 796         mov.b   @(r0,r5),r1
 797         rts
 798          mov.b  r1,@-r0         ! delay slot: store the final byte
 799