/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/

/*
 * Copyright (c) 1996-2002 Eduardo Horvath
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "strmacros.h"
#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
#endif	/* LIBC_SCCS and not lint */
/*
 * Assumes regions do not overlap.
 * Must not use %g7 (see copyin/copyout above).
 */
ENTRY(memcpy)	/* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
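	/*
	 * In C terms (just the argument shuffle, no new logic):
	 *	memcpy(dst, src, n) == bcopy(src, dst, n)
	 * so this entry point swaps %o0 and %o1 before the common code.
	 */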
#if !defined(_KERNEL) || defined(_RUMPKERNEL)
ENTRY(bcopy)	/* src, dest, size */
#endif
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	btst	0x80, %o4		! PDB_COPY
	save	%sp, -CC64FSZ, %sp
2:	.asciz	"memcpy(%p<-%p,%x)\n"
	bge,pt	CCCR, 2f		! if >= this many, go be fancy.
	 mov	%o1, %o5		! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
	deccc	%o2			! while (--len >= 0)
	ldsb	[%o0 - 1], %o4		! (++dst)[-1] = *src++;
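	/*
	 * The slow path as a whole, in C (a consolidation of the
	 * inline comments above):
	 *	while (--len >= 0)
	 *		*dst++ = *src++;
	 */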
	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
#ifdef USE_BLOCK_STORE_LOAD
	! If it is big enough, use VIS instructions
#endif	/* USE_BLOCK_STORE_LOAD */
	!! First align the output to an 8-byte entity
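	!! In outline (a sketch; the real code folds these tests into
	!! branch-delay slots):
	!!	if (dst & 1) { copy 1 byte;  len -= 1; }
	!!	if (dst & 2) { copy a short; len -= 2; }
	!!	if (dst & 4) { copy a word;  len -= 4; }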
	save	%sp, -CC64FSZ, %sp
	ldub	[%l0], %l4		! Load 1st byte
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	stb	%l4, [%l1]		! Store 1st byte
	inc	1, %l1			! Update address
	lduh	[%l0], %l4		! Load short
	ldub	[%l0], %l4		! Load bytes
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	sth	%l4, [%l1]		! Store 1st short
	lduw	[%l0], %l4		! Load word -1
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	st	%l4, [%l1]		! Store word
	!! We are now 32-bit aligned in the dest.
	and	%l0, 7, %l4		! Shift amount
	andn	%l0, 7, %l0		! Source addr
	brz,pt	%l4, Lmemcpy_noshift8	! No shift version...
	sllx	%l4, 3, %l4		! In bits
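	!! Equivalently (sketch):
	!!	shift = (src & 7) * 8;	/* bits of word -1 to skip */
	!!	src &= ~7UL;		/* back up to an aligned load */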
	ldx	[%l0], %o0		! Load word -1
	sub	%l3, %l4, %l3		! Reverse shift
	deccc	12*8, %l2		! Have enough room?
	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 */
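	/*
	 * Each aligned output word is assembled as (a sketch, shift != 0):
	 *	out = (w0 << shift) | (w1 >> (64 - shift));
	 * where w1 becomes the next iteration's w0, so every 8 bytes
	 * cost one load, two shifts, and an or.
	 */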
!	ldx	[%l0+0*8], %o0		! Already done
!	sllx	%o0, %l4, %o0		! Already done
	deccc	6*8, %l2		! Have enough room?
	sllx	%o0, %l4, %o0		! Next loop
Lmemcpy_unrolled8_cleanup:
	!! Finished 8-byte block, unload the regs.
	mov	%o5, %o0		! Save our unused data
	bz,pn	%icc, Lmemcpy_complete
!	ldx	[%l0], %o0		! Already done
!	sllx	%o0, %l4, %o0		! Shift high word
	deccc	8, %l2			! Pre-decrement
	bl,pn	CCCR, Lmemcpy_finish
	ldx	[%l0+8], %o1		! Load word 0
	or	%g6, %o0, %g6		! Combine
	stx	%g6, [%l1]		! Store result
	bz,pt	CCCR, Lmemcpy_complete
	!! Load up the last dregs into %o0 and shift it into place
	srlx	%l3, 3, %g6		! # bytes in %o0
	!! n-8 - (by - 8) -> n - by
	subcc	%l2, %g6, %g0		! # bytes we need
	ble,pt	%icc, Lmemcpy_finish
	ldx	[%l0+8], %o1		! Need another word
	ba,pt	%icc, Lmemcpy_finish
	or	%o0, %o1, %o0		! All loaded up.
	deccc	6*8, %l2		! Have enough room?
	bl,pn	%icc, 1f		! < 0 --> sub word
	bg,pt	%icc, 1b		! Exactly 0 --> done
	bz,pt	CCCR, Lmemcpy_complete
	brz,pn	%l2, 2f			! 100% complete?
	cmp	%l2, 8			! Exactly 8 bytes?
	btst	4, %l2			! Word store?
	srlx	%o0, 32, %g6		! Shift high word down
	mov	%o0, %g6		! Operate on the low bits
	sth	%g6, [%l1]		! Store short
	mov	%o0, %g6		! Operate on low bytes
	btst	1, %l2			! Byte aligned?
	stb	%g6, [%l1]		! Store last byte
	inc	1, %l1			! Update address
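	!! The tail, in outline (a sketch of the stores above):
	!!	if (len & 4) { store word;  dst += 4; }
	!!	if (len & 2) { store short; dst += 2; }
	!!	if (len & 1) store byte;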
	!! verify copy success.
0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"memcpy(%p, %p, %lx)\n"
	restore	%i1, %g0, %o0
#ifdef USE_BLOCK_STORE_LOAD
	/*
	 * Block copy.  Useful for >256 byte copies.
	 *
	 * Benchmarking has shown this always seems to be slower than
	 * the integer version, so this is disabled.  Maybe someone will
	 * figure out why sometime.
	 */
	sethi	%hi(block_disable), %o3
	ldx	[ %o3 + %lo(block_disable) ], %o3
	brnz,pn	%o3, Lmemcpy_fancy
	!! Make sure our trap table is installed
	set	_C_LABEL(trapbase), %o5
	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
/*
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fplwp, and
 * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
 * null, call savefpstate() with it to store our current fp state.
 *
 * Next, allocate an aligned fpstate on the stack.  We will properly
 * nest calls on a particular stack so this should not be a problem.
 *
 * Now we grab either curlwp (or if we're on the interrupt stack
 * lwp0).  We stash its existing fpstate in a local register and
 * put our new fpstate in curlwp->l_md.md_fpstate.  We point
 * fplwp at curlwp (or lwp0) and enable the FPU.
 *
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curlwp (or lwp0).
 *
 * On exiting this routine we undo the damage: restore the original
 * pointer to curlwp->l_md.md_fpstate, clear our fplwp, and disable
 * the FPU.
 */
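/*
 * As pseudocode (a sketch of the protocol just described; the helper
 * names follow the comment above, not necessarily the exact kernel API,
 * and alloca_aligned() is a hypothetical stand-in for the stack
 * allocation):
 *
 *	if (fplwp != NULL && fplwp->l_md.md_fpstate != NULL)
 *		savefpstate(fplwp->l_md.md_fpstate);	// evict last owner
 *	new = alloca_aligned(sizeof(struct fpstate64));
 *	old = curlwp->l_md.md_fpstate;
 *	curlwp->l_md.md_fpstate = new;
 *	fplwp = curlwp;					// we own the FPU now
 *	enable_fpu();
 *	... block copy through the %f registers ...
 *	curlwp->l_md.md_fpstate = old;			// undo the damage
 *	fplwp = NULL;
 *	disable_fpu();
 */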
/*
 * Register usage, Kernel only (after save):
 *
 * %l0		XXXX DEBUG old fpstate
 * %l1		fplwp (hi bits only)
 *
 * Register usage, Kernel and user:
 *
 * %g1		dest (retval for memcpy)
 * %o5		last safe fetchable address
 */
	mov	%i0, %o0		! Src addr.
	mov	%i1, %o1		! Store our dest ptr here.
	mov	%i2, %o2		! Len counter
	!! First align the output to a 64-bit entity
	mov	%o1, %g1		! memcpy retval
	add	%o0, %o2, %o5		! End of source block
	andn	%o0, 7, %o3		! Start of block
	andn	%o5, BLOCK_ALIGN, %o5	! Last safe addr.
	ldd	[%o3], %f2		! Load 1st word
	dec	8, %o3			! Move %o3 1 word back
	mov	-7, %o4			! Lowest src addr possible
	alignaddr %o0, %o4, %o4		! Base addr for load.
	be,pt	CCCR, 1f		! Already loaded?
	 fmovd	%f2, %f0		! No. Shift
	ldd	[%o3+8], %f2		! And load
	faligndata %f0, %f2, %f4	! Isolate 1st byte
	stda	%f4, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1			! Update address
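	!! (VIS background: alignaddr computes addr + off, records its low
	!! 3 bits in the GSR align field and returns the 8-byte-aligned
	!! address; faligndata concatenates its two source doubles and
	!! extracts 8 bytes starting at that offset.  The ASI_FL8_P /
	!! ASI_FL16_P partial-store ASIs store only the low byte / short
	!! of an FP register.)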
	mov	-6, %o4			! Calculate src - 6
	alignaddr %o0, %o4, %o4		! Calculate shift mask and dest.
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	faligndata %f0, %f2, %f4	! Move 1st short to low part of f8
	stda	%f4, [%o1] ASI_FL16_P	! Store 1st short
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	alignaddr %o0, %o4, %o4		! Calculate shift mask and dest.
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	faligndata %f0, %f2, %f4	! Move 1st word to low part of f8
	st	%f5, [%o1]		! Store word
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	!! We are now 32-bit aligned in the dest.
Lmemcpy_block_common:
	alignaddr %o0, %o4, %o4		! base - shift
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	add	%o3, 8, %o0		! now use %o0 for src
	!! Continue until our dest is block aligned
Lmemcpy_block_aligned8:
	brz	%o2, Lmemcpy_blockfinish
	btst	BLOCK_ALIGN, %o1	! Block aligned?
	faligndata %f0, %f2, %f4	! Generate result
	ble,pn	%icc, Lmemcpy_blockfinish	! Should never happen
	 std	%f4, [%o1]		! Store result
	ba,pt	%xcc, 1b		! Not yet.
	 ldd	[%o0], %f2		! Load next part
Lmemcpy_block_aligned64:
/*
 * 64-byte aligned -- ready for block operations.
 *
 * Here we have the destination block aligned, but the
 * source pointer may not be.  Sub-word alignment will
 * be handled by faligndata instructions.  But the source
 * can still be potentially aligned to 8 different words
 * in our 64-byte block, so we have 8 different copy routines.
 *
 * Once we figure out our source alignment, we branch
 * to the appropriate copy routine, which sets up the
 * alignment for faligndata and loads (sets) the values
 * into the source registers and does the copy loop.
 *
 * When we're down to less than 1 block to store, we
 * exit the copy loop and execute cleanup code.
 *
 * Block loads and stores are not properly interlocked.
 * Stores save one reg/cycle, so you can start overwriting
 * registers the cycle after the store is issued.
 *
 * Block loads require a block load to a different register
 * block or a membar #Sync before accessing the loaded
 * data.
 *
 * Since the faligndata instructions may be offset as far
 * as 7 registers into a block (if you are shifting source
 * 7 -> dest 0), you need 3 source register blocks for full
 * performance: one you are copying, one you are loading,
 * and one for interlocking.  Otherwise, we would need to
 * sprinkle the code with membar #Sync and lose the advantage
 * of running faligndata in parallel with block stores.  This
 * means we are fetching a full 128 bytes ahead of the stores.
 * We need to make sure the prefetch does not inadvertently
 * cross a page boundary and fault on data that we will never
 * need.
 */
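/*
 * The resulting software pipeline, in outline (a sketch; one block is
 * 64 bytes, i.e. 8 FP doubles):
 *
 *	for (;;) {
 *		block-load  block N+2 into the idle register bank;
 *		faligndata  block N+1 from two banks into %f32-%f46;
 *		if (less than a full block left) break;
 *		block-store %f32-%f46 to the destination;
 *	}
 *
 * rotating through the three source banks (%f0-%f14, %f16-%f30,
 * %f48-%f62), so the loads stay two blocks ahead of the stores and no
 * membar #Sync is needed inside the loop.
 */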
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3		! Isolate the offset
	ba,pt	%xcc, L101		! 0->1
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L103		! 0->3
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L105		! 0->5
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L107		! 0->7
	 nop	/* XXX spitfire bug */
	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
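	!! In effect (sketch of the dispatch):
	!!	slot = (src & BLOCK_ALIGN) >> 3;	/* 0..7 */
	!!	goto Lmemcpy_block_jmp[slot];		/* L100..L107 */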
	!!
	!! Source is block aligned.
	!!
	!! Just load a block and go.
	ldda	[%o0] ASI_BLK_P, %f0
	ldda	[%o0] ASI_BLK_P, %f16
	.align	32			! ICache align.
	faligndata %f62, %f0, %f32
	faligndata %f0, %f2, %f34
	faligndata %f2, %f4, %f36
	faligndata %f4, %f6, %f38
	faligndata %f6, %f8, %f40
	faligndata %f8, %f10, %f42
	faligndata %f10, %f12, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f12, %f14, %f46
	ldda	[%o0] ASI_BLK_P, %f48
	stda	%f32, [%o1] ASI_STORE
	faligndata %f14, %f16, %f32
	faligndata %f16, %f18, %f34
	faligndata %f18, %f20, %f36
	faligndata %f20, %f22, %f38
	faligndata %f22, %f24, %f40
	faligndata %f24, %f26, %f42
	faligndata %f26, %f28, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f28, %f30, %f46
	ldda	[%o0] ASI_BLK_P, %f0
	stda	%f32, [%o1] ASI_STORE
	faligndata %f30, %f48, %f32
	faligndata %f48, %f50, %f34
	faligndata %f50, %f52, %f36
	faligndata %f52, %f54, %f38
	faligndata %f54, %f56, %f40
	faligndata %f56, %f58, %f42
	faligndata %f58, %f60, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f60, %f62, %f46
	ldda	[%o0] ASI_BLK_P, %f16	! Increment is at top
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+8
	!!
	!! We need to load almost 1 complete block by hand.
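	!! (All eight entry points follow one pattern, as a sketch: with
	!! the source k doubles into a block, hand-load the remaining
	!! 8 - k doubles, then run the same three-bank pipeline with the
	!! faligndata chain offset by k registers.)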
!	fmovd	%f0, %f0		! Hoist fmovd
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f0, %f2, %f32
	faligndata %f2, %f4, %f34
	faligndata %f4, %f6, %f36
	faligndata %f6, %f8, %f38
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	ldda	[%o0] ASI_BLK_P, %f48
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f14, %f16, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f16, %f18, %f32
	faligndata %f18, %f20, %f34
	faligndata %f20, %f22, %f36
	faligndata %f22, %f24, %f38
	faligndata %f24, %f26, %f40
	faligndata %f26, %f28, %f42
	faligndata %f28, %f30, %f44
	ldda	[%o0] ASI_BLK_P, %f0
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f30, %f48, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f48, %f50, %f32
	faligndata %f50, %f52, %f34
	faligndata %f52, %f54, %f36
	faligndata %f54, %f56, %f38
	faligndata %f56, %f58, %f40
	faligndata %f58, %f60, %f42
	faligndata %f60, %f62, %f44
	ldda	[%o0] ASI_BLK_P, %f16
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f62, %f0, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+16
	!!
	!! We need to load 6 doubles by hand.
	fmovd	%f0, %f2		! Hoist fmovd
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f2, %f4, %f32
	faligndata %f4, %f6, %f34
	faligndata %f6, %f8, %f36
	faligndata %f8, %f10, %f38
	faligndata %f10, %f12, %f40
	faligndata %f12, %f14, %f42
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f16, %f18, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f18, %f20, %f32
	faligndata %f20, %f22, %f34
	faligndata %f22, %f24, %f36
	faligndata %f24, %f26, %f38
	faligndata %f26, %f28, %f40
	faligndata %f28, %f30, %f42
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f48, %f50, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f50, %f52, %f32
	faligndata %f52, %f54, %f34
	faligndata %f54, %f56, %f36
	faligndata %f56, %f58, %f38
	faligndata %f58, %f60, %f40
	faligndata %f60, %f62, %f42
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f0, %f2, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+24
	!!
	!! We need to load 5 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f4, %f6, %f32
	faligndata %f6, %f8, %f34
	faligndata %f8, %f10, %f36
	faligndata %f10, %f12, %f38
	faligndata %f12, %f14, %f40
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f42
	faligndata %f16, %f18, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f18, %f20, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f20, %f22, %f32
	faligndata %f22, %f24, %f34
	faligndata %f24, %f26, %f36
	faligndata %f26, %f28, %f38
	faligndata %f28, %f30, %f40
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f42
	faligndata %f48, %f50, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f50, %f52, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f52, %f54, %f32
	faligndata %f54, %f56, %f34
	faligndata %f56, %f58, %f36
	faligndata %f58, %f60, %f38
	faligndata %f60, %f62, %f40
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f42
	faligndata %f0, %f2, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f2, %f4, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+32
	!!
	!! We need to load 4 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f6, %f8, %f32
	faligndata %f8, %f10, %f34
	faligndata %f10, %f12, %f36
	faligndata %f12, %f14, %f38
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f40
	faligndata %f16, %f18, %f42
	faligndata %f18, %f20, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f20, %f22, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f22, %f24, %f32
	faligndata %f24, %f26, %f34
	faligndata %f26, %f28, %f36
	faligndata %f28, %f30, %f38
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f40
	faligndata %f48, %f50, %f42
	faligndata %f50, %f52, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f52, %f54, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f54, %f56, %f32
	faligndata %f56, %f58, %f34
	faligndata %f58, %f60, %f36
	faligndata %f60, %f62, %f38
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f40
	faligndata %f0, %f2, %f42
	faligndata %f2, %f4, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f4, %f6, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+40
	!!
	!! We need to load 3 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f8, %f10, %f32
	faligndata %f10, %f12, %f34
	faligndata %f12, %f14, %f36
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f38
	faligndata %f16, %f18, %f40
	faligndata %f18, %f20, %f42
	faligndata %f20, %f22, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f22, %f24, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f24, %f26, %f32
	faligndata %f26, %f28, %f34
	faligndata %f28, %f30, %f36
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f38
	faligndata %f48, %f50, %f40
	faligndata %f50, %f52, %f42
	faligndata %f52, %f54, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f54, %f56, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f56, %f58, %f32
	faligndata %f58, %f60, %f34
	faligndata %f60, %f62, %f36
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f38
	faligndata %f0, %f2, %f40
	faligndata %f2, %f4, %f42
	faligndata %f4, %f6, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f6, %f8, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+48
	!!
	!! We need to load 2 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f10, %f12, %f32
	faligndata %f12, %f14, %f34
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f36
	faligndata %f16, %f18, %f38
	faligndata %f18, %f20, %f40
	faligndata %f20, %f22, %f42
	faligndata %f22, %f24, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f24, %f26, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f26, %f28, %f32
	faligndata %f28, %f30, %f34
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f36
	faligndata %f48, %f50, %f38
	faligndata %f50, %f52, %f40
	faligndata %f52, %f54, %f42
	faligndata %f54, %f56, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f56, %f58, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f58, %f60, %f32
	faligndata %f60, %f62, %f34
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f36
	faligndata %f0, %f2, %f38
	faligndata %f2, %f4, %f40
	faligndata %f4, %f6, %f42
	faligndata %f6, %f8, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f8, %f10, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+56
	!!
	!! We need to load 1 double by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f12, %f14, %f32
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f34
	faligndata %f16, %f18, %f36
	faligndata %f18, %f20, %f38
	faligndata %f20, %f22, %f40
	faligndata %f22, %f24, %f42
	faligndata %f24, %f26, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f26, %f28, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f28, %f30, %f32
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f34
	faligndata %f48, %f50, %f36
	faligndata %f50, %f52, %f38
	faligndata %f52, %f54, %f40
	faligndata %f54, %f56, %f42
	faligndata %f56, %f58, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f58, %f60, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f60, %f62, %f32
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f34
	faligndata %f0, %f2, %f36
	faligndata %f2, %f4, %f38
	faligndata %f4, %f6, %f40
	faligndata %f6, %f8, %f42
	faligndata %f8, %f10, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f10, %f12, %f46
	stda	%f32, [%o1] ASI_STORE
Lmemcpy_blockdone:
	inc	BLOCK_SIZE, %o2		! Fixup our overcommit
	membar	#Sync			! Finish any pending loads
#define	FINISH_REG(f)						\
	bl,a	Lmemcpy_blockfinish;				\
	!!
	!! The low 3 bits give the sub-word bytes still to be
	!! stored [because (x-8)&0x7 == x].
	!!
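	!! In other words (sketch): the loop decremented len past zero,
	!! so add BLOCK_SIZE back, then walk %f32..%f46 with FINISH_REG
	!! to find the double holding the last bytes still to be stored.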
Lmemcpy_blockfinish:
	brz,pn	%o2, 2f			! 100% complete?
	cmp	%o2, 8			! Exactly 8 bytes?
	btst	4, %o2			! Word store?
	alignaddr %o1, %o4, %g0
	faligndata %f0, %f4, %f8
	stda	%f8, [%o1] ASI_FL16_P	! Store short
	btst	1, %o2			! Byte aligned?
	mov	-7, %o0			! Calculate dest - 7
	alignaddr %o1, %o0, %g0		! Calculate shift mask and dest.
	faligndata %f0, %f4, %f8	! Move 1st byte to low part of f8
	stda	%f8, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1			! Update address
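	!! Tail layout, in outline (mirrors the integer Lmemcpy_finish,
	!! but through the FP registers):
	!!	if (len & 4) store a word;
	!!	if (len & 2) store a short via ASI_FL16_P;
	!!	if (len & 1) store a byte  via ASI_FL8_P;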
	!! verify copy success.
	set	block_disable, %o0
0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	/*
	 * We've saved our possible fpstate, now disable the fpu
	 * and continue with life.
	 */
	restore	%g1, 0, %o0		! Return DEST for memcpy
/*
 * Use block_disable to turn off block insns for
 * memcpy/memset.
 */
	.globl	block_disable
block_disable:	.xword	1
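	!! Nonzero here makes the brnz check at the head of the block-copy
	!! path branch to Lmemcpy_fancy, so the VIS code stays disabled by
	!! default (see the benchmarking note above).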
#endif	/* USE_BLOCK_STORE_LOAD */