/* $NetBSD: memcpy.S,v 1.1 2001/07/07 04:55:21 eeh Exp $ */
 * Copyright (c) 2001 Eduardo E. Horvath
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <machine/asm.h>
#include <machine/ctlreg.h>
#include <machine/frame.h>
#include <machine/psl.h>
#if defined(LIBC_SCCS) && !defined(lint)
	RCSID("$NetBSD: memcpy.S,v 1.1 2001/07/07 04:55:21 eeh Exp $")
#endif /* LIBC_SCCS and not lint */
#define	NOTREACHED	ta 1
#define	BCOPY_SMALL	16
#define	ASI_STORE	ASI_BLK_COMMIT_P
#define	ASI_STORE	ASI_BLK_P
 * Assumes regions do not overlap; has no useful return value.
 * Must not use %g7 (see copyin/copyout above).
ENTRY(memcpy) /* dest, src, size */
	 * Swap args for bcopy. Gcc generates calls to memcpy for
	 * structure assignments.
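	/*
	 * Illustrative only: a minimal C sketch of the argument swap above,
	 * assuming a bcopy() with the traditional (src, dst, len) order.
	 * The helper name is hypothetical, not part of this file.
	 *
	 *	#include <stddef.h>
	 *
	 *	void bcopy(const void *src, void *dst, size_t len);
	 *
	 *	void *
	 *	memcpy_via_bcopy(void *dst, const void *src, size_t len)
	 *	{
	 *		bcopy(src, dst, len);	// memcpy takes (dst, src, len)
	 *		return (dst);		// memcpy returns the destination
	 *	}
	 */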
ENTRY(bcopy) /* src, dest, size */
	btst 0x80, %o4 ! PDB_COPY
	save %sp, -CC64FSZ, %sp
2:	.asciz "bcopy(%p->%p,%x)\n"
	 * Check for overlaps and punt.
	 * If src <= dest <= src+len we have a problem.
	blu,pn %xcc, Lovbcopy
	bge,pt %xcc, 2f ! if >= this many, go be fancy.
	mov %o1, %o5 ! Save memcpy return value
	 * Not much to copy, just do it a byte at a time.
	deccc %o2 ! while (--len >= 0)
	ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++;
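	/*
	 * Illustrative only: roughly what the short-copy path above does in
	 * C once the length is known to be below BCOPY_SMALL.  Hypothetical
	 * helper, not part of this file.
	 *
	 *	static void
	 *	small_copy(const char *src, char *dst, long len)
	 *	{
	 *		while (--len >= 0)	// deccc %o2 / bge,pt 1b
	 *			*dst++ = *src++;
	 *	}
	 */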
	 * Overlapping bcopies -- punt.
	 * Since src comes before dst, and the regions might overlap,
	 * we have to do the copy starting at the end and working backwards.
	 * We could optimize this, but it almost never happens.
	mov %o1, %o5 ! Retval
	add %o2, %o0, %o0 ! src += len
	add %o2, %o1, %o1 ! dst += len
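	/*
	 * Illustrative only: a minimal C sketch of this backward copy for the
	 * overlapping case, where src comes before dst as noted above.
	 * Hypothetical helper, not part of this file.
	 *
	 *	static void
	 *	overlap_copy(const char *src, char *dst, long len)
	 *	{
	 *		src += len;		// add %o2, %o0, %o0
	 *		dst += len;		// add %o2, %o1, %o1
	 *		while (--len >= 0)	// copy from the end, backwards
	 *			*--dst = *--src;
	 *	}
	 */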
	 * Plenty of data to copy, so try to do it optimally.
	! If it is big enough, use VIS instructions
	!! First align the output to an 8-byte entity
	save %sp, -CC64FSZ, %sp
	ldub [%o0], %o4 ! Load 1st byte
	ble,pn %xcc, Lbcopy_finish ! XXXX
	stb %o4, [%o1] ! Store 1st byte
	inc 1, %o1 ! Update address
	lduh [%o0], %o4 ! Load short
	ldub [%o0], %o4 ! Load bytes
	ble,pn %xcc, Lbcopy_finish ! XXXX
	sth %o4, [%o1] ! Store 1st short
	lduw [%o0], %o4 ! Load word -1
	ble,pn %xcc, Lbcopy_finish ! XXXX
	st %o4, [%o1] ! Store word
	!! We are now 32-bit aligned in the dest.
	and %o0, 7, %o4 ! Shift amount
	andn %o0, 7, %o0 ! Source addr
	brz,pt %o4, Lbcopy_noshift8 ! No shift version...
	sllx %o4, 3, %o4 ! In bits
	ldx [%o0], %l0 ! Load word -1
	sub %o3, %o4, %o3 ! Reverse shift
	deccc 16*8, %o2 ! Have enough room?
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
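	/*
	 * Illustrative only: the shifted doubleword copy sketched in C for a
	 * big-endian machine such as SPARC.  "shift" is (src & 7) * 8 and is
	 * assumed nonzero (the zero case branches to Lbcopy_noshift8), the
	 * source is read from its aligned base, and each output word is built
	 * from two adjacent input words -- the three dependent operations
	 * mentioned above (two shifts and an or).  Hypothetical helper, not
	 * part of this file.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void
	 *	shifted_copy8(const uint64_t *asrc, uint64_t *dst, long nwords,
	 *	    unsigned shift)
	 *	{
	 *		uint64_t prev = *asrc++;	// "Load word -1"
	 *
	 *		while (nwords-- > 0) {
	 *			uint64_t next = *asrc++;
	 *			*dst++ = (prev << shift) | (next >> (64 - shift));
	 *			prev = next;
	 *		}
	 *	}
	 */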
!	ldx [%o0+0*8], %l0 ! Already done
!	sllx %l0, %o4, %l0 ! Already done
	deccc 8*8, %o2 ! Have enough room?
	sllx %l0, %o4, %l0 ! Next loop
Lbcopy_unrolled8_cleanup:
	!! Finished 8 byte block, unload the regs.
	mov %l7, %l0 ! Save our unused data
	 * This version also handles aligned copies at almost the
	 * same speed. It should take the same number of cycles
	 * as the previous version, but is slightly slower, probably
	sllx %l0, %o4, %l0 ! Next loop
	deccc 8*8, %o2 ! Have enough room?
	!! Now unload all those regs
Lbcopy_unrolled8_cleanup:
	inc 7*8, %o0 ! Point at the last load
	mov %l7, %l0 ! Shuffle to %l0
	inc 7*8, %o1 ! Point at last store
	bz,pn %icc, Lbcopy_complete
!	ldx [%o0], %l0 ! Already done
!	sllx %l0, %o4, %l0 ! Shift high word
	deccc 8, %o2 ! Pre-decrement
	bl,pn %xcc, Lbcopy_finish
	ldx [%o0+8], %l1 ! Load word 0
	or %o5, %l0, %o5 ! Combine
	stx %o5, [%o1] ! Store result
	bz,pt %xcc, Lbcopy_complete
	!! Loadup the last dregs into %l0 and shift it into place
	srlx %o3, 3, %o5 ! # bytes in %l0
	!! n-8 - (by - 8) -> n - by
	subcc %o2, %o5, %g0 ! # bytes we need
	ble,pt %icc, Lbcopy_finish
	ldx [%o0+8], %l1 ! Need another word
	ba,pt %icc, Lbcopy_finish
	or %l0, %l1, %l0 ! All loaded up.
	deccc 8*8, %o2 ! Have enough room?
	bl,pn %icc, 1f ! < 0 --> sub word
	bg,pt %icc, 1b ! Exactly 0 --> done
	bz,pt %xcc, Lbcopy_complete
	brz,pn %o2, 2f ! 100% complete?
	cmp %o2, 8 ! Exactly 8 bytes?
	btst 4, %o2 ! Word store?
	srlx %l0, 32, %o5 ! Shift high word down
	mov %l0, %o5 ! Operate on the low bits
	sth %o5, [%o1] ! Store short
	mov %l0, %o5 ! Operate on low bytes
	btst 1, %o2 ! Byte aligned?
	stb %o5, [%o1] ! Store last byte
	inc 1, %o1 ! Update address
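	/*
	 * Illustrative only: the tail handling above in C.  The 4, 2 and 1
	 * bits of the remaining count select a word, halfword and byte store;
	 * this sketch keeps the not-yet-stored bytes left-justified in the
	 * 64-bit register (the assembly does the equivalent juggling in
	 * %l0/%o5), and assumes the big-endian layout and the destination
	 * alignment established earlier.  Hypothetical helper.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void
	 *	store_tail(uint64_t data, char *dst, int len)
	 *	{
	 *		if (len & 4) {				// Word store?
	 *			*(uint32_t *)dst = (uint32_t)(data >> 32);
	 *			dst += 4; data <<= 32;
	 *		}
	 *		if (len & 2) {				// Short store?
	 *			*(uint16_t *)dst = (uint16_t)(data >> 48);
	 *			dst += 2; data <<= 16;
	 *		}
	 *		if (len & 1)				// Last byte?
	 *			*dst = (char)(data >> 56);
	 *	}
	 */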
	!! verify copy success.
0:	.asciz "bcopy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz "bcopy(%p, %p, %lx)\n"
	restore %i1, %g0, %o0
 * Block copy. Useful for >256 byte copies.
 * Benchmarking has shown this always seems to be slower than
 * the integer version, so this is disabled. Maybe someone will
 * figure out why sometime.
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fpproc, and
 * fpproc->p_md.md_fpstate is the current fpstate. If that's not
 * null, call savefpstate() with it to store our current fp state.
 * Next, allocate an aligned fpstate on the stack. We will properly
 * nest calls on a particular stack so this should not be a problem.
 * Now we grab either curproc (or if we're on the interrupt stack
 * proc0). We stash its existing fpstate in a local register and
 * put our new fpstate in curproc->p_md.md_fpstate. We point
 * fpproc at curproc (or proc0) and enable the FPU.
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate. Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curproc (or proc0).
 * On exiting this routine we undo the damage: restore the original
 * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
 * Register usage, Kernel only (after save):
 *	%l0	XXXX DEBUG old fpstate
 *	%l1	fpproc (hi bits only)
 * Register usage, Kernel and user:
 *	%g1	dest (retval for memcpy)
 *	%o5	last safe fetchable address
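/*
 * Illustrative only: a rough C rendering of the FPU borrowing protocol the
 * comment above describes, using the names it mentions (fpproc, curproc,
 * proc0, p_md.md_fpstate, savefpstate()).  The structure layouts and the
 * helper are hypothetical stand-ins, not the kernel's real interface, and
 * locking/interrupt details are omitted.
 *
 *	struct fpstate { char fs_regs[512]; };		// shape is made up
 *	struct mdproc { struct fpstate *md_fpstate; };
 *	struct proc { struct mdproc p_md; };
 *
 *	extern struct proc *fpproc, *curproc, proc0;
 *	void savefpstate(struct fpstate *);
 *
 *	static struct fpstate *
 *	borrow_fpu(struct fpstate *mine, struct proc **owner)
 *	{
 *		if (fpproc != NULL && fpproc->p_md.md_fpstate != NULL)
 *			savefpstate(fpproc->p_md.md_fpstate);	// save last owner
 *		*owner = (curproc != NULL) ? curproc : &proc0;
 *		struct fpstate *old = (*owner)->p_md.md_fpstate;
 *		(*owner)->p_md.md_fpstate = mine;	// our block-aligned fpstate
 *		fpproc = *owner;			// we own the FPU now
 *		// enable the FPU (wr %g0, FPRS_FEF, %fprs in the assembly)
 *		return (old);		// restored on the way out of the routine
 *	}
 */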
	save %sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp ! Allocate an fpstate
	sethi %hi(FPPROC), %l1
	LDPTR [%l1 + %lo(FPPROC)], %l2 ! Load fpproc
	add %sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
	brz,pt %l2, 1f ! fpproc == NULL?
	andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
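	/*
	 * Illustrative only: the round-up-and-mask idiom used above to
	 * block-align the on-stack fpstate, in C.  BLOCK_ALIGN is assumed to
	 * be BLOCK_SIZE - 1, with BLOCK_SIZE a power of two.
	 *
	 *	#include <stdint.h>
	 *
	 *	static uintptr_t
	 *	block_align(uintptr_t p, uintptr_t block_size)
	 *	{
	 *		// add block_size - 1, then clear the low bits (andn)
	 *		return (p + block_size - 1) & ~(block_size - 1);
	 *	}
	 */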
	LDPTR [%l2 + P_FPSTATE], %l3
	brz,pn %l3, 1f ! Make sure we have an fpstate
	call _C_LABEL(savefpstate) ! Save the old fpstate
	set EINTSTACK-STKB, %l4 ! Are we on intr stack?
	set INTSTACK-STKB, %l4
	sethi %hi(_C_LABEL(proc0)), %l4 ! Yes, use proc0
	ba,pt %xcc, 2f ! XXXX needs to change to CPUs idle proc
	or %l4, %lo(_C_LABEL(proc0)), %l5
	sethi %hi(CURPROC), %l4 ! Use curproc
	LDPTR [%l4 + %lo(CURPROC)], %l5
	brz,pn %l5, 0b ! If curproc is NULL need to use proc0
	LDPTR [%l5 + P_FPSTATE], %l6 ! Save old fpstate
	STPTR %l0, [%l5 + P_FPSTATE] ! Insert new fpstate
	STPTR %l5, [%l1 + %lo(FPPROC)] ! Set new fpproc
	wr %g0, FPRS_FEF, %fprs ! Enable FPU
	mov %i0, %o0 ! Src addr.
	mov %i1, %o1 ! Store our dest ptr here.
	mov %i2, %o2 ! Len counter
	!! First align the output to a 64-bit entity
	mov %o1, %g1 ! memcpy retval
	add %o0, %o2, %o5 ! End of source block
	andn %o0, 7, %o3 ! Start of block
	andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
	ldd [%o3], %f2 ! Load 1st word
	dec 8, %o3 ! Move %o3 1 word back
	mov -7, %o4 ! Lowest src addr possible
	alignaddr %o0, %o4, %o4 ! Base addr for load.
	be,pt %xcc, 1f ! Already loaded?
	fmovd %f2, %f0 ! No. Shift
	ldd [%o3+8], %f2 ! And load
	faligndata %f0, %f2, %f4 ! Isolate 1st byte
	stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
	inc 1, %o1 ! Update address
	mov -6, %o4 ! Calculate src - 6
	alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
	stda %f4, [%o1] ASI_FL16_P ! Store 1st short
	brz,pn %o2, Lbcopy_blockfinish ! XXXX
	alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
	st %f5, [%o1] ! Store word
	brz,pn %o2, Lbcopy_blockfinish ! XXXX
	!! We are now 32-bit aligned in the dest.
	alignaddr %o0, %o4, %o4 ! base - shift
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	add %o3, 8, %o0 ! now use %o0 for src
	!! Continue until our dest is block aligned
Lbcopy_block_aligned8:
	brz %o2, Lbcopy_blockfinish
	btst BLOCK_ALIGN, %o1 ! Block aligned?
	faligndata %f0, %f2, %f4 ! Generate result
	ble,pn %icc, Lbcopy_blockfinish ! Should never happen
	std %f4, [%o1] ! Store result
	ba,pt %xcc, 1b ! Not yet.
	ldd [%o0], %f2 ! Load next part
Lbcopy_block_aligned64:
	 * 64-byte aligned -- ready for block operations.
	 * Here we have the destination block aligned, but the
	 * source pointer may not be. Sub-word alignment will
	 * be handled by faligndata instructions. But the source
	 * can still be potentially aligned to 8 different words
	 * in our 64-byte block, so we have 8 different copy routines.
	 * Once we figure out our source alignment, we branch
	 * to the appropriate copy routine, which sets up the
	 * alignment for faligndata and loads (sets) the values
	 * into the source registers and does the copy loop.
	 * When we're down to less than 1 block to store, we
	 * exit the copy loop and execute cleanup code.
	 * Block loads and stores are not properly interlocked.
	 * Stores save one reg/cycle, so you can start overwriting
	 * registers the cycle after the store is issued.
	 * Block loads require a block load to a different register
	 * block or a membar #Sync before accessing the loaded
	 * Since the faligndata instructions may be offset as far
	 * as 7 registers into a block (if you are shifting source
	 * 7 -> dest 0), you need 3 source register blocks for full
	 * performance: one you are copying, one you are loading,
	 * and one for interlocking. Otherwise, we would need to
	 * sprinkle the code with membar #Sync and lose the advantage
	 * of running faligndata in parallel with block stores. This
	 * means we are fetching a full 128 bytes ahead of the stores.
	 * We need to make sure the prefetch does not inadvertently
	 * cross a page boundary and fault on data that we will never
	and %o0, BLOCK_ALIGN, %o3
	srax %o3, 3, %o3 ! Isolate the offset
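	/*
	 * Illustrative only: the offset isolation above in C, assuming
	 * BLOCK_ALIGN is 0x3f (64-byte blocks).  The result is which of the
	 * 8 doublewords of its block the source starts in -- the slot used
	 * to pick one of the 8 copy routines described above.
	 *
	 *	#include <stdint.h>
	 *
	 *	static int
	 *	src_slot(uintptr_t src)
	 *	{
	 *		return (int)((src & 0x3f) >> 3);	// 0 .. 7
	 *	}
	 */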
	ba,pt %xcc, L101 ! 0->1
	nop /* XXX spitfire bug */
	bz %xcc, L102 ! 0->2
	ba,pt %xcc, L103 ! 0->3
	nop /* XXX spitfire bug */
	bz %xcc, L104 ! 0->4
	ba,pt %xcc, L105 ! 0->5
	nop /* XXX spitfire bug */
	bz %xcc, L106 ! 0->6
	ba,pt %xcc, L107 ! 0->7
	nop /* XXX spitfire bug */
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	add %o3, (Lbcopy_block_jmp - 1b), %o3
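	/*
	 * Illustrative only: the same dispatch expressed as a C jump table.
	 * The slot computed above indexes a table of the 8 copy routines (one
	 * per source doubleword offset); the assembly adds the scaled slot to
	 * the table address and jumps through it.  Names are hypothetical.
	 *
	 *	typedef void (*copy_fn)(const void *src, void *dst, long len);
	 *
	 *	extern copy_fn block_copy_jmp[8];	// L100 .. L107
	 *
	 *	static void
	 *	dispatch(const void *src, void *dst, long len, int slot)
	 *	{
	 *		block_copy_jmp[slot](src, dst, len);
	 *	}
	 */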
	!! Source is block aligned.
	!! Just load a block and go.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f0
	ldda [%o0] ASI_BLK_P, %f16
	.align 32 ! ICache align.
	faligndata %f62, %f0, %f32
	faligndata %f0, %f2, %f34
	faligndata %f2, %f4, %f36
	faligndata %f4, %f6, %f38
	faligndata %f6, %f8, %f40
	faligndata %f8, %f10, %f42
	faligndata %f10, %f12, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f12, %f14, %f46
	ldda [%o0] ASI_BLK_P, %f48
	stda %f32, [%o1] ASI_STORE
	faligndata %f14, %f16, %f32
	faligndata %f16, %f18, %f34
	faligndata %f18, %f20, %f36
	faligndata %f20, %f22, %f38
	faligndata %f22, %f24, %f40
	faligndata %f24, %f26, %f42
	faligndata %f26, %f28, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f28, %f30, %f46
	ldda [%o0] ASI_BLK_P, %f0
	stda %f32, [%o1] ASI_STORE
	faligndata %f30, %f48, %f32
	faligndata %f48, %f50, %f34
	faligndata %f50, %f52, %f36
	faligndata %f52, %f54, %f38
	faligndata %f54, %f56, %f40
	faligndata %f56, %f58, %f42
	faligndata %f58, %f60, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f60, %f62, %f46
	ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
	stda %f32, [%o1] ASI_STORE
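	/*
	 * Illustrative only: the shape of the software-pipelined block loop
	 * above, in scalar C for a big-endian machine.  align_merge() stands
	 * in for faligndata, BLOCK_WORDS for the 8 doublewords per 64-byte
	 * block, and "off" for the source's byte offset within a doubleword.
	 * The real code keeps three blocks of FP registers in flight (one
	 * being stored, one being merged, one being loaded) so block loads,
	 * faligndata and block stores overlap; C cannot express that, so this
	 * only shows the data flow.  Note that src[i + 1] peeks one word past
	 * the current block, which is why the prefetch-past-a-page-boundary
	 * concern above exists.  Hypothetical names throughout.
	 *
	 *	#include <stdint.h>
	 *	#define BLOCK_WORDS	8
	 *
	 *	static uint64_t
	 *	align_merge(uint64_t hi, uint64_t lo, unsigned off)
	 *	{
	 *		if (off == 0)			// already aligned
	 *			return hi;
	 *		return (hi << (8 * off)) | (lo >> (64 - 8 * off));
	 *	}
	 *
	 *	static void
	 *	block_copy(const uint64_t *src, uint64_t *dst, long nblocks,
	 *	    unsigned off)
	 *	{
	 *		while (nblocks-- > 0) {
	 *			for (int i = 0; i < BLOCK_WORDS; i++)
	 *				dst[i] = align_merge(src[i], src[i + 1], off);
	 *			src += BLOCK_WORDS;
	 *			dst += BLOCK_WORDS;
	 *		}
	 *	}
	 */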
	!! Source at BLOCK_ALIGN+8
	!! We need to load almost 1 complete block by hand.
	or %g1, %lo(1f), %g1
!	fmovd %f0, %f0 ! Hoist fmovd
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f0, %f2, %f32
	faligndata %f2, %f4, %f34
	faligndata %f4, %f6, %f36
	faligndata %f6, %f8, %f38
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	ldda [%o0] ASI_BLK_P, %f48
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f14, %f16, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f16, %f18, %f32
	faligndata %f18, %f20, %f34
	faligndata %f20, %f22, %f36
	faligndata %f22, %f24, %f38
	faligndata %f24, %f26, %f40
	faligndata %f26, %f28, %f42
	faligndata %f28, %f30, %f44
	ldda [%o0] ASI_BLK_P, %f0
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f30, %f48, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f48, %f50, %f32
	faligndata %f50, %f52, %f34
	faligndata %f52, %f54, %f36
	faligndata %f54, %f56, %f38
	faligndata %f56, %f58, %f40
	faligndata %f58, %f60, %f42
	faligndata %f60, %f62, %f44
	ldda [%o0] ASI_BLK_P, %f16
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f62, %f0, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+16
	!! We need to load 6 doubles by hand.
	or %g1, %lo(1f), %g1
	fmovd %f0, %f2 ! Hoist fmovd
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f2, %f4, %f32
	faligndata %f4, %f6, %f34
	faligndata %f6, %f8, %f36
	faligndata %f8, %f10, %f38
	faligndata %f10, %f12, %f40
	faligndata %f12, %f14, %f42
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f16, %f18, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f18, %f20, %f32
	faligndata %f20, %f22, %f34
	faligndata %f22, %f24, %f36
	faligndata %f24, %f26, %f38
	faligndata %f26, %f28, %f40
	faligndata %f28, %f30, %f42
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f48, %f50, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f50, %f52, %f32
	faligndata %f52, %f54, %f34
	faligndata %f54, %f56, %f36
	faligndata %f56, %f58, %f38
	faligndata %f58, %f60, %f40
	faligndata %f60, %f62, %f42
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f0, %f2, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+24
	!! We need to load 5 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f4, %f6, %f32
	faligndata %f6, %f8, %f34
	faligndata %f8, %f10, %f36
	faligndata %f10, %f12, %f38
	faligndata %f12, %f14, %f40
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f42
	faligndata %f16, %f18, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f18, %f20, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f20, %f22, %f32
	faligndata %f22, %f24, %f34
	faligndata %f24, %f26, %f36
	faligndata %f26, %f28, %f38
	faligndata %f28, %f30, %f40
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f42
	faligndata %f48, %f50, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f50, %f52, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f52, %f54, %f32
	faligndata %f54, %f56, %f34
	faligndata %f56, %f58, %f36
	faligndata %f58, %f60, %f38
	faligndata %f60, %f62, %f40
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f42
	faligndata %f0, %f2, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f2, %f4, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+32
	!! We need to load 4 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f6, %f8, %f32
	faligndata %f8, %f10, %f34
	faligndata %f10, %f12, %f36
	faligndata %f12, %f14, %f38
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f40
	faligndata %f16, %f18, %f42
	faligndata %f18, %f20, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f20, %f22, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f22, %f24, %f32
	faligndata %f24, %f26, %f34
	faligndata %f26, %f28, %f36
	faligndata %f28, %f30, %f38
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f40
	faligndata %f48, %f50, %f42
	faligndata %f50, %f52, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f52, %f54, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f54, %f56, %f32
	faligndata %f56, %f58, %f34
	faligndata %f58, %f60, %f36
	faligndata %f60, %f62, %f38
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f40
	faligndata %f0, %f2, %f42
	faligndata %f2, %f4, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f4, %f6, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+40
	!! We need to load 3 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f8, %f10, %f32
	faligndata %f10, %f12, %f34
	faligndata %f12, %f14, %f36
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f38
	faligndata %f16, %f18, %f40
	faligndata %f18, %f20, %f42
	faligndata %f20, %f22, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f22, %f24, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f24, %f26, %f32
	faligndata %f26, %f28, %f34
	faligndata %f28, %f30, %f36
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f38
	faligndata %f48, %f50, %f40
	faligndata %f50, %f52, %f42
	faligndata %f52, %f54, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f54, %f56, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f56, %f58, %f32
	faligndata %f58, %f60, %f34
	faligndata %f60, %f62, %f36
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f38
	faligndata %f0, %f2, %f40
	faligndata %f2, %f4, %f42
	faligndata %f4, %f6, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f6, %f8, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+48
	!! We need to load 2 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f10, %f12, %f32
	faligndata %f12, %f14, %f34
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f36
	faligndata %f16, %f18, %f38
	faligndata %f18, %f20, %f40
	faligndata %f20, %f22, %f42
	faligndata %f22, %f24, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f24, %f26, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f26, %f28, %f32
	faligndata %f28, %f30, %f34
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f36
	faligndata %f48, %f50, %f38
	faligndata %f50, %f52, %f40
	faligndata %f52, %f54, %f42
	faligndata %f54, %f56, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f56, %f58, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f58, %f60, %f32
	faligndata %f60, %f62, %f34
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f36
	faligndata %f0, %f2, %f38
	faligndata %f2, %f4, %f40
	faligndata %f4, %f6, %f42
	faligndata %f6, %f8, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f8, %f10, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+56
	!! We need to load 1 double by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f12, %f14, %f32
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f34
	faligndata %f16, %f18, %f36
	faligndata %f18, %f20, %f38
	faligndata %f20, %f22, %f40
	faligndata %f22, %f24, %f42
	faligndata %f24, %f26, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f26, %f28, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f28, %f30, %f32
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f34
	faligndata %f48, %f50, %f36
	faligndata %f50, %f52, %f38
	faligndata %f52, %f54, %f40
	faligndata %f54, %f56, %f42
	faligndata %f56, %f58, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f58, %f60, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f60, %f62, %f32
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f34
	faligndata %f0, %f2, %f36
	faligndata %f2, %f4, %f38
	faligndata %f4, %f6, %f40
	faligndata %f6, %f8, %f42
	faligndata %f8, %f10, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f10, %f12, %f46
	stda %f32, [%o1] ASI_STORE
	inc BLOCK_SIZE, %o2 ! Fixup our overcommit
	membar #Sync ! Finish any pending loads
#define	FINISH_REG(f) \
	bl,a Lbcopy_blockfinish; \
	!! The low 3 bits have the sub-word bits needed to be
	!! stored [because (x-8)&0x7 == x].
	brz,pn %o2, 2f ! 100% complete?
	cmp %o2, 8 ! Exactly 8 bytes?
	btst 4, %o2 ! Word store?
	alignaddr %o1, %o4, %g0
	faligndata %f0, %f4, %f8
	stda %f8, [%o1] ASI_FL16_P ! Store short
	btst 1, %o2 ! Byte aligned?
	mov -7, %o0 ! Calculate dest - 7
	alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
	faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
	stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
	inc 1, %o1 ! Update address
	!! verify copy success.
	set block_disable, %o0
block_disable:	.xword 0
0:	.asciz "bcopy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz "bcopy(%p, %p, %lx)\r\n"
1:	.asciz "block exit (%p, %p, %d)\n"
	 * We've saved our possible fpstate, now disable the fpu
	 * and continue with life.
	LDPTR [%l1 + %lo(FPPROC)], %l7
!	tnz 1 ! fpproc has changed!
	LDPTR [%l5 + P_FPSTATE], %l7
	tnz 1 ! fpstate has changed!
	andcc %l2, %l3, %g0 ! If (fpproc && fpstate)
	STPTR %l2, [%l1 + %lo(FPPROC)] ! Restore old fpproc
	bz,pt %xcc, 1f ! Skip if no fpstate
	STPTR %l6, [%l5 + P_FPSTATE] ! Restore old fpstate
	call _C_LABEL(loadfpstate) ! Re-load orig fpstate
1:	.asciz "block done (%p, %p, %d)\n"
	restore %g1, 0, %o0 ! Return DEST for memcpy