/* $NetBSD: memcpy.S,v 1.1 2001/07/07 04:55:21 eeh Exp $ */
 * Copyright (c) 2001 Eduardo E. Horvath
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <machine/asm.h>
#include <machine/ctlreg.h>
#include <machine/frame.h>
#include <machine/psl.h>
#if defined(LIBC_SCCS) && !defined(lint)
	RCSID("$NetBSD: memcpy.S,v 1.1 2001/07/07 04:55:21 eeh Exp $")
#endif /* LIBC_SCCS and not lint */
#define	NOTREACHED	ta 1
#define	BCOPY_SMALL	16
#define	ASI_STORE	ASI_BLK_COMMIT_P
#define	ASI_STORE	ASI_BLK_P
 * Assumes regions do not overlap; has no useful return value.
 * Must not use %g7 (see copyin/copyout above).
ENTRY(memcpy) /* dest, src, size */
	 * Swap args for bcopy. Gcc generates calls to memcpy for
	 * structure assignments.
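	/*
	 * Illustrative only: a minimal C sketch of the argument swap above,
	 * assuming a bcopy() with the traditional (src, dst, len) order.
	 * The helper name is hypothetical, not part of this file.
	 *
	 *	#include <stddef.h>
	 *
	 *	void bcopy(const void *src, void *dst, size_t len);
	 *
	 *	void *
	 *	memcpy_via_bcopy(void *dst, const void *src, size_t len)
	 *	{
	 *		bcopy(src, dst, len);	// memcpy takes (dst, src, len)
	 *		return (dst);		// memcpy returns the destination
	 *	}
	 */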
ENTRY(bcopy) /* src, dest, size */
	btst 0x80, %o4 ! PDB_COPY
	save %sp, -CC64FSZ, %sp
2:	.asciz "bcopy(%p->%p,%x)\n"
	 * Check for overlaps and punt.
	 * If src <= dest <= src+len we have a problem.
	blu,pn %xcc, Lovbcopy
	bge,pt %xcc, 2f ! if >= this many, go be fancy.
	mov %o1, %o5 ! Save memcpy return value
	 * Not much to copy, just do it a byte at a time.
	deccc %o2 ! while (--len >= 0)
	ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++;
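	/*
	 * Illustrative only: roughly what the short-copy path above does in
	 * C once the length is known to be below BCOPY_SMALL.  Hypothetical
	 * helper, not part of this file.
	 *
	 *	static void
	 *	small_copy(const char *src, char *dst, long len)
	 *	{
	 *		while (--len >= 0)	// deccc %o2 / bge,pt 1b
	 *			*dst++ = *src++;
	 *	}
	 */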
	 * Overlapping bcopies -- punt.
	 * Since src comes before dst, and the regions might overlap,
	 * we have to do the copy starting at the end and working backwards.
	 * We could optimize this, but it almost never happens.
	mov %o1, %o5 ! Retval
	add %o2, %o0, %o0 ! src += len
	add %o2, %o1, %o1 ! dst += len
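	/*
	 * Illustrative only: a minimal C sketch of this backward copy for the
	 * overlapping case, where src comes before dst as noted above.
	 * Hypothetical helper, not part of this file.
	 *
	 *	static void
	 *	overlap_copy(const char *src, char *dst, long len)
	 *	{
	 *		src += len;		// add %o2, %o0, %o0
	 *		dst += len;		// add %o2, %o1, %o1
	 *		while (--len >= 0)	// copy from the end, backwards
	 *			*--dst = *--src;
	 *	}
	 */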
	 * Plenty of data to copy, so try to do it optimally.
	! If it is big enough, use VIS instructions
	!! First align the output to an 8-byte entity
	save %sp, -CC64FSZ, %sp
	ldub [%o0], %o4 ! Load 1st byte
	ble,pn %xcc, Lbcopy_finish ! XXXX
	stb %o4, [%o1] ! Store 1st byte
	inc 1, %o1 ! Update address
	lduh [%o0], %o4 ! Load short
	ldub [%o0], %o4 ! Load bytes
	ble,pn %xcc, Lbcopy_finish ! XXXX
	sth %o4, [%o1] ! Store 1st short
	lduw [%o0], %o4 ! Load word -1
	ble,pn %xcc, Lbcopy_finish ! XXXX
	st %o4, [%o1] ! Store word
	!! We are now 32-bit aligned in the dest.
	and %o0, 7, %o4 ! Shift amount
	andn %o0, 7, %o0 ! Source addr
	brz,pt %o4, Lbcopy_noshift8 ! No shift version...
	sllx %o4, 3, %o4 ! In bits
	ldx [%o0], %l0 ! Load word -1
	sub %o3, %o4, %o3 ! Reverse shift
	deccc 16*8, %o2 ! Have enough room?
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
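	/*
	 * Illustrative only: the shifted doubleword copy sketched in C for a
	 * big-endian machine such as SPARC.  "shift" is (src & 7) * 8 and is
	 * assumed nonzero (the zero case branches to Lbcopy_noshift8), the
	 * source is read from its aligned base, and each output word is built
	 * from two adjacent input words -- the three dependent operations
	 * mentioned above (two shifts and an or).  Hypothetical helper, not
	 * part of this file.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void
	 *	shifted_copy8(const uint64_t *asrc, uint64_t *dst, long nwords,
	 *	    unsigned shift)
	 *	{
	 *		uint64_t prev = *asrc++;	// "Load word -1"
	 *
	 *		while (nwords-- > 0) {
	 *			uint64_t next = *asrc++;
	 *			*dst++ = (prev << shift) | (next >> (64 - shift));
	 *			prev = next;
	 *		}
	 *	}
	 */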
!	ldx [%o0+0*8], %l0 ! Already done
!	sllx %l0, %o4, %l0 ! Already done
	deccc 8*8, %o2 ! Have enough room?
	sllx %l0, %o4, %l0 ! Next loop
Lbcopy_unrolled8_cleanup:
	!! Finished 8 byte block, unload the regs.
	mov %l7, %l0 ! Save our unused data
	 * This version also handles aligned copies at almost the
	 * same speed. It should take the same number of cycles
	 * as the previous version, but is slightly slower, probably
	sllx %l0, %o4, %l0 ! Next loop
	deccc 8*8, %o2 ! Have enough room?
	!! Now unload all those regs
Lbcopy_unrolled8_cleanup:
	inc 7*8, %o0 ! Point at the last load
	mov %l7, %l0 ! Shuffle to %l0
	inc 7*8, %o1 ! Point at last store
	bz,pn %icc, Lbcopy_complete
!	ldx [%o0], %l0 ! Already done
!	sllx %l0, %o4, %l0 ! Shift high word
	deccc 8, %o2 ! Pre-decrement
	bl,pn %xcc, Lbcopy_finish
	ldx [%o0+8], %l1 ! Load word 0
	or %o5, %l0, %o5 ! Combine
	stx %o5, [%o1] ! Store result
	bz,pt %xcc, Lbcopy_complete
	!! Loadup the last dregs into %l0 and shift it into place
	srlx %o3, 3, %o5 ! # bytes in %l0
	!! n-8 - (by - 8) -> n - by
	subcc %o2, %o5, %g0 ! # bytes we need
	ble,pt %icc, Lbcopy_finish
	ldx [%o0+8], %l1 ! Need another word
	ba,pt %icc, Lbcopy_finish
	or %l0, %l1, %l0 ! All loaded up.
	deccc 8*8, %o2 ! Have enough room?
	bl,pn %icc, 1f ! < 0 --> sub word
	bg,pt %icc, 1b ! Exactly 0 --> done
	bz,pt %xcc, Lbcopy_complete
	brz,pn %o2, 2f ! 100% complete?
	cmp %o2, 8 ! Exactly 8 bytes?
	btst 4, %o2 ! Word store?
	srlx %l0, 32, %o5 ! Shift high word down
	mov %l0, %o5 ! Operate on the low bits
	sth %o5, [%o1] ! Store short
	mov %l0, %o5 ! Operate on low bytes
	btst 1, %o2 ! Byte aligned?
	stb %o5, [%o1] ! Store last byte
	inc 1, %o1 ! Update address
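	/*
	 * Illustrative only: the tail handling above in C.  The 4, 2 and 1
	 * bits of the remaining count select a word, halfword and byte store;
	 * this sketch keeps the not-yet-stored bytes left-justified in the
	 * 64-bit register (the assembly does the equivalent juggling in
	 * %l0/%o5), and assumes the big-endian layout and the destination
	 * alignment established earlier.  Hypothetical helper.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void
	 *	store_tail(uint64_t data, char *dst, int len)
	 *	{
	 *		if (len & 4) {				// Word store?
	 *			*(uint32_t *)dst = (uint32_t)(data >> 32);
	 *			dst += 4; data <<= 32;
	 *		}
	 *		if (len & 2) {				// Short store?
	 *			*(uint16_t *)dst = (uint16_t)(data >> 48);
	 *			dst += 2; data <<= 16;
	 *		}
	 *		if (len & 1)				// Last byte?
	 *			*dst = (char)(data >> 56);
	 *	}
	 */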
	!! verify copy success.
0:	.asciz "bcopy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz "bcopy(%p, %p, %lx)\n"
	restore %i1, %g0, %o0
 * Block copy. Useful for >256 byte copies.
 * Benchmarking has shown this always seems to be slower than
 * the integer version, so this is disabled. Maybe someone will
 * figure out why sometime.
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fpproc, and
 * fpproc->p_md.md_fpstate is the current fpstate. If that's not
 * null, call savefpstate() with it to store our current fp state.
 * Next, allocate an aligned fpstate on the stack. We will properly
 * nest calls on a particular stack so this should not be a problem.
 * Now we grab either curproc (or if we're on the interrupt stack
 * proc0). We stash its existing fpstate in a local register and
 * put our new fpstate in curproc->p_md.md_fpstate. We point
 * fpproc at curproc (or proc0) and enable the FPU.
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate. Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curproc (or proc0).
 * On exiting this routine we undo the damage: restore the original
 * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
 * Register usage, Kernel only (after save):
 *	%l0	XXXX DEBUG old fpstate
 *	%l1	fpproc (hi bits only)
 * Register usage, Kernel and user:
 *	%g1	dest (retval for memcpy)
 *	%o5	last safe fetchable address
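/*
 * Illustrative only: a rough C rendering of the FPU borrowing protocol the
 * comment above describes, using the names it mentions (fpproc, curproc,
 * proc0, p_md.md_fpstate, savefpstate()).  The structure layouts and the
 * helper are hypothetical stand-ins, not the kernel's real interface, and
 * locking/interrupt details are omitted.
 *
 *	struct fpstate { char fs_regs[512]; };		// shape is made up
 *	struct mdproc { struct fpstate *md_fpstate; };
 *	struct proc { struct mdproc p_md; };
 *
 *	extern struct proc *fpproc, *curproc, proc0;
 *	void savefpstate(struct fpstate *);
 *
 *	static struct fpstate *
 *	borrow_fpu(struct fpstate *mine, struct proc **owner)
 *	{
 *		if (fpproc != NULL && fpproc->p_md.md_fpstate != NULL)
 *			savefpstate(fpproc->p_md.md_fpstate);	// save last owner
 *		*owner = (curproc != NULL) ? curproc : &proc0;
 *		struct fpstate *old = (*owner)->p_md.md_fpstate;
 *		(*owner)->p_md.md_fpstate = mine;	// our block-aligned fpstate
 *		fpproc = *owner;			// we own the FPU now
 *		// enable the FPU (wr %g0, FPRS_FEF, %fprs in the assembly)
 *		return (old);		// restored on the way out of the routine
 *	}
 */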
	save %sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp ! Allocate an fpstate
	sethi %hi(FPPROC), %l1
	LDPTR [%l1 + %lo(FPPROC)], %l2 ! Load fpproc
	add %sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
	brz,pt %l2, 1f ! fpproc == NULL?
	andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
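	/*
	 * Illustrative only: the round-up-and-mask idiom used above to
	 * block-align the on-stack fpstate, in C.  BLOCK_ALIGN is assumed to
	 * be BLOCK_SIZE - 1, with BLOCK_SIZE a power of two.
	 *
	 *	#include <stdint.h>
	 *
	 *	static uintptr_t
	 *	block_align(uintptr_t p, uintptr_t block_size)
	 *	{
	 *		// add block_size - 1, then clear the low bits (andn)
	 *		return (p + block_size - 1) & ~(block_size - 1);
	 *	}
	 */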
	LDPTR [%l2 + P_FPSTATE], %l3
	brz,pn %l3, 1f ! Make sure we have an fpstate
	call _C_LABEL(savefpstate) ! Save the old fpstate
	set EINTSTACK-STKB, %l4 ! Are we on intr stack?
	set INTSTACK-STKB, %l4
	sethi %hi(_C_LABEL(proc0)), %l4 ! Yes, use proc0
	ba,pt %xcc, 2f ! XXXX needs to change to CPUs idle proc
	or %l4, %lo(_C_LABEL(proc0)), %l5
	sethi %hi(CURPROC), %l4 ! Use curproc
	LDPTR [%l4 + %lo(CURPROC)], %l5
	brz,pn %l5, 0b ! If curproc is NULL need to use proc0
	LDPTR [%l5 + P_FPSTATE], %l6 ! Save old fpstate
	STPTR %l0, [%l5 + P_FPSTATE] ! Insert new fpstate
	STPTR %l5, [%l1 + %lo(FPPROC)] ! Set new fpproc
	wr %g0, FPRS_FEF, %fprs ! Enable FPU
	mov %i0, %o0 ! Src addr.
	mov %i1, %o1 ! Store our dest ptr here.
	mov %i2, %o2 ! Len counter
	!! First align the output to a 64-bit entity
	mov %o1, %g1 ! memcpy retval
	add %o0, %o2, %o5 ! End of source block
	andn %o0, 7, %o3 ! Start of block
	andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
	ldd [%o3], %f2 ! Load 1st word
	dec 8, %o3 ! Move %o3 1 word back
	mov -7, %o4 ! Lowest src addr possible
	alignaddr %o0, %o4, %o4 ! Base addr for load.
	be,pt %xcc, 1f ! Already loaded?
	fmovd %f2, %f0 ! No. Shift
	ldd [%o3+8], %f2 ! And load
	faligndata %f0, %f2, %f4 ! Isolate 1st byte
	stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
	inc 1, %o1 ! Update address
	mov -6, %o4 ! Calculate src - 6
	alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
	stda %f4, [%o1] ASI_FL16_P ! Store 1st short
	brz,pn %o2, Lbcopy_blockfinish ! XXXX
	alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
	st %f5, [%o1] ! Store word
	brz,pn %o2, Lbcopy_blockfinish ! XXXX
	!! We are now 32-bit aligned in the dest.
	alignaddr %o0, %o4, %o4 ! base - shift
	cmp %o3, %o4 ! Addresses same?
	fmovd %f2, %f0 ! Shuffle data
	ldd [%o3+8], %f2 ! Load word 0
	add %o3, 8, %o0 ! now use %o0 for src
	!! Continue until our dest is block aligned
Lbcopy_block_aligned8:
	brz %o2, Lbcopy_blockfinish
	btst BLOCK_ALIGN, %o1 ! Block aligned?
	faligndata %f0, %f2, %f4 ! Generate result
	ble,pn %icc, Lbcopy_blockfinish ! Should never happen
	std %f4, [%o1] ! Store result
	ba,pt %xcc, 1b ! Not yet.
	ldd [%o0], %f2 ! Load next part
Lbcopy_block_aligned64:
	 * 64-byte aligned -- ready for block operations.
	 * Here we have the destination block aligned, but the
	 * source pointer may not be. Sub-word alignment will
	 * be handled by faligndata instructions. But the source
	 * can still be potentially aligned to 8 different words
	 * in our 64-byte block, so we have 8 different copy routines.
	 * Once we figure out our source alignment, we branch
	 * to the appropriate copy routine, which sets up the
	 * alignment for faligndata and loads (sets) the values
	 * into the source registers and does the copy loop.
	 * When we're down to less than 1 block to store, we
	 * exit the copy loop and execute cleanup code.
	 * Block loads and stores are not properly interlocked.
	 * Stores save one reg/cycle, so you can start overwriting
	 * registers the cycle after the store is issued.
	 * Block loads require a block load to a different register
	 * block or a membar #Sync before accessing the loaded
	 * Since the faligndata instructions may be offset as far
	 * as 7 registers into a block (if you are shifting source
	 * 7 -> dest 0), you need 3 source register blocks for full
	 * performance: one you are copying, one you are loading,
	 * and one for interlocking. Otherwise, we would need to
	 * sprinkle the code with membar #Sync and lose the advantage
	 * of running faligndata in parallel with block stores. This
	 * means we are fetching a full 128 bytes ahead of the stores.
	 * We need to make sure the prefetch does not inadvertently
	 * cross a page boundary and fault on data that we will never
	and %o0, BLOCK_ALIGN, %o3
	srax %o3, 3, %o3 ! Isolate the offset
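	/*
	 * Illustrative only: the offset isolation above in C, assuming
	 * BLOCK_ALIGN is 0x3f (64-byte blocks).  The result is which of the
	 * 8 doublewords of its block the source starts in -- the slot used
	 * to pick one of the 8 copy routines described above.
	 *
	 *	#include <stdint.h>
	 *
	 *	static int
	 *	src_slot(uintptr_t src)
	 *	{
	 *		return (int)((src & 0x3f) >> 3);	// 0 .. 7
	 *	}
	 */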
	ba,pt %xcc, L101 ! 0->1
	nop /* XXX spitfire bug */
	bz %xcc, L102 ! 0->2
	ba,pt %xcc, L103 ! 0->3
	nop /* XXX spitfire bug */
	bz %xcc, L104 ! 0->4
	ba,pt %xcc, L105 ! 0->5
	nop /* XXX spitfire bug */
	bz %xcc, L106 ! 0->6
	ba,pt %xcc, L107 ! 0->7
	nop /* XXX spitfire bug */
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	add %o3, (Lbcopy_block_jmp - 1b), %o3
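	/*
	 * Illustrative only: the same dispatch expressed as a C jump table.
	 * The slot computed above indexes a table of the 8 copy routines (one
	 * per source doubleword offset); the assembly adds the scaled slot to
	 * the table address and jumps through it.  Names are hypothetical.
	 *
	 *	typedef void (*copy_fn)(const void *src, void *dst, long len);
	 *
	 *	extern copy_fn block_copy_jmp[8];	// L100 .. L107
	 *
	 *	static void
	 *	dispatch(const void *src, void *dst, long len, int slot)
	 *	{
	 *		block_copy_jmp[slot](src, dst, len);
	 *	}
	 */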
	!! Source is block aligned.
	!! Just load a block and go.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f0
	ldda [%o0] ASI_BLK_P, %f16
	.align 32 ! ICache align.
	faligndata %f62, %f0, %f32
	faligndata %f0, %f2, %f34
	faligndata %f2, %f4, %f36
	faligndata %f4, %f6, %f38
	faligndata %f6, %f8, %f40
	faligndata %f8, %f10, %f42
	faligndata %f10, %f12, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f12, %f14, %f46
	ldda [%o0] ASI_BLK_P, %f48
	stda %f32, [%o1] ASI_STORE
	faligndata %f14, %f16, %f32
	faligndata %f16, %f18, %f34
	faligndata %f18, %f20, %f36
	faligndata %f20, %f22, %f38
	faligndata %f22, %f24, %f40
	faligndata %f24, %f26, %f42
	faligndata %f26, %f28, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f28, %f30, %f46
	ldda [%o0] ASI_BLK_P, %f0
	stda %f32, [%o1] ASI_STORE
	faligndata %f30, %f48, %f32
	faligndata %f48, %f50, %f34
	faligndata %f50, %f52, %f36
	faligndata %f52, %f54, %f38
	faligndata %f54, %f56, %f40
	faligndata %f56, %f58, %f42
	faligndata %f58, %f60, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f60, %f62, %f46
	ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
	stda %f32, [%o1] ASI_STORE
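	/*
	 * Illustrative only: the shape of the software-pipelined block loop
	 * above, in scalar C for a big-endian machine.  align_merge() stands
	 * in for faligndata, BLOCK_WORDS for the 8 doublewords per 64-byte
	 * block, and "off" for the source's byte offset within a doubleword.
	 * The real code keeps three blocks of FP registers in flight (one
	 * being stored, one being merged, one being loaded) so block loads,
	 * faligndata and block stores overlap; C cannot express that, so this
	 * only shows the data flow.  Note that src[i + 1] peeks one word past
	 * the current block, which is why the prefetch-past-a-page-boundary
	 * concern above exists.  Hypothetical names throughout.
	 *
	 *	#include <stdint.h>
	 *	#define BLOCK_WORDS	8
	 *
	 *	static uint64_t
	 *	align_merge(uint64_t hi, uint64_t lo, unsigned off)
	 *	{
	 *		if (off == 0)			// already aligned
	 *			return hi;
	 *		return (hi << (8 * off)) | (lo >> (64 - 8 * off));
	 *	}
	 *
	 *	static void
	 *	block_copy(const uint64_t *src, uint64_t *dst, long nblocks,
	 *	    unsigned off)
	 *	{
	 *		while (nblocks-- > 0) {
	 *			for (int i = 0; i < BLOCK_WORDS; i++)
	 *				dst[i] = align_merge(src[i], src[i + 1], off);
	 *			src += BLOCK_WORDS;
	 *			dst += BLOCK_WORDS;
	 *		}
	 *	}
	 */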
	!! Source at BLOCK_ALIGN+8
	!! We need to load almost 1 complete block by hand.
	or %g1, %lo(1f), %g1
!	fmovd %f0, %f0 ! Hoist fmovd
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f0, %f2, %f32
	faligndata %f2, %f4, %f34
	faligndata %f4, %f6, %f36
	faligndata %f6, %f8, %f38
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	ldda [%o0] ASI_BLK_P, %f48
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f14, %f16, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f16, %f18, %f32
	faligndata %f18, %f20, %f34
	faligndata %f20, %f22, %f36
	faligndata %f22, %f24, %f38
	faligndata %f24, %f26, %f40
	faligndata %f26, %f28, %f42
	faligndata %f28, %f30, %f44
	ldda [%o0] ASI_BLK_P, %f0
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f30, %f48, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f48, %f50, %f32
	faligndata %f50, %f52, %f34
	faligndata %f52, %f54, %f36
	faligndata %f54, %f56, %f38
	faligndata %f56, %f58, %f40
	faligndata %f58, %f60, %f42
	faligndata %f60, %f62, %f44
	ldda [%o0] ASI_BLK_P, %f16
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f62, %f0, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+16
	!! We need to load 6 doubles by hand.
	or %g1, %lo(1f), %g1
	fmovd %f0, %f2 ! Hoist fmovd
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f2, %f4, %f32
	faligndata %f4, %f6, %f34
	faligndata %f6, %f8, %f36
	faligndata %f8, %f10, %f38
	faligndata %f10, %f12, %f40
	faligndata %f12, %f14, %f42
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f16, %f18, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f18, %f20, %f32
	faligndata %f20, %f22, %f34
	faligndata %f22, %f24, %f36
	faligndata %f24, %f26, %f38
	faligndata %f26, %f28, %f40
	faligndata %f28, %f30, %f42
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f48, %f50, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f50, %f52, %f32
	faligndata %f52, %f54, %f34
	faligndata %f54, %f56, %f36
	faligndata %f56, %f58, %f38
	faligndata %f58, %f60, %f40
	faligndata %f60, %f62, %f42
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f0, %f2, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+24
	!! We need to load 5 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f4, %f6, %f32
	faligndata %f6, %f8, %f34
	faligndata %f8, %f10, %f36
	faligndata %f10, %f12, %f38
	faligndata %f12, %f14, %f40
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f42
	faligndata %f16, %f18, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f18, %f20, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f20, %f22, %f32
	faligndata %f22, %f24, %f34
	faligndata %f24, %f26, %f36
	faligndata %f26, %f28, %f38
	faligndata %f28, %f30, %f40
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f42
	faligndata %f48, %f50, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f50, %f52, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f52, %f54, %f32
	faligndata %f54, %f56, %f34
	faligndata %f56, %f58, %f36
	faligndata %f58, %f60, %f38
	faligndata %f60, %f62, %f40
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f42
	faligndata %f0, %f2, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f2, %f4, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+32
	!! We need to load 4 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f6, %f8, %f32
	faligndata %f8, %f10, %f34
	faligndata %f10, %f12, %f36
	faligndata %f12, %f14, %f38
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f40
	faligndata %f16, %f18, %f42
	faligndata %f18, %f20, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f20, %f22, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f22, %f24, %f32
	faligndata %f24, %f26, %f34
	faligndata %f26, %f28, %f36
	faligndata %f28, %f30, %f38
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f40
	faligndata %f48, %f50, %f42
	faligndata %f50, %f52, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f52, %f54, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f54, %f56, %f32
	faligndata %f56, %f58, %f34
	faligndata %f58, %f60, %f36
	faligndata %f60, %f62, %f38
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f40
	faligndata %f0, %f2, %f42
	faligndata %f2, %f4, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f4, %f6, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+40
	!! We need to load 3 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f8, %f10, %f32
	faligndata %f10, %f12, %f34
	faligndata %f12, %f14, %f36
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f38
	faligndata %f16, %f18, %f40
	faligndata %f18, %f20, %f42
	faligndata %f20, %f22, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f22, %f24, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f24, %f26, %f32
	faligndata %f26, %f28, %f34
	faligndata %f28, %f30, %f36
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f38
	faligndata %f48, %f50, %f40
	faligndata %f50, %f52, %f42
	faligndata %f52, %f54, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f54, %f56, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f56, %f58, %f32
	faligndata %f58, %f60, %f34
	faligndata %f60, %f62, %f36
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f38
	faligndata %f0, %f2, %f40
	faligndata %f2, %f4, %f42
	faligndata %f4, %f6, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f6, %f8, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+48
	!! We need to load 2 doubles by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f10, %f12, %f32
	faligndata %f12, %f14, %f34
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f36
	faligndata %f16, %f18, %f38
	faligndata %f18, %f20, %f40
	faligndata %f20, %f22, %f42
	faligndata %f22, %f24, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f24, %f26, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f26, %f28, %f32
	faligndata %f28, %f30, %f34
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f36
	faligndata %f48, %f50, %f38
	faligndata %f50, %f52, %f40
	faligndata %f52, %f54, %f42
	faligndata %f54, %f56, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f56, %f58, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f58, %f60, %f32
	faligndata %f60, %f62, %f34
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f36
	faligndata %f0, %f2, %f38
	faligndata %f2, %f4, %f40
	faligndata %f4, %f6, %f42
	faligndata %f6, %f8, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f8, %f10, %f46
	stda %f32, [%o1] ASI_STORE
	!! Source at BLOCK_ALIGN+56
	!! We need to load 1 double by hand.
	or %g1, %lo(1f), %g1
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f12, %f14, %f32
	ldda [%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f34
	faligndata %f16, %f18, %f36
	faligndata %f18, %f20, %f38
	faligndata %f20, %f22, %f40
	faligndata %f22, %f24, %f42
	faligndata %f24, %f26, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f26, %f28, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f28, %f30, %f32
	ldda [%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f34
	faligndata %f48, %f50, %f36
	faligndata %f50, %f52, %f38
	faligndata %f52, %f54, %f40
	faligndata %f54, %f56, %f42
	faligndata %f56, %f58, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f58, %f60, %f46
	stda %f32, [%o1] ASI_STORE
	faligndata %f60, %f62, %f32
	ldda [%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f34
	faligndata %f0, %f2, %f36
	faligndata %f2, %f4, %f38
	faligndata %f4, %f6, %f40
	faligndata %f6, %f8, %f42
	faligndata %f8, %f10, %f44
	brlez,pn %o2, Lbcopy_blockdone
	faligndata %f10, %f12, %f46
	stda %f32, [%o1] ASI_STORE
	inc BLOCK_SIZE, %o2 ! Fixup our overcommit
	membar #Sync ! Finish any pending loads
#define	FINISH_REG(f) \
	bl,a Lbcopy_blockfinish; \
	!! The low 3 bits have the sub-word bits needed to be
	!! stored [because (x-8)&0x7 == x].
	brz,pn %o2, 2f ! 100% complete?
	cmp %o2, 8 ! Exactly 8 bytes?
	btst 4, %o2 ! Word store?
	alignaddr %o1, %o4, %g0
	faligndata %f0, %f4, %f8
	stda %f8, [%o1] ASI_FL16_P ! Store short
	btst 1, %o2 ! Byte aligned?
	mov -7, %o0 ! Calculate dest - 7
	alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
	faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
	stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
	inc 1, %o1 ! Update address
	!! verify copy success.
	set block_disable, %o0
block_disable:	.xword 0
0:	.asciz "bcopy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz "bcopy(%p, %p, %lx)\r\n"
1:	.asciz "block exit (%p, %p, %d)\n"
	 * We've saved our possible fpstate, now disable the fpu
	 * and continue with life.
	LDPTR [%l1 + %lo(FPPROC)], %l7
!	tnz 1 ! fpproc has changed!
	LDPTR [%l5 + P_FPSTATE], %l7
	tnz 1 ! fpstate has changed!
	andcc %l2, %l3, %g0 ! If (fpproc && fpstate)
	STPTR %l2, [%l1 + %lo(FPPROC)] ! Restore old fpproc
	bz,pt %xcc, 1f ! Skip if no fpstate
	STPTR %l6, [%l5 + P_FPSTATE] ! Restore old fpstate
	call _C_LABEL(loadfpstate) ! Re-load orig fpstate
1:	.asciz "block done (%p, %p, %d)\n"
	restore %g1, 0, %o0 ! Return DEST for memcpy