/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/

/*
 * Copyright (c) 1996-2002 Eduardo Horvath
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "strmacros.h"
#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
#endif	/* LIBC_SCCS and not lint */
/*
 * Assumes regions do not overlap.
 * Must not use %g7 (see copyin/copyout above).
 */
ENTRY(memcpy)	/* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
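	/*
	 * In C terms (just the argument shuffle, no new logic):
	 *	memcpy(dst, src, n) == bcopy(src, dst, n)
	 * so this entry point swaps %o0 and %o1 before the common code.
	 */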
#if !defined(_KERNEL) || defined(_RUMPKERNEL)
ENTRY(bcopy)	/* src, dest, size */
#endif
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	btst	0x80, %o4		! PDB_COPY
	save	%sp, -CC64FSZ, %sp
2:	.asciz	"memcpy(%p<-%p,%x)\n"
	bge,pt	CCCR, 2f		! if >= this many, go be fancy.
	 mov	%o1, %o5		! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
	deccc	%o2			! while (--len >= 0)
	ldsb	[%o0 - 1], %o4		! (++dst)[-1] = *src++;
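	/*
	 * The slow path as a whole, in C (a consolidation of the
	 * inline comments above):
	 *	while (--len >= 0)
	 *		*dst++ = *src++;
	 */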
	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
#ifdef USE_BLOCK_STORE_LOAD
	! If it is big enough, use VIS instructions
#endif	/* USE_BLOCK_STORE_LOAD */
	!! First align the output to an 8-byte entity
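	!! In outline (a sketch; the real code folds these tests into
	!! branch-delay slots):
	!!	if (dst & 1) { copy 1 byte;  len -= 1; }
	!!	if (dst & 2) { copy a short; len -= 2; }
	!!	if (dst & 4) { copy a word;  len -= 4; }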
	save	%sp, -CC64FSZ, %sp
	ldub	[%l0], %l4		! Load 1st byte
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	stb	%l4, [%l1]		! Store 1st byte
	inc	1, %l1			! Update address
	lduh	[%l0], %l4		! Load short
	ldub	[%l0], %l4		! Load bytes
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	sth	%l4, [%l1]		! Store 1st short
	lduw	[%l0], %l4		! Load word -1
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	st	%l4, [%l1]		! Store word
	!! We are now 32-bit aligned in the dest.
	and	%l0, 7, %l4		! Shift amount
	andn	%l0, 7, %l0		! Source addr
	brz,pt	%l4, Lmemcpy_noshift8	! No shift version...
	sllx	%l4, 3, %l4		! In bits
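	!! Equivalently (sketch):
	!!	shift = (src & 7) * 8;	/* bits of word -1 to skip */
	!!	src &= ~7UL;		/* back up to an aligned load */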
	ldx	[%l0], %o0		! Load word -1
	sub	%l3, %l4, %l3		! Reverse shift
	deccc	12*8, %l2		! Have enough room?
	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 */
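	/*
	 * Each aligned output word is assembled as (a sketch, shift != 0):
	 *	out = (w0 << shift) | (w1 >> (64 - shift));
	 * where w1 becomes the next iteration's w0, so every 8 bytes
	 * cost one load, two shifts, and an or.
	 */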
!	ldx	[%l0+0*8], %o0		! Already done
!	sllx	%o0, %l4, %o0		! Already done
	deccc	6*8, %l2		! Have enough room?
	sllx	%o0, %l4, %o0		! Next loop
Lmemcpy_unrolled8_cleanup:
	!! Finished 8-byte block, unload the regs.
	mov	%o5, %o0		! Save our unused data
	bz,pn	%icc, Lmemcpy_complete
!	ldx	[%l0], %o0		! Already done
!	sllx	%o0, %l4, %o0		! Shift high word
	deccc	8, %l2			! Pre-decrement
	bl,pn	CCCR, Lmemcpy_finish
	ldx	[%l0+8], %o1		! Load word 0
	or	%g6, %o0, %g6		! Combine
	stx	%g6, [%l1]		! Store result
	bz,pt	CCCR, Lmemcpy_complete
	!! Load up the last dregs into %o0 and shift it into place
	srlx	%l3, 3, %g6		! # bytes in %o0
	!! n-8 - (by - 8) -> n - by
	subcc	%l2, %g6, %g0		! # bytes we need
	ble,pt	%icc, Lmemcpy_finish
	ldx	[%l0+8], %o1		! Need another word
	ba,pt	%icc, Lmemcpy_finish
	or	%o0, %o1, %o0		! All loaded up.
	deccc	6*8, %l2		! Have enough room?
	bl,pn	%icc, 1f		! < 0 --> sub word
	bg,pt	%icc, 1b		! Exactly 0 --> done
	bz,pt	CCCR, Lmemcpy_complete
	brz,pn	%l2, 2f			! 100% complete?
	cmp	%l2, 8			! Exactly 8 bytes?
	btst	4, %l2			! Word store?
	srlx	%o0, 32, %g6		! Shift high word down
	mov	%o0, %g6		! Operate on the low bits
	sth	%g6, [%l1]		! Store short
	mov	%o0, %g6		! Operate on low bytes
	btst	1, %l2			! Byte aligned?
	stb	%g6, [%l1]		! Store last byte
	inc	1, %l1			! Update address
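	!! The tail, in outline (a sketch of the stores above):
	!!	if (len & 4) { store word;  dst += 4; }
	!!	if (len & 2) { store short; dst += 2; }
	!!	if (len & 1) store byte;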
	!! verify copy success.
0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"memcpy(%p, %p, %lx)\n"
	restore	%i1, %g0, %o0
#ifdef USE_BLOCK_STORE_LOAD
	/*
	 * Block copy.  Useful for >256 byte copies.
	 *
	 * Benchmarking has shown this always seems to be slower than
	 * the integer version, so this is disabled.  Maybe someone will
	 * figure out why sometime.
	 */
	sethi	%hi(block_disable), %o3
	ldx	[ %o3 + %lo(block_disable) ], %o3
	brnz,pn	%o3, Lmemcpy_fancy
	!! Make sure our trap table is installed
	set	_C_LABEL(trapbase), %o5
	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
/*
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fplwp, and
 * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
 * null, call savefpstate() with it to store our current fp state.
 *
 * Next, allocate an aligned fpstate on the stack.  We will properly
 * nest calls on a particular stack so this should not be a problem.
 *
 * Now we grab either curlwp (or if we're on the interrupt stack
 * lwp0).  We stash its existing fpstate in a local register and
 * put our new fpstate in curlwp->l_md.md_fpstate.  We point
 * fplwp at curlwp (or lwp0) and enable the FPU.
 *
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curlwp (or lwp0).
 *
 * On exiting this routine we undo the damage: restore the original
 * pointer to curlwp->l_md.md_fpstate, clear our fplwp, and disable
 * the FPU.
 */
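/*
 * As pseudocode (a sketch of the protocol just described; the helper
 * names follow the comment above, not necessarily the exact kernel API,
 * and alloca_aligned() is a hypothetical stand-in for the stack
 * allocation):
 *
 *	if (fplwp != NULL && fplwp->l_md.md_fpstate != NULL)
 *		savefpstate(fplwp->l_md.md_fpstate);	// evict last owner
 *	new = alloca_aligned(sizeof(struct fpstate64));
 *	old = curlwp->l_md.md_fpstate;
 *	curlwp->l_md.md_fpstate = new;
 *	fplwp = curlwp;					// we own the FPU now
 *	enable_fpu();
 *	... block copy through the %f registers ...
 *	curlwp->l_md.md_fpstate = old;			// undo the damage
 *	fplwp = NULL;
 *	disable_fpu();
 */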
/*
 * Register usage, Kernel only (after save):
 *
 * %l0		XXXX DEBUG old fpstate
 * %l1		fplwp (hi bits only)
 *
 * Register usage, Kernel and user:
 *
 * %g1		dest (retval for memcpy)
 * %o5		last safe fetchable address
 */
	mov	%i0, %o0		! Src addr.
	mov	%i1, %o1		! Store our dest ptr here.
	mov	%i2, %o2		! Len counter
	!! First align the output to a 64-bit entity
	mov	%o1, %g1		! memcpy retval
	add	%o0, %o2, %o5		! End of source block
	andn	%o0, 7, %o3		! Start of block
	andn	%o5, BLOCK_ALIGN, %o5	! Last safe addr.
	ldd	[%o3], %f2		! Load 1st word
	dec	8, %o3			! Move %o3 1 word back
	mov	-7, %o4			! Lowest src addr possible
	alignaddr %o0, %o4, %o4		! Base addr for load.
	be,pt	CCCR, 1f		! Already loaded?
	 fmovd	%f2, %f0		! No. Shift
	ldd	[%o3+8], %f2		! And load
	faligndata %f0, %f2, %f4	! Isolate 1st byte
	stda	%f4, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1			! Update address
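	!! (VIS background: alignaddr computes addr + off, records its low
	!! 3 bits in the GSR align field and returns the 8-byte-aligned
	!! address; faligndata concatenates its two source doubles and
	!! extracts 8 bytes starting at that offset.  The ASI_FL8_P /
	!! ASI_FL16_P partial-store ASIs store only the low byte / short
	!! of an FP register.)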
	mov	-6, %o4			! Calculate src - 6
	alignaddr %o0, %o4, %o4		! Calculate shift mask and dest.
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	faligndata %f0, %f2, %f4	! Move 1st short to low part of f8
	stda	%f4, [%o1] ASI_FL16_P	! Store 1st short
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	alignaddr %o0, %o4, %o4		! Calculate shift mask and dest.
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	faligndata %f0, %f2, %f4	! Move 1st word to low part of f8
	st	%f5, [%o1]		! Store word
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	!! We are now 32-bit aligned in the dest.
Lmemcpy_block_common:
	alignaddr %o0, %o4, %o4		! base - shift
	cmp	%o3, %o4		! Addresses same?
	fmovd	%f2, %f0		! Shuffle data
	ldd	[%o3+8], %f2		! Load word 0
	add	%o3, 8, %o0		! now use %o0 for src
	!! Continue until our dest is block aligned
Lmemcpy_block_aligned8:
	brz	%o2, Lmemcpy_blockfinish
	btst	BLOCK_ALIGN, %o1	! Block aligned?
	faligndata %f0, %f2, %f4	! Generate result
	ble,pn	%icc, Lmemcpy_blockfinish	! Should never happen
	 std	%f4, [%o1]		! Store result
	ba,pt	%xcc, 1b		! Not yet.
	 ldd	[%o0], %f2		! Load next part
Lmemcpy_block_aligned64:
/*
 * 64-byte aligned -- ready for block operations.
 *
 * Here we have the destination block aligned, but the
 * source pointer may not be.  Sub-word alignment will
 * be handled by faligndata instructions.  But the source
 * can still be potentially aligned to 8 different words
 * in our 64-byte block, so we have 8 different copy routines.
 *
 * Once we figure out our source alignment, we branch
 * to the appropriate copy routine, which sets up the
 * alignment for faligndata and loads (sets) the values
 * into the source registers and does the copy loop.
 *
 * When we're down to less than 1 block to store, we
 * exit the copy loop and execute cleanup code.
 *
 * Block loads and stores are not properly interlocked.
 * Stores save one reg/cycle, so you can start overwriting
 * registers the cycle after the store is issued.
 *
 * Block loads require a block load to a different register
 * block or a membar #Sync before accessing the loaded
 * data.
 *
 * Since the faligndata instructions may be offset as far
 * as 7 registers into a block (if you are shifting source
 * 7 -> dest 0), you need 3 source register blocks for full
 * performance: one you are copying, one you are loading,
 * and one for interlocking.  Otherwise, we would need to
 * sprinkle the code with membar #Sync and lose the advantage
 * of running faligndata in parallel with block stores.  This
 * means we are fetching a full 128 bytes ahead of the stores.
 * We need to make sure the prefetch does not inadvertently
 * cross a page boundary and fault on data that we will never
 * need.
 */
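/*
 * The resulting software pipeline, in outline (a sketch; one block is
 * 64 bytes, i.e. 8 FP doubles):
 *
 *	for (;;) {
 *		block-load  block N+2 into the idle register bank;
 *		faligndata  block N+1 from two banks into %f32-%f46;
 *		if (less than a full block left) break;
 *		block-store %f32-%f46 to the destination;
 *	}
 *
 * rotating through the three source banks (%f0-%f14, %f16-%f30,
 * %f48-%f62), so the loads stay two blocks ahead of the stores and no
 * membar #Sync is needed inside the loop.
 */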
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3		! Isolate the offset
	ba,pt	%xcc, L101		! 0->1
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L103		! 0->3
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L105		! 0->5
	 nop	/* XXX spitfire bug */
	ba,pt	%xcc, L107		! 0->7
	 nop	/* XXX spitfire bug */
	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
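	!! In effect (sketch of the dispatch):
	!!	slot = (src & BLOCK_ALIGN) >> 3;	/* 0..7 */
	!!	goto Lmemcpy_block_jmp[slot];		/* L100..L107 */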
	!!
	!! Source is block aligned.
	!!
	!! Just load a block and go.
	ldda	[%o0] ASI_BLK_P, %f0
	ldda	[%o0] ASI_BLK_P, %f16
	.align	32			! ICache align.
	faligndata %f62, %f0, %f32
	faligndata %f0, %f2, %f34
	faligndata %f2, %f4, %f36
	faligndata %f4, %f6, %f38
	faligndata %f6, %f8, %f40
	faligndata %f8, %f10, %f42
	faligndata %f10, %f12, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f12, %f14, %f46
	ldda	[%o0] ASI_BLK_P, %f48
	stda	%f32, [%o1] ASI_STORE
	faligndata %f14, %f16, %f32
	faligndata %f16, %f18, %f34
	faligndata %f18, %f20, %f36
	faligndata %f20, %f22, %f38
	faligndata %f22, %f24, %f40
	faligndata %f24, %f26, %f42
	faligndata %f26, %f28, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f28, %f30, %f46
	ldda	[%o0] ASI_BLK_P, %f0
	stda	%f32, [%o1] ASI_STORE
	faligndata %f30, %f48, %f32
	faligndata %f48, %f50, %f34
	faligndata %f50, %f52, %f36
	faligndata %f52, %f54, %f38
	faligndata %f54, %f56, %f40
	faligndata %f56, %f58, %f42
	faligndata %f58, %f60, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f60, %f62, %f46
	ldda	[%o0] ASI_BLK_P, %f16	! Increment is at top
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+8
	!!
	!! We need to load almost 1 complete block by hand.
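	!! (All eight entry points follow one pattern, as a sketch: with
	!! the source k doubles into a block, hand-load the remaining
	!! 8 - k doubles, then run the same three-bank pipeline with the
	!! faligndata chain offset by k registers.)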
!	fmovd	%f0, %f0		! Hoist fmovd
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f0, %f2, %f32
	faligndata %f2, %f4, %f34
	faligndata %f4, %f6, %f36
	faligndata %f6, %f8, %f38
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	ldda	[%o0] ASI_BLK_P, %f48
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f14, %f16, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f16, %f18, %f32
	faligndata %f18, %f20, %f34
	faligndata %f20, %f22, %f36
	faligndata %f22, %f24, %f38
	faligndata %f24, %f26, %f40
	faligndata %f26, %f28, %f42
	faligndata %f28, %f30, %f44
	ldda	[%o0] ASI_BLK_P, %f0
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f30, %f48, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f48, %f50, %f32
	faligndata %f50, %f52, %f34
	faligndata %f52, %f54, %f36
	faligndata %f54, %f56, %f38
	faligndata %f56, %f58, %f40
	faligndata %f58, %f60, %f42
	faligndata %f60, %f62, %f44
	ldda	[%o0] ASI_BLK_P, %f16
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f62, %f0, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+16
	!!
	!! We need to load 6 doubles by hand.
	fmovd	%f0, %f2		! Hoist fmovd
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f2, %f4, %f32
	faligndata %f4, %f6, %f34
	faligndata %f6, %f8, %f36
	faligndata %f8, %f10, %f38
	faligndata %f10, %f12, %f40
	faligndata %f12, %f14, %f42
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f16, %f18, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f18, %f20, %f32
	faligndata %f20, %f22, %f34
	faligndata %f22, %f24, %f36
	faligndata %f24, %f26, %f38
	faligndata %f26, %f28, %f40
	faligndata %f28, %f30, %f42
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f48, %f50, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f50, %f52, %f32
	faligndata %f52, %f54, %f34
	faligndata %f54, %f56, %f36
	faligndata %f56, %f58, %f38
	faligndata %f58, %f60, %f40
	faligndata %f60, %f62, %f42
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f0, %f2, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+24
	!!
	!! We need to load 5 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f4, %f6, %f32
	faligndata %f6, %f8, %f34
	faligndata %f8, %f10, %f36
	faligndata %f10, %f12, %f38
	faligndata %f12, %f14, %f40
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f42
	faligndata %f16, %f18, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f18, %f20, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f20, %f22, %f32
	faligndata %f22, %f24, %f34
	faligndata %f24, %f26, %f36
	faligndata %f26, %f28, %f38
	faligndata %f28, %f30, %f40
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f42
	faligndata %f48, %f50, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f50, %f52, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f52, %f54, %f32
	faligndata %f54, %f56, %f34
	faligndata %f56, %f58, %f36
	faligndata %f58, %f60, %f38
	faligndata %f60, %f62, %f40
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f42
	faligndata %f0, %f2, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f2, %f4, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+32
	!!
	!! We need to load 4 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f6, %f8, %f32
	faligndata %f8, %f10, %f34
	faligndata %f10, %f12, %f36
	faligndata %f12, %f14, %f38
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f40
	faligndata %f16, %f18, %f42
	faligndata %f18, %f20, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f20, %f22, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f22, %f24, %f32
	faligndata %f24, %f26, %f34
	faligndata %f26, %f28, %f36
	faligndata %f28, %f30, %f38
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f40
	faligndata %f48, %f50, %f42
	faligndata %f50, %f52, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f52, %f54, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f54, %f56, %f32
	faligndata %f56, %f58, %f34
	faligndata %f58, %f60, %f36
	faligndata %f60, %f62, %f38
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f40
	faligndata %f0, %f2, %f42
	faligndata %f2, %f4, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f4, %f6, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+40
	!!
	!! We need to load 3 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f8, %f10, %f32
	faligndata %f10, %f12, %f34
	faligndata %f12, %f14, %f36
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f38
	faligndata %f16, %f18, %f40
	faligndata %f18, %f20, %f42
	faligndata %f20, %f22, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f22, %f24, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f24, %f26, %f32
	faligndata %f26, %f28, %f34
	faligndata %f28, %f30, %f36
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f38
	faligndata %f48, %f50, %f40
	faligndata %f50, %f52, %f42
	faligndata %f52, %f54, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f54, %f56, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f56, %f58, %f32
	faligndata %f58, %f60, %f34
	faligndata %f60, %f62, %f36
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f38
	faligndata %f0, %f2, %f40
	faligndata %f2, %f4, %f42
	faligndata %f4, %f6, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f6, %f8, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+48
	!!
	!! We need to load 2 doubles by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f10, %f12, %f32
	faligndata %f12, %f14, %f34
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f36
	faligndata %f16, %f18, %f38
	faligndata %f18, %f20, %f40
	faligndata %f20, %f22, %f42
	faligndata %f22, %f24, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f24, %f26, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f26, %f28, %f32
	faligndata %f28, %f30, %f34
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f36
	faligndata %f48, %f50, %f38
	faligndata %f50, %f52, %f40
	faligndata %f52, %f54, %f42
	faligndata %f54, %f56, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f56, %f58, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f58, %f60, %f32
	faligndata %f60, %f62, %f34
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f36
	faligndata %f0, %f2, %f38
	faligndata %f2, %f4, %f40
	faligndata %f4, %f6, %f42
	faligndata %f6, %f8, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f8, %f10, %f46
	stda	%f32, [%o1] ASI_STORE
	!!
	!! Source at BLOCK_ALIGN+56
	!!
	!! We need to load 1 double by hand.
	or	%g1, %lo(1f), %g1
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f12, %f14, %f32
	ldda	[%o0] ASI_BLK_P, %f48
	faligndata %f14, %f16, %f34
	faligndata %f16, %f18, %f36
	faligndata %f18, %f20, %f38
	faligndata %f20, %f22, %f40
	faligndata %f22, %f24, %f42
	faligndata %f24, %f26, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f26, %f28, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f28, %f30, %f32
	ldda	[%o0] ASI_BLK_P, %f0
	faligndata %f30, %f48, %f34
	faligndata %f48, %f50, %f36
	faligndata %f50, %f52, %f38
	faligndata %f52, %f54, %f40
	faligndata %f54, %f56, %f42
	faligndata %f56, %f58, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f58, %f60, %f46
	stda	%f32, [%o1] ASI_STORE
	faligndata %f60, %f62, %f32
	ldda	[%o0] ASI_BLK_P, %f16
	faligndata %f62, %f0, %f34
	faligndata %f0, %f2, %f36
	faligndata %f2, %f4, %f38
	faligndata %f4, %f6, %f40
	faligndata %f6, %f8, %f42
	faligndata %f8, %f10, %f44
	brlez,pn %o2, Lmemcpy_blockdone
	 faligndata %f10, %f12, %f46
	stda	%f32, [%o1] ASI_STORE
Lmemcpy_blockdone:
	inc	BLOCK_SIZE, %o2		! Fixup our overcommit
	membar	#Sync			! Finish any pending loads
#define	FINISH_REG(f)						\
	bl,a	Lmemcpy_blockfinish;				\
	!!
	!! The low 3 bits give the sub-word bytes still to be
	!! stored [because (x-8)&0x7 == x].
	!!
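	!! In other words (sketch): the loop decremented len past zero,
	!! so add BLOCK_SIZE back, then walk %f32..%f46 with FINISH_REG
	!! to find the double holding the last bytes still to be stored.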
Lmemcpy_blockfinish:
	brz,pn	%o2, 2f			! 100% complete?
	cmp	%o2, 8			! Exactly 8 bytes?
	btst	4, %o2			! Word store?
	alignaddr %o1, %o4, %g0
	faligndata %f0, %f4, %f8
	stda	%f8, [%o1] ASI_FL16_P	! Store short
	btst	1, %o2			! Byte aligned?
	mov	-7, %o0			! Calculate dest - 7
	alignaddr %o1, %o0, %g0		! Calculate shift mask and dest.
	faligndata %f0, %f4, %f8	! Move 1st byte to low part of f8
	stda	%f8, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1			! Update address
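	!! Tail layout, in outline (mirrors the integer Lmemcpy_finish,
	!! but through the FP registers):
	!!	if (len & 4) store a word;
	!!	if (len & 2) store a short via ASI_FL16_P;
	!!	if (len & 1) store a byte  via ASI_FL8_P;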
	!! verify copy success.
	set	block_disable, %o0
0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	/*
	 * We've saved our possible fpstate, now disable the fpu
	 * and continue with life.
	 */
	restore	%g1, 0, %o0		! Return DEST for memcpy
/*
 * Use block_disable to turn off block insns for
 * memcpy/memset.
 */
	.globl	block_disable
block_disable:	.xword	1
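	!! Nonzero here makes the brnz check at the head of the block-copy
	!! path branch to Lmemcpy_fancy, so the VIS code stays disabled by
	!! default (see the benchmarking note above).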
#endif	/* USE_BLOCK_STORE_LOAD */