4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
31 * Copy s2 to s1, always copy n bytes.
32 * Note: this C code does not work for overlapped copies.
33 * Memmove() and bcopy() do.
35 * Added entry __align_cpy_1 is generally for use of the compilers.
37 * Fast assembler language version of the following C-program for memcpy
38 * which represents the `standard' for the C-library.
41 * memcpy(void *s, const void *s0, size_t n)
45 * const char *s2 = s0;
58 * Return destination address
61 * Is source aligned on word boundary
62 * If no then align source on word boundary then goto .ald
65 * Is destination aligned on word boundary
66 * Depending on destination offset (last 2 bits of destination)
67 * copy data by shifting and merging.
68 * Copy residue bytes as byte copy
69 * Return destination address
71 * Align destination on block boundary
72 * Depending on the source offset (last 4 bits of source address) align
73 * the data and store to destination. Both the load and store are done
74 * using ASI_BLK_INIT_ST_QUAD_LDD_P.
75 * For remaining count copy as much data in 8-byte chunk from source to
77 * Followed by trailing copy using byte copy.
78 * Return saved destination address
87 * copy bytes; exit with dst addr
88 * if src & dst aligned on word boundary but not long word boundary,
89 * copy with ldw/stw; branch to finish_up
90 * if src & dst aligned on long word boundary
91 * copy with ldx/stx; branch to finish_up
92 * if src & dst not aligned and length <= 14
93 * copy bytes; exit with dst addr
94 * move enough bytes to get src to word boundary
95 * if dst now on word boundary
97 * copy words; branch to finish_up
98 * if dst now on half word boundary
99 * load words, shift half words, store words; branch to finish_up
101 * load words, shift 3 bytes, store words; branch to finish_up
103 * load words, shift 1 byte, store words; branch to finish_up
105 * copy bytes; exit with dst addr
106 * } else { More than 128 bytes
107 * move bytes until dst is on long word boundary
108 * if( src is on long word boundary ) {
110 * finish_long: src/dst aligned on 8 bytes
111 * copy with ldx/stx in 8-way unrolled loop;
112 * copy final 0-63 bytes; exit with dst addr
113 * } else { src/dst aligned; count > 512
114 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
115 * src alignments relative to a 64 byte boundary to select the
116 * 16-way unrolled loop to use for
117 * block load, fmovd, block-init-store, block-store, fmovd operations
118 * then go to finish_long.
120 * } else { src/dst not aligned on 8 bytes
121 * if src is word aligned and count < 512
122 * move words in 8-way unrolled loop
123 * move final 0-31 bytes; exit with dst addr
125 * use alignaddr/faligndata combined with ldd/std in 8-way
126 * unrolled loop to move data.
129 * setup alignaddr for faligndata instructions
130 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
131 * src alignments to nearest long word relative to 64 byte boundary to
132 * select the 8-way unrolled loop to use for
133 * block load, falign, fmovd, block-init-store, block-store loop
134 * (only use block-init-store when src/dst on 8 byte boundaries.)
136 * move remaining bytes for unaligned cases. exit with dst addr.
139 * Comment on N2 memmove and memcpy common code and block-store-init:
140 * In the man page for memmove, it specifies that copying will take place
141 * correctly between objects that overlap. For memcpy, behavior is
142 * undefined for objects that overlap.
144 * In rare cases, some multi-threaded applications may attempt to examine
145 * the copy destination buffer during the copy. Using the block-store-init
146 * instruction allows those applications to observe zeros in some
147 * cache lines of the destination buffer for narrow windows. But the
148 * block-store-init provides memory throughput advantages for many
149 * common applications. To meet both needs, those applications which need
150 * the destination buffer to retain meaning during the copy should use
151 * memmove instead of memcpy. The memmove version duplicates the memcpy
152 * algorithms except the memmove version does not use block-store-init
153 * in those cases where memcpy does use block-store-init. Otherwise, when
154 * memmove can determine the source and destination do not overlap,
155 * memmove shares the memcpy code.
158 #include <sys/asm_linkage.h>
159 #include <sys/niagaraasi.h>
161 #include <sys/trap.h>
163 /* documented name for primary block initializing store */
164 #define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
166 #define BLOCK_SIZE 64 /* unit for block load/store and prefetch stepping */
170 #define SHORTCHECK 14 /* max length for unaligned byte-by-byte copy */
171 #define SHORT_LONG 64 /* max copy for short longword-aligned case */
172 /* must be at least 32 */
173 #define SMALL_MAX 128 /* lengths below this take the short-copy paths */
174 #define MED_UMAX 512 /* max copy for medium un-aligned case */
175 #define MED_WMAX 512 /* max copy for medium word-aligned case */
176 #define MED_MAX 512 /* max copy for medium longword-aligned case */
179 #include <sys/sun4asi.h>
181 #else /* NIAGARA2_IMPL */
183 * This define is to align data for the unaligned source cases.
184 * The data1, data2 and data3 is merged into data1 and data2.
185 * The data3 is preserved for next merge.
187 #define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
188 sllx data1
, lshift
, data1 ;\
189 srlx data2
, rshift
, tmp ;\
190 or data1
, tmp
, data1 ;\
191 sllx data2
, lshift
, data2 ;\
192 srlx data3
, rshift
, tmp ;\
195 * Align the data. Merge the data1 and data2 into data1.
197 #define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
198 sllx data1
, lshift
, data1 ;\
199 srlx data2
, rshift
, tmp ;\
201 #endif /* NIAGARA2_IMPL */
204 ANSI_PRAGMA_WEAK
(memmove
,function
)
205 ANSI_PRAGMA_WEAK
(memcpy
,function
)
208 cmp %o1
, %o0
! if from address is
>= to use forward copy
209 bgeu
,pn
%ncc
, .forcpy ! else use backward if ...
210 sub %o0
, %o1
, %o4
! get difference of two addresses
211 cmp %o2
, %o4
! compare size
and difference of addresses
212 bleu
,pn
%ncc
, .forcpy ! if size is bigger, do overlapped copy
213 add %o1
, %o2
, %o5
! get to end of source space
216 ! an overlapped copy that must
be done
"backwards"
219 cmp %o2
, 8 ! less than
8 byte do byte copy
220 blu
,pt
%ncc
, 2f
! else continue
222 ! Now size is bigger than
8
224 add %o0
, %o2
, %g1
! get to end of dest space
225 andcc
%g1
, 7, %o3
! %o3 has bytes till
dst 8 bytes aligned
226 bz
,a,pn
%ncc
, .dbbck ! if dst is not 8 byte aligned: align it
227 andn
%o2
, 7, %o3
! %o3 count is multiple of
8 bytes size
228 sub %o2
, %o3
, %o2
! update o2 with new count
230 1: dec %o5
! decrement source
231 ldub
[%o5
], %g1
! load one byte
232 deccc
%o3
! decrement count
233 bgu
,pt
%ncc
, 1b ! if
not done keep copying
234 stb %g1
, [%o5+
%o4
] ! store one byte into dest
235 andncc
%o2
, 7, %o3
! %o3 count is multiple of
8 bytes size
236 bz
,pn
%ncc
, 2f
! if size
< 8, move to byte copy
238 ! Now Destination is
8 byte aligned
240 andcc
%o5
, 7, %o0
! %o0 has src offset
241 bz
,a,pn
%ncc
, .dbcopybc ! if src is aligned to fast mem move
242 sub %o2
, %o3
, %o2
! Residue bytes in
%o2
244 .cpy_dbwdbc: ! alignment of src is needed
245 sub %o2
, 8, %o2
! set size one loop ahead
246 sll
%o0
, 3, %g1
! %g1 is left shift
247 mov
64, %g5
! init
%g5 to
be 64
248 sub %g5
, %g1
, %g5
! %g5 right shift
= (64 - left shift
)
249 sub %o5
, %o0
, %o5
! align the src at
8 bytes.
250 add %o4
, %o0
, %o4
! increase difference between src
& dst
251 ldx [%o5
], %o1
! load first
8 bytes
253 1: sub %o5
, 8, %o5
! subtract
8 from src
254 ldx [%o5
], %o0
! load
8 byte
255 sllx
%o0
, %g1
, %o3
! shift loaded
8 bytes left into tmp reg
256 or %o1
, %o3
, %o3
! align data
257 stx %o3
, [%o5+
%o4
] ! store
8 byte
258 subcc
%o2
, 8, %o2
! subtract
8 byte from size
259 bg
,pt
%ncc
, 1b ! if size
> 0 continue
260 srlx
%o0
, %g5
, %o1
! move extra byte for the next use
262 srl
%g1
, 3, %o0
! restore
%o0 value for alignment
263 add %o5
, %o0
, %o5
! restore src alignment
264 sub %o4
, %o0
, %o4
! restore difference between src
& dest
266 ba 2f
! branch to the trailing byte copy
267 add %o2
, 8, %o2
! restore size value
269 .dbcopybc: ! alignment of src is not needed
270 1: sub %o5
, 8, %o5
! subtract from src
271 ldx [%o5
], %g1
! load
8 bytes
272 subcc
%o3
, 8, %o3
! subtract from size
273 bgu
,pt
%ncc
, 1b ! if size is bigger
0 continue
274 stx %g1
, [%o5+
%o4
] ! store
8 bytes to destination
280 1: ldub
[%o5
], %g1
! load one byte
281 stb %g1
, [%o5+
%o4
] ! store one byte
282 2: deccc
%o2
! decrement size
283 bgeu
,a,pt
%ncc
, 1b ! if size is
>= 0 continue
284 dec %o5
! decrement from address
286 .exitbc: ! exit from backward copy
288 add %o5
, %o4
, %o0
! restore dest addr
292 ! Check to see if memmove is large aligned copy
293 ! If so
, use special version of copy that avoids
294 ! use of block store init
297 cmp %o2
, SMALL_MAX
! check for
not small case
298 blt,pn
%ncc
, .mv_short ! merge with memcpy
299 mov
%o0
, %g1
! save
%o0
301 andcc
%o5
, 7, %o5
! bytes till
DST 8 byte aligned
302 brz
,pt
%o5
, .mv_dst_aligned_on_8
304 ! %o5 has the bytes to
be written in partial store.
306 sub %o1
, %o0
, %o1
! %o1 gets the difference
307 7: ! dst aligning loop
308 ldub
[%o1+
%o0
], %o4
! load one byte
312 add %o0
, 1, %o0
! advance
dst
313 add %o1
, %o0
, %o1
! restore
%o1
314 .mv_dst_aligned_on_8:
316 brnz
,pt
%o5
, .src_dst_unaligned_on_8
317 prefetch
[%o1
+ (1 * BLOCK_SIZE
)], #one_read
319 .mv_src_dst_aligned_on_8:
320 ! check if we are copying MED_MAX
or more bytes
321 cmp %o2
, MED_MAX
! limit to store buffer size
322 bleu
,pt
%ncc
, .medlong
323 prefetch
[%o1
+ (2 * BLOCK_SIZE
)], #one_read
326 * The following memmove code mimics the memcpy code for large aligned copies,
327 * but does not use the ASI_STBI_P (block initializing store) performance
328 * optimization. See memmove rationale section in documentation
330 .mv_large_align8_copy: ! Src and dst share 8 byte alignment
331 rd
%fprs
, %g5
! check for unused fp
332 ! if fprs.fef
== 0, set it.
333 ! Setting it when already set costs more than checking
334 andcc
%g5
, FPRS_FEF
, %g5
! test FEF
, fprs.du
= fprs.dl
= 0
336 wr
%g0
, FPRS_FEF
, %fprs
! fprs.fef
= 1
338 ! align
dst to
64 byte boundary
339 andcc
%o0
, 0x3f, %o3
! %o3
== 0 means
dst is
64 byte aligned
340 brz
,pn
%o3
, .mv_aligned_on_64
341 sub %o3
, 64, %o3
! %o3 has negative bytes to move
342 add %o2
, %o3
, %o2
! adjust remaining count
345 add %o1
, 8, %o1
! increment src ptr
348 brnz
,pt
%o3
, .mv_align_to_64
349 add %o0
, 8, %o0
! increment
dst ptr
352 prefetch
[%o1
+ (3 * BLOCK_SIZE
)], #one_read
353 mov
%asi
,%o4
! save
%asi
354 ! Determine source alignment
355 ! to correct
8 byte offset
357 brnz
,pn
%o3
, .mv_align_1
358 mov ASI_BLK_P
, %asi
! setup
%asi for block load
/store
360 brnz
,pn
%o3
, .mv_align_01
363 brz
,pn
%o3
, .mv_align_000
364 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
369 brnz
,pn
%o3
, .mv_align_011
370 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
375 brnz
,pn
%o3
, .mv_align_11
378 brnz
,pn
%o3
, .mv_align_101
379 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
384 brz
,pn
%o3
, .mv_align_110
385 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
388 ! Alignment off by
8 bytes
392 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
393 and %o2
, 0x7f, %o2
! residue bytes in
%o2
396 /* ---- copy line 1 of 2. ---- */
397 ldda
[%o1
]%asi
,%d16
! block load
406 add %o0
, 64, %o0
! advance
dst
407 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
410 /* ---- copy line 2 of 2. ---- */
411 ldda
[%o1+
64]%asi
,%d16
419 add %o1
, 128, %o1
! increment src
421 add %o0
, 64, %o0
! advance
dst
423 bgt,pt
%ncc
, .mv_align_111_loop
424 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
429 ! END OF mv_align_111
432 ! Alignment off by
16 bytes
437 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
438 and %o2
, 0x7f, %o2
! residue bytes in
%o2
441 /* ---- copy line 1 of 2. ---- */
443 ldda
[%o1
]%asi
,%d16
! block load
451 add %o0
, 64, %o0
! advance
dst
453 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
456 /* ---- copy line 2 of 2. ---- */
457 ldda
[%o1+
64]%asi
,%d16
464 add %o1
, 128, %o1
! increment src
466 add %o0
, 64, %o0
! advance
dst
469 bgt,pt
%ncc
, .mv_align_110_loop
470 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
476 ! END OF mv_align_110
479 ! Alignment off by
24 bytes
485 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
486 and %o2
, 0x7f, %o2
! residue bytes in
%o2
489 /* ---- copy line 1 of 2. ---- */
491 ldda
[%o1
]%asi
,%d16
! block load
498 add %o0
, 64, %o0
! advance
dst
501 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
504 /* ---- copy line 2 of 2. ---- */
505 ldda
[%o1+
64]%asi
,%d16
511 add %o1
, 128, %o1
! increment src
513 add %o0
, 64, %o0
! advance
dst
517 bgt,pt
%ncc
, .mv_align_101_loop
518 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
525 ! END OF mv_align_101
528 ! Alignment off by
32 bytes
535 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
536 and %o2
, 0x7f, %o2
! residue bytes in
%o2
539 /* ---- copy line 1 of 2. ---- */
540 ldda
[%o1
]%asi
,%d16
! block load
546 add %o0
, 64, %o0
! advance
dst
550 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
553 /* ---- copy line 2 of 2. ---- */
554 ldda
[%o1+
64]%asi
,%d16
559 add %o1
, 128, %o1
! increment src
561 add %o0
, 64, %o0
! advance
dst
566 bgt,pt
%ncc
, .mv_align_100_loop
567 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
575 ! END OF mv_align_100
578 ! Alignment off by
40 bytes
586 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
587 and %o2
, 0x7f, %o2
! residue bytes in
%o2
590 /* ---- copy line 1 of 2. ---- */
592 ldda
[%o1
]%asi
,%d16
! block load
597 add %o0
, 64, %o0
! advance
dst
602 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
605 /* ---- copy line 2 of 2. ---- */
606 ldda
[%o1+
64]%asi
,%d16
610 add %o1
, 128, %o1
! increment src
612 add %o0
, 64, %o0
! advance
dst
618 bgt,pt
%ncc
, .mv_align_011_loop
619 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
628 ! END OF mv_align_011
631 ! Alignment off by
48 bytes
640 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
641 and %o2
, 0x7f, %o2
! residue bytes in
%o2
644 /* ---- copy line 1 of 2. ---- */
646 ldda
[%o1
]%asi
,%d16
! block load
650 add %o0
, 64, %o0
! advance
dst
656 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
659 /* ---- copy line 2 of 2. ---- */
660 ldda
[%o1+
64]%asi
,%d16
663 add %o1
, 128, %o1
! increment src
665 add %o0
, 64, %o0
! advance
dst
672 bgt,pt
%ncc
, .mv_align_010_loop
673 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
683 ! END OF mv_align_010
686 ! Alignment off by
56 bytes
696 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
697 and %o2
, 0x7f, %o2
! residue bytes in
%o2
700 /* ---- copy line 1 of 2. ---- */
702 ldda
[%o1
]%asi
,%d16
! block load
705 add %o0
, 64, %o0
! advance
dst
712 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
715 /* ---- copy line 2 of 2. ---- */
716 ldda
[%o1+
64]%asi
,%d16
718 add %o1
, 128, %o1
! increment src
720 add %o0
, 64, %o0
! advance
dst
728 bgt,pt
%ncc
, .mv_align_001_loop
729 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
740 ! END OF mv_align_001
743 andn
%o2
, 0x7f, %o5
! %o5 is multiple of
2*block size
744 and %o2
, 0x7f, %o2
! residue bytes in
%o2
746 /* ---- copy line 1 of 2. ---- */
750 prefetch
[%o1
+ (5 * BLOCK_SIZE
)], #one_read
752 /* ---- copy line 2 of 2. ---- */
754 ldda
[%o1+
64]%asi
,%d0
755 add %o1
, 128, %o1
! increment src
757 add %o0
, 64, %o0
! increment
dst
758 bgt,pt
%ncc
, .mv_align_000_loop
759 prefetch
[%o1
+ (4 * BLOCK_SIZE
)], #one_read
763 ! END OF mv_align_000
764 #else /* NIAGARA2_IMPL */
765 #endif /* NIAGARA2_IMPL */
772 cmp %o2
, SMALL_MAX
! check for
not small case
773 bgeu
,pn
%ncc
, .medium ! go to larger cases
774 mov
%o0
, %g1
! save
%o0
776 cmp %o2
, SHORTCOPY
! check for really short case
777 ble,pt
%ncc
, .smallfin
778 or %o0
, %o1
, %o4
! prepare alignment check
779 andcc
%o4
, 0x3, %o5
! test for alignment
780 bz
,pt
%ncc
, .smallword ! branch to word aligned case
782 ble,pt
%ncc
, .smallrest
783 andcc
%o1
, 0x3, %o5
! is src word aligned
785 cmp %o5
, 2 ! is src half-word aligned
787 cmp %o5
, 3 ! src is byte aligned
788 .s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it
790 stb %o3
, [%o0
] ! move
a byte to align src
794 b .ald ! now go align dest
797 .s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned
800 stb %o4
, [%o0
] ! have to do bytes
,
801 stb %o3
, [%o0
+ 1] ! don
't know dst alignment
805 .aldst: andcc %o0, 0x3, %o5 ! align the destination address
806 .ald: bz,pn %ncc, .w4cp
810 .w3cp: lduw [%o1], %o4
817 andn %o2, 3, %o3 ! %o3 is aligned word count
818 dec 4, %o3 ! avoid reading beyond tail of src
819 sub %o1, %o0, %o1 ! %o1 gets the difference
821 1: sll %o4, 8, %g5 ! save residual bytes
824 srl %o4, 24, %o5 ! merge with residual
829 sub %o1, 3, %o1 ! used one byte of last word read
834 .w1cp: srl %o4, 8, %o5
838 andn %o2, 3, %o3 ! %o3 is aligned word count
839 dec 4, %o3 ! avoid reading beyond tail of src
840 sub %o1, %o0, %o1 ! %o1 gets the difference
842 2: sll %o4, 24, %g5 ! save residual bytes
845 srl %o4, 8, %o5 ! merge with residual
850 sub %o1, 1, %o1 ! used three bytes of last word read
855 .w2cp: lduw [%o1], %o4
861 andn %o2, 3, %o3 ! %o3 is aligned word count
862 dec 4, %o3 ! avoid reading beyond tail of src
863 sub %o1, %o0, %o1 ! %o1 gets the difference
865 3: sll %o4, 16, %g5 ! save residual bytes
868 srl %o4, 16, %o5 ! merge with residual
873 sub %o1, 2, %o1 ! used two bytes of last word read
878 .w4cp: andn %o2, 3, %o3 ! %o3 is aligned word count
879 sub %o1, %o0, %o1 ! %o1 gets the difference
881 1: lduw [%o1+%o0], %o4 ! read from address
882 deccc 4, %o3 ! decrement count
883 st %o4, [%o0] ! write at destination address
885 inc 4, %o0 ! increment to address
886 and %o2, 3, %o2 ! number of leftover bytes, if any
888 ! simple finish up byte copy, works with any alignment
890 add %o1, %o0, %o1 ! restore %o1
895 blt,pt %ncc, .smallleft3
899 ldub [%o1], %o3 ! read byte
900 subcc %o2, 4, %o2 ! reduce count by 4
901 stb %o3, [%o0] ! write byte
902 ldub [%o1+1], %o3 ! repeat for total of 4 bytes
903 add %o1, 4, %o1 ! advance SRC by 4
906 add %o0, 4, %o0 ! advance DST by 4
909 bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain
911 addcc %o2, 3, %o2 ! restore count
913 .smallleft3: ! 1, 2, or 3 bytes remain
915 ldub [%o1], %o3 ! load one byte
917 stb %o3, [%o0] ! store one byte
918 ldub [%o1+1], %o3 ! load second byte
921 stb %o3, [%o0+1] ! store second byte
922 ldub [%o1+2], %o3 ! load third byte
923 stb %o3, [%o0+2] ! store third byte
926 mov %g1, %o0 ! restore %o0
930 bnz,pt %ncc, .smallleft3
933 mov %g1, %o0 ! restore %o0
937 lduw [%o1], %o3 ! read word
939 subcc %o2, 8, %o2 ! update count
940 stw %o3, [%o0] ! write word
941 add %o1, 8, %o1 ! update SRC
942 lduw [%o1-4], %o3 ! read word
943 add %o0, 8, %o0 ! update DST
944 bgu,pt %ncc, .smallwords ! loop until done
945 stw %o3, [%o0-4] ! write word
946 addcc %o2, 7, %o2 ! restore count
947 bz,pt %ncc, .smallexit ! check for completion
948 cmp %o2, 4 ! check for 4 or more bytes left
949 blt %ncc, .smallleft3 ! if not, go to finish up
955 bnz,pt %ncc, .smallleft3
958 mov %g1, %o0 ! restore %o0
960 ! 8 or more bytes, src and dest start on word boundary
961 ! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
963 andcc %o4, 0x7, %o5 ! test for long alignment
964 bnz,pt %ncc, .smallwordx ! branch to word aligned case
965 cmp %o2, SHORT_LONG-7
966 bge,a %ncc, .medl64 ! if we branch
967 sub %o2,56,%o2 ! adjust %o2 to -31 off count
968 sub %o1, %o0, %o1 ! %o1 gets the difference
973 bgu,pt %ncc, .small_long_l ! loop until done
974 stx %o3, [%o0-8] ! write word
975 add %o1, %o0, %o1 ! restore %o1
976 addcc %o2, 7, %o2 ! restore %o2 to correct count
977 bz,pt %ncc, .smallexit ! check for completion
978 cmp %o2, 4 ! check for 4 or more bytes left
979 blt,pt %ncc, .smallleft3 ! if not, go to finish up
986 bnz,pt %ncc, .smallleft3
989 mov %g1, %o0 ! restore %o0
992 ! src and dest start on word boundary
994 subcc %o2, 7, %o2 ! adjust count
995 bgu,pt %ncc, .smalllong
996 lduw [%o1], %o3 ! read word
997 addcc %o2, 3, %o2 ! restore count
998 bz,pt %ncc, .smallexit
999 stw %o3, [%o0] ! write word
1000 deccc %o2 ! reduce count for cc test
1001 ldub [%o1+4], %o3 ! load one byte
1002 bz,pt %ncc, .smallexit
1003 stb %o3, [%o0+4] ! store one byte
1004 ldub [%o1+5], %o3 ! load second byte
1006 bz,pt %ncc, .smallexit
1007 stb %o3, [%o0+5] ! store second byte
1008 ldub [%o1+6], %o3 ! load third byte
1009 stb %o3, [%o0+6] ! store third byte
1012 mov %g1, %o0 ! restore %o0
1017 andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
1018 brz,pt %o5, .dst_aligned_on_8
1020 ! %o5 has the bytes to be written in partial store.
1022 sub %o1, %o0, %o1 ! %o1 gets the difference
1023 7: ! dst aligning loop
1024 ldub [%o1+%o0], %o4 ! load one byte
1028 add %o0, 1, %o0 ! advance dst
1029 add %o1, %o0, %o1 ! restore %o1
1032 brnz,pt %o5, .src_dst_unaligned_on_8
1033 prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
1035 .src_dst_aligned_on_8:
1036 ! check if we are copying MED_MAX or more bytes
1037 cmp %o2, MED_MAX ! limit to store buffer size
1038 bgu,pt %ncc, .large_align8_copy
1039 prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
1041 * Special case for handling when src and dest are both long word aligned
1042 * and total data to move is less than MED_MAX bytes
1045 subcc %o2, 63, %o2 ! adjust length to allow cc test
1046 ble,pt %ncc, .medl63 ! skip big loop if less than 64 bytes
1048 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
1049 ldx [%o1], %o4 ! load
1050 subcc %o2, 64, %o2 ! decrement length count
1051 stx %o4, [%o0] ! and store
1052 ldx [%o1+8], %o3 ! a block of 64 bytes
1058 ldx [%o1+32], %o4 ! load
1059 stx %o4, [%o0+32] ! and store
1060 ldx [%o1+40], %o3 ! a block of 64 bytes
1061 add %o1, 64, %o1 ! increase src ptr by 64
1064 add %o0, 64, %o0 ! increase dst ptr by 64
1067 bgu,pt %ncc, .medl64 ! repeat if at least 64 bytes left
1070 addcc %o2, 32, %o2 ! adjust remaining count
1071 ble,pt %ncc, .medl31 ! to skip if 31 or fewer bytes left
1073 ldx [%o1], %o4 ! load
1074 sub %o2, 32, %o2 ! decrement length count
1075 stx %o4, [%o0] ! and store
1076 ldx [%o1+8], %o3 ! a block of 32 bytes
1077 add %o1, 32, %o1 ! increase src ptr by 32
1080 add %o0, 32, %o0 ! increase dst ptr by 32
1085 addcc %o2, 16, %o2 ! adjust remaining count
1086 ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
1088 ldx [%o1], %o4 ! load and store 16 bytes
1089 add %o1, 16, %o1 ! increase src ptr by 16
1091 sub %o2, 16, %o2 ! decrease count by 16
1093 add %o0, 16, %o0 ! increase dst ptr by 16
1096 addcc %o2, 15, %o2 ! restore count
1097 bz,pt %ncc, .smallexit ! exit if finished
1099 blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
1101 ldx [%o1], %o4 ! load 8 bytes
1102 add %o1, 8, %o1 ! increase src ptr by 8
1103 add %o0, 8, %o0 ! increase dst ptr by 8
1104 subcc %o2, 8, %o2 ! decrease count by 8
1106 stx %o4, [%o0-8] ! and store 8 bytes
1108 mov %g1, %o0 ! restore %o0
1111 .src_dst_unaligned_on_8:
1112 ! DST is 8-byte aligned, src is not
1114 andcc %o1, 0x3, %o5 ! test word alignment
1115 bnz,pt %ncc, .unalignsetup ! branch to skip if not word aligned
1116 prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
1119 * Handle all cases where src and dest are aligned on word
1120 * boundaries. Use unrolled loops for better performance.
1121 * This option wins over standard large data move when
1122 * source and destination is in cache for medium
1123 * to short data moves.
1125 cmp %o2, MED_WMAX ! limit to store buffer size
1126 bge,pt %ncc, .unalignrejoin ! otherwise rejoin main loop
1127 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1129 subcc %o2, 31, %o2 ! adjust length to allow cc test
1131 ble,pt %ncc, .medw31 ! skip big loop if less than 32
1132 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1134 ld [%o1], %o4 ! move a block of 32 bytes
1143 subcc %o2, 32, %o2 ! decrement length count
1146 add %o1, 32, %o1 ! increase src ptr by 32
1149 add %o0, 32, %o0 ! increase dst ptr by 32
1152 bgu,pt %ncc, .medw32 ! repeat if at least 32 bytes left
1155 addcc %o2, 31, %o2 ! restore count
1157 bz,pt %ncc, .smallexit ! exit if finished
1160 blt,pt %ncc, .medw15
1162 ld [%o1], %o4 ! move a block of 16 bytes
1163 subcc %o2, 16, %o2 ! decrement length count
1166 add %o1, 16, %o1 ! increase src ptr by 16
1169 add %o0, 16, %o0 ! increase dst ptr by 16
1174 bz,pt %ncc, .smallexit ! exit if finished
1176 blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
1178 ld [%o1], %o4 ! load 4 bytes
1179 subcc %o2, 8, %o2 ! decrease count by 8
1180 stw %o4, [%o0] ! and store 4 bytes
1181 add %o1, 8, %o1 ! increase src ptr by 8
1182 ld [%o1-4], %o3 ! load 4 bytes
1183 add %o0, 8, %o0 ! increase dst ptr by 8
1184 stw %o3, [%o0-4] ! and store 4 bytes
1185 bz,pt %ncc, .smallexit ! exit if finished
1186 .medw7: ! count is ge 1, less than 8
1187 cmp %o2, 4 ! check for 4 bytes left
1188 blt,pt %ncc, .smallleft3 ! skip if 3 or fewer bytes left
1190 ld [%o1], %o4 ! load 4 bytes
1191 add %o1, 4, %o1 ! increase src ptr by 4
1192 add %o0, 4, %o0 ! increase dst ptr by 4
1193 subcc %o2, 4, %o2 ! decrease count by 4
1195 stw %o4, [%o0-4] ! and store 4 bytes
1197 mov %g1, %o0 ! restore %o0
1200 .large_align8_copy: ! Src and dst share 8 byte alignment
1201 rd %fprs, %g5 ! check for unused fp
1202 ! if fprs.fef == 0, set it.
1203 ! Setting it when already set costs more than checking
1204 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
1206 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1208 ! align dst to 64 byte boundary
1209 andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
1210 brz,pn %o3, .aligned_to_64
1211 andcc %o0, 8, %o3 ! odd long words to move?
1212 brz,pt %o3, .aligned_to_16
1216 add %o1, 8, %o1 ! increment src ptr
1217 add %o0, 8, %o0 ! increment dst ptr
1220 andcc %o0, 16, %o3 ! pair of long words to move?
1221 brz,pt %o3, .aligned_to_32
1226 add %o1, 16, %o1 ! increment src ptr
1228 add %o0, 16, %o0 ! increment dst ptr
1231 andcc %o0, 32, %o3 ! four long words to move?
1232 brz,pt %o3, .aligned_to_64
1241 add %o1, 32, %o1 ! increment src ptr
1243 add %o0, 32, %o0 ! increment dst ptr
1246 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1247 mov %asi,%o4 ! save %asi
1248 ! Determine source alignment
1249 ! to correct 8 byte offset
1250 andcc %o1, 0x20, %o3
1251 brnz,pn %o3, .align_1
1252 mov ASI_BLK_P, %asi ! setup %asi for block load/store
1253 andcc %o1, 0x10, %o3
1254 brnz,pn %o3, .align_01
1256 andcc %o1, 0x08, %o3
1257 brz,pn %o3, .align_000
1258 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1262 andcc %o1, 0x08, %o3
1263 brnz,pn %o3, .align_011
1264 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1268 andcc %o1, 0x10, %o3
1269 brnz,pn %o3, .align_11
1271 andcc %o1, 0x08, %o3
1272 brnz,pn %o3, .align_101
1273 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1277 andcc %o1, 0x08, %o3
1278 brz,pn %o3, .align_110
1279 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1282 ! Alignment off by 8 bytes
1286 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1287 and %o2, 0x7f, %o2 ! residue bytes in %o2
1290 /* ---- copy line 1 of 2. ---- */
1291 ldda [%o1]%asi,%d16 ! block load
1299 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1301 add %o0, 64, %o0 ! advance dst
1302 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1305 /* ---- copy line 2 of 2. ---- */
1306 ldda [%o1+64]%asi,%d16
1314 add %o1, 128, %o1 ! increment src
1315 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1317 add %o0, 64, %o0 ! advance dst
1319 bgt,pt %ncc, .align_111_loop
1320 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1328 ! Alignment off by 16 bytes
1333 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1334 and %o2, 0x7f, %o2 ! residue bytes in %o2
1337 /* ---- copy line 1 of 2. ---- */
1339 ldda [%o1]%asi,%d16 ! block load
1346 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1348 add %o0, 64, %o0 ! advance dst
1350 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1353 /* ---- copy line 2 of 2. ---- */
1354 ldda [%o1+64]%asi,%d16
1361 add %o1, 128, %o1 ! increment src
1362 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1364 add %o0, 64, %o0 ! advance dst
1367 bgt,pt %ncc, .align_110_loop
1368 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1377 ! Alignment off by 24 bytes
1383 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1384 and %o2, 0x7f, %o2 ! residue bytes in %o2
1387 /* ---- copy line 1 of 2. ---- */
1389 ldda [%o1]%asi,%d16 ! block load
1395 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1397 add %o0, 64, %o0 ! advance dst
1400 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1403 /* ---- copy line 2 of 2. ---- */
1404 ldda [%o1+64]%asi,%d16
1410 add %o1, 128, %o1 ! increment src
1411 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1413 add %o0, 64, %o0 ! advance dst
1417 bgt,pt %ncc, .align_101_loop
1418 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1428 ! Alignment off by 32 bytes
1435 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1436 and %o2, 0x7f, %o2 ! residue bytes in %o2
1439 /* ---- copy line 1 of 2. ---- */
1440 ldda [%o1]%asi,%d16 ! block load
1445 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1447 add %o0, 64, %o0 ! advance dst
1451 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1454 /* ---- copy line 2 of 2. ---- */
1455 ldda [%o1+64]%asi,%d16
1460 add %o1, 128, %o1 ! increment src
1461 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1463 add %o0, 64, %o0 ! advance dst
1468 bgt,pt %ncc, .align_100_loop
1469 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1480 ! Alignment off by 40 bytes
1488 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1489 and %o2, 0x7f, %o2 ! residue bytes in %o2
1492 /* ---- copy line 1 of 2. ---- */
1494 ldda [%o1]%asi,%d16 ! block load
1498 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1500 add %o0, 64, %o0 ! advance dst
1505 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1508 /* ---- copy line 2 of 2. ---- */
1509 ldda [%o1+64]%asi,%d16
1513 add %o1, 128, %o1 ! increment src
1514 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1516 add %o0, 64, %o0 ! advance dst
1522 bgt,pt %ncc, .align_011_loop
1523 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1535 ! Alignment off by 48 bytes
1544 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1545 and %o2, 0x7f, %o2 ! residue bytes in %o2
1548 /* ---- copy line 1 of 2. ---- */
1550 ldda [%o1]%asi,%d16 ! block load
1553 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1555 add %o0, 64, %o0 ! advance dst
1561 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1564 /* ---- copy line 2 of 2. ---- */
1565 ldda [%o1+64]%asi,%d16
1568 add %o1, 128, %o1 ! increment src
1569 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1571 add %o0, 64, %o0 ! advance dst
1578 bgt,pt %ncc, .align_010_loop
1579 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1592 ! Alignment off by 56 bytes
1602 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1603 and %o2, 0x7f, %o2 ! residue bytes in %o2
1606 /* ---- copy line 1 of 2. ---- */
1608 ldda [%o1]%asi,%d16 ! block load
1610 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1612 add %o0, 64, %o0 ! advance dst
1619 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1622 /* ---- copy line 2 of 2. ---- */
1623 ldda [%o1+64]%asi,%d16
1625 add %o1, 128, %o1 ! increment src
1626 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1628 add %o0, 64, %o0 ! advance dst
1636 bgt,pt %ncc, .align_001_loop
1637 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1651 andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
1652 and %o2, 0x7f, %o2 ! residue bytes in %o2
1654 /* ---- copy line 1 of 2. ---- */
1657 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1659 prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1661 /* ---- copy line 2 of 2. ---- */
1663 ldda [%o1+64]%asi,%d0
1664 add %o1, 128, %o1 ! increment src
1665 stxa %g0,[%o0]ASI_STBI_P ! block initializing store
1667 add %o0, 64, %o0 ! increment dst
1668 bgt,pt %ncc, .align_000_loop
1669 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1674 mov %o4, %asi ! restore %asi
1681 ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
1683 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1685 rd %fprs, %g5 ! check for unused fp
1686 ! if fprs.fef == 0, set it.
1687 ! Setting it when already set costs more than checking
1688 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
1690 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1692 cmp %o2, MED_UMAX ! check for medium unaligned limit
1693 bge,pt %ncc,.unalign_large
1695 andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
1696 and %o2, 0x3f, %o2 ! residue bytes in %o2
1697 cmp %o2, 8 ! Insure we don't load beyond
1698 bgt .unalign_adjust ! end of source buffer
1699 andn
%o1
, 0x7, %o4
! %o4 has long word aligned src address
1700 add %o2
, 64, %o2
! adjust to leave loop
1701 sub %o5
, 64, %o5
! early if necessary
1703 alignaddr
%o1
, %g0
, %g0
! generate
%gsr
1704 add %o1
, %o5
, %o1
! advance
%o1 to after blocks
1708 faligndata
%d0
, %d2
, %d16
1711 faligndata
%d2
, %d4
, %d18
1714 faligndata
%d4
, %d6
, %d20
1717 faligndata
%d6
, %d8
, %d22
1720 faligndata
%d8
, %d10
, %d24
1723 faligndata
%d10
, %d12
, %d26
1726 faligndata
%d12
, %d14
, %d28
1729 faligndata
%d14
, %d0
, %d30
1730 add %o4
, BLOCK_SIZE
, %o4
1732 add %o0
, BLOCK_SIZE
, %o0
1733 subcc
%o5
, BLOCK_SIZE
, %o5
1734 bgu
,pt
%ncc
, .unalign_loop
1735 prefetch
[%o4
+ (4 * BLOCK_SIZE
)], #one_read
1740 andcc
%o0
, 0x3f, %o3
! is
dst 64-byte block aligned?
1741 bz
%ncc
, .unalignsrc
1742 sub %o3
, 64, %o3
! %o3 will
be multiple of
8
1743 neg %o3
! bytes until dest is
64 byte aligned
1744 sub %o2
, %o3
, %o2
! update cnt with bytes to
be moved
1745 ! Move bytes according to source alignment
1747 bnz
%ncc
, .unalignbyte ! check for byte alignment
1749 andcc
%o1
, 2, %o5
! check for half word alignment
1750 bnz
%ncc
, .unalignhalf
1752 ! Src is word aligned
1754 ld [%o1
], %o4
! load
4 bytes
1755 stw %o4
, [%o0
] ! and store
4 bytes
1756 ld [%o1+
4], %o4
! load
4 bytes
1757 add %o1
, 8, %o1
! increase src ptr by
8
1758 stw %o4
, [%o0+
4] ! and store
4 bytes
1759 subcc
%o3
, 8, %o3
! decrease count by
8
1760 bnz
%ncc
, .unalignword
1761 add %o0
, 8, %o0
! increase
dst ptr by
8
1765 ! Src is half-word aligned
1767 lduh
[%o1
], %o4
! load
2 bytes
1768 sllx
%o4
, 32, %o5
! shift left
1777 bnz
%ncc
, .unalignhalf
1782 ! Src is Byte aligned
1784 sub %o0
, %o1
, %o0
! share pointer advance
1801 bnz
%ncc
, .unalignbyte_loop
1803 add %o0
,%o1
, %o0
! restore pointer
1805 ! Destination is now block
(64 byte aligned
)
1807 andn
%o2
, 0x3f, %o5
! %o5 is multiple of block size
1808 and %o2
, 0x3f, %o2
! residue bytes in
%o2
1809 add %o2
, 64, %o2
! Insure we don
't load beyond
1810 sub %o5, 64, %o5 ! end of source buffer
1812 andn %o1, 0x3f, %o4 ! %o4 has block aligned src address
1813 prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
1814 alignaddr %o1, %g0, %g0 ! generate %gsr
1815 add %o1, %o5, %o1 ! advance %o1 to after blocks
1817 ! Determine source alignment to correct 8 byte offset
1818 andcc %o1, 0x20, %o3
1819 brnz,pn %o3, .unalign_1
1821 andcc %o1, 0x10, %o3
1822 brnz,pn %o3, .unalign_01
1824 andcc %o1, 0x08, %o3
1825 brz,a %o3, .unalign_000
1826 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1828 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1830 andcc %o1, 0x08, %o3
1831 brnz,a %o3, .unalign_011
1832 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1834 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1836 andcc %o1, 0x10, %o3
1837 brnz,pn %o3, .unalign_11
1839 andcc %o1, 0x08, %o3
1840 brnz,a %o3, .unalign_101
1841 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1843 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1845 andcc %o1, 0x08, %o3
1846 brz,pn %o3, .unalign_110
1847 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1853 ldda [%o4]ASI_BLK_P, %d16
1854 faligndata %d14, %d16, %d48
1855 faligndata %d16, %d18, %d50
1856 faligndata %d18, %d20, %d52
1857 faligndata %d20, %d22, %d54
1858 faligndata %d22, %d24, %d56
1859 faligndata %d24, %d26, %d58
1860 faligndata %d26, %d28, %d60
1861 faligndata %d28, %d30, %d62
1863 stda %d48, [%o0]ASI_BLK_P
1866 bgu,pt %ncc, .unalign_111_loop
1867 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1876 ldda [%o4]ASI_BLK_P, %d16
1877 faligndata %d12, %d14, %d48
1878 faligndata %d14, %d16, %d50
1879 faligndata %d16, %d18, %d52
1880 faligndata %d18, %d20, %d54
1881 faligndata %d20, %d22, %d56
1882 faligndata %d22, %d24, %d58
1883 faligndata %d24, %d26, %d60
1884 faligndata %d26, %d28, %d62
1887 stda %d48, [%o0]ASI_BLK_P
1890 bgu,pt %ncc, .unalign_110_loop
1891 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1901 ldda [%o4]ASI_BLK_P, %d16
1902 faligndata %d10, %d12, %d48
1903 faligndata %d12, %d14, %d50
1904 faligndata %d14, %d16, %d52
1905 faligndata %d16, %d18, %d54
1906 faligndata %d18, %d20, %d56
1907 faligndata %d20, %d22, %d58
1908 faligndata %d22, %d24, %d60
1909 faligndata %d24, %d26, %d62
1913 stda %d48, [%o0]ASI_BLK_P
1916 bgu,pt %ncc, .unalign_101_loop
1917 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1928 ldda [%o4]ASI_BLK_P, %d16
1929 faligndata %d8, %d10, %d48
1930 faligndata %d10, %d12, %d50
1931 faligndata %d12, %d14, %d52
1932 faligndata %d14, %d16, %d54
1933 faligndata %d16, %d18, %d56
1934 faligndata %d18, %d20, %d58
1935 faligndata %d20, %d22, %d60
1936 faligndata %d22, %d24, %d62
1941 stda %d48, [%o0]ASI_BLK_P
1944 bgu,pt %ncc, .unalign_100_loop
1945 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1957 ldda [%o4]ASI_BLK_P, %d16
1958 faligndata %d6, %d8, %d48
1959 faligndata %d8, %d10, %d50
1960 faligndata %d10, %d12, %d52
1961 faligndata %d12, %d14, %d54
1962 faligndata %d14, %d16, %d56
1963 faligndata %d16, %d18, %d58
1964 faligndata %d18, %d20, %d60
1965 faligndata %d20, %d22, %d62
1971 stda %d48, [%o0]ASI_BLK_P
1974 bgu,pt %ncc, .unalign_011_loop
1975 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1988 ldda [%o4]ASI_BLK_P, %d16
1989 faligndata %d4, %d6, %d48
1990 faligndata %d6, %d8, %d50
1991 faligndata %d8, %d10, %d52
1992 faligndata %d10, %d12, %d54
1993 faligndata %d12, %d14, %d56
1994 faligndata %d14, %d16, %d58
1995 faligndata %d16, %d18, %d60
1996 faligndata %d18, %d20, %d62
2003 stda %d48, [%o0]ASI_BLK_P
2006 bgu,pt %ncc, .unalign_010_loop
2007 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2021 ldda [%o4]ASI_BLK_P, %d16
2022 faligndata %d2, %d4, %d48
2023 faligndata %d4, %d6, %d50
2024 faligndata %d6, %d8, %d52
2025 faligndata %d8, %d10, %d54
2026 faligndata %d10, %d12, %d56
2027 faligndata %d12, %d14, %d58
2028 faligndata %d14, %d16, %d60
2029 faligndata %d16, %d18, %d62
2037 stda %d48, [%o0]ASI_BLK_P
2040 bgu,pt %ncc, .unalign_001_loop
2041 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2046 ldda [%o4]ASI_BLK_P, %d0
2049 ldda [%o4]ASI_BLK_P, %d16
2050 faligndata %d0, %d2, %d48
2051 faligndata %d2, %d4, %d50
2052 faligndata %d4, %d6, %d52
2053 faligndata %d6, %d8, %d54
2054 faligndata %d8, %d10, %d56
2055 faligndata %d10, %d12, %d58
2056 faligndata %d12, %d14, %d60
2057 faligndata %d14, %d16, %d62
2066 stda %d48, [%o0]ASI_BLK_P
2069 bgu,pt %ncc, .unalign_000_loop
2070 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2074 ! Handle trailing bytes, 64 to 127
2075 ! Dest long word aligned, Src not long word aligned
2077 bleu %ncc, .unalign_short
2079 andn %o2, 0x7, %o5 ! %o5 is multiple of 8
2080 and %o2, 0x7, %o2 ! residue bytes in %o2
2082 sub %o5, 8, %o5 ! insure we don't load past end of src
2083 andn
%o1
, 0x7, %o4
! %o4 has long word aligned src address
2084 add %o1
, %o5
, %o1
! advance
%o1 to after multiple of
8
2085 ldd
[%o4
], %d0
! fetch partial word
2089 faligndata
%d0
, %d2
, %d16
2093 bgu
,pt
%ncc
, .unalign_by8
2097 brnz
%g5
, .smallrest
2101 #else /* NIAGARA2_IMPL */
2103 mov
%o0
, %g5
! save des address for return val
2104 cmp %o2
, 17 ! for small counts copy bytes
2105 bleu
,pt
%ncc
, .dbytecp
2108 cmp %o2
, 0x80 ! For lengths less than
128 bytes no
2109 bleu
,pn
%ncc
, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2112 * Make sure that source and destination buffers are 64 bytes apart.
2113 * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy
2117 blu
%ncc
, .blkalgndst
2118 cmp %o3
, 0x40 ! if src
- dst >= 0x40
2119 bgeu
,pt
%ncc
, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P
2121 andcc
%o1
, 3, %o5
! is src word aligned
2123 cmp %o5
, 2 ! is src half-word aligned
2125 cmp %o5
, 3 ! src is byte aligned
2126 .s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it
2128 stb %o3
, [%g5
] ! move
a byte to align src
2130 bne,pt
%ncc
, .s2algn
2132 b .ald ! now go align dest
2135 .s2algn:lduh [%o1], %o3 ! know src is 2 byte alinged
2138 stb %o4
, [%g5
] ! have to do bytes
,
2139 stb %o3
, [%g5
+ 1] ! don
't know dst alingment
2143 .aldst: andcc %g5, 3, %o5 ! align the destination address
2144 .ald: bz,pn %ncc, .w4cp
2148 .w3cp: lduw [%o1], %o4
2155 andn %o2, 3, %o3 ! o3 is aligned word count
2156 dec 4, %o3 ! avoid reading beyond tail of src
2157 sub %o1, %g5, %o1 ! o1 gets the difference
2159 1: sll %o4, 8, %g1 ! save residual bytes
2162 srl %o4, 24, %o5 ! merge with residual
2167 sub %o1, 3, %o1 ! used one byte of last word read
2172 .w1cp: srl %o4, 8, %o5
2176 andn %o2, 3, %o3 ! o3 is aligned word count
2177 dec 4, %o3 ! avoid reading beyond tail of src
2178 sub %o1, %g5, %o1 ! o1 gets the difference
2180 2: sll %o4, 24, %g1 ! save residual bytes
2183 srl %o4, 8, %o5 ! merge with residual
2188 sub %o1, 1, %o1 ! used three bytes of last word read
2193 .w2cp: lduw [%o1], %o4
2199 andn %o2, 3, %o3 ! o3 is aligned word count
2200 dec 4, %o3 ! avoid reading beyond tail of src
2201 sub %o1, %g5, %o1 ! o1 gets the difference
2203 3: sll %o4, 16, %g1 ! save residual bytes
2206 srl %o4, 16, %o5 ! merge with residual
2211 sub %o1, 2, %o1 ! used two bytes of last word read
2216 .w4cp: andn %o2, 3, %o3 ! o3 is aligned word count
2217 sub %o1, %g5, %o1 ! o1 gets the difference
2219 1: lduw [%o1+%g5], %o4 ! read from address
2220 deccc 4, %o3 ! decrement count
2221 st %o4, [%g5] ! write at destination address
2223 inc 4, %g5 ! increment to address
2225 and %o2, 3, %o2 ! number of leftover bytes, if any
2228 ! differenced byte copy, works with any alignment
2232 sub %o1, %g5, %o1 ! o1 gets the difference
2234 4: stb %o4, [%g5] ! write to address
2235 inc %g5 ! inc to address
2236 7: deccc %o2 ! decrement count
2237 bgeu,a,pt %ncc,4b ! loop till done
2238 ldub [%o1+%g5], %o4 ! read from address
2239 retl ! %o0 was preserved
2243 save %sp, -SA(MINFRAME), %sp
2245 ! Block (64 bytes) align the destination.
2246 andcc %i0, 0x3f, %i3 ! is dst block aligned
2247 bz %ncc, .chksrc ! dst already block aligned
2249 neg %i3 ! bytes till dst 64 bytes aligned
2250 sub %i2, %i3, %i2 ! update i2 with new count
2252 ! Based on source and destination alignment do
2253 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2255 ! Is dst & src 8B aligned
2261 ! Is dst & src 4B aligned
2266 ! Is dst & src 2B aligned
2282 ! dst & src 4B aligned
2288 bgu,pt %ncc, .alwdcp
2294 ! dst & src 2B aligned
2300 bgu,pt %ncc, .alhlfwdcp
2306 ! dst & src 8B aligned
2312 bgu,pt %ncc, .alewdcp
2315 ! Now Destination is block (64 bytes) aligned
2317 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
2318 sub %i2, %i3, %i2 ! Residue bytes in %i2
2319 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2320 andcc %i1, 0xf, %l1 ! is src quadword aligned
2321 bz,pn %ncc, .blkcpy ! src offset in %l1
2324 bgu %ncc, .cpy_upper_double
2326 blu %ncc, .cpy_lower_double
2329 ! Falls through when source offset is equal to 8 i.e.
2330 ! source is double word aligned.
2331 ! In this case no shift/merge of data is required
2332 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2333 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2334 prefetch [%o0+0x0], #one_read
2335 ldda [%i1+0x0]%asi, %o2
2337 ldda [%i1+0x10]%asi, %o4
2338 prefetch [%o0+0x40], #one_read
2340 stxa %o3, [%i0+0x0]%asi
2341 stxa %o4, [%i0+0x8]%asi
2343 ldda [%i1+0x20]%asi, %o2
2344 stxa %o5, [%i0+0x10]%asi
2345 stxa %o2, [%i0+0x18]%asi
2347 ldda [%i1+0x30]%asi, %o4
2348 stxa %o3, [%i0+0x20]%asi
2349 stxa %o4, [%i0+0x28]%asi
2351 ldda [%i1+0x40]%asi, %o2
2352 stxa %o5, [%i0+0x30]%asi
2353 stxa %o2, [%i0+0x38]%asi
2357 subcc %i3, 0x40, %i3
2361 add %i1, %l1, %i1 ! increment the source by src offset
2364 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2365 sll %l1, 3, %l2 ! %l2 left shift
2367 sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
2368 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2369 prefetch [%o0+0x0], #one_read
2370 ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has
2373 ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read.
2374 ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4
2376 prefetch [%o0+0x40], #one_read
2377 stxa %o2, [%i0+0x0]%asi
2378 stxa %o3, [%i0+0x8]%asi
2380 ldda [%i1+0x20]%asi, %o2
2381 ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and
2382 stxa %o4, [%i0+0x10]%asi ! %o4 from previous read
2383 stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5
2385 ! Repeat the same for next 32 bytes.
2387 ldda [%i1+0x30]%asi, %o4
2388 ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
2389 stxa %o2, [%i0+0x20]%asi
2390 stxa %o3, [%i0+0x28]%asi
2392 ldda [%i1+0x40]%asi, %o2
2393 ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
2394 stxa %o4, [%i0+0x30]%asi
2395 stxa %o5, [%i0+0x38]%asi
2399 subcc %i3, 0x40, %i3
2403 add %i1, %l1, %i1 ! increment the source by src offset
2406 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2409 sll %l2, 3, %l2 ! %l2 left shift
2411 sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
2412 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2413 prefetch [%o0+0x0], #one_read
2414 ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and
2417 ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has
2419 ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5
2421 prefetch [%o0+0x40], #one_read
2422 stxa %o3, [%i0+0x0]%asi
2423 stxa %o4, [%i0+0x8]%asi
2425 ldda [%i1+0x20]%asi, %o2
2426 ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with
2427 stxa %o5, [%i0+0x10]%asi ! %o5 from previous read
2428 stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2
2430 ! Repeat the same for next 32 bytes.
2432 ldda [%i1+0x30]%asi, %o4
2433 ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
2434 stxa %o3, [%i0+0x20]%asi
2435 stxa %o4, [%i0+0x28]%asi
2437 ldda [%i1+0x40]%asi, %o2
2438 ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
2439 stxa %o5, [%i0+0x30]%asi
2440 stxa %o2, [%i0+0x38]%asi
2444 subcc %i3, 0x40, %i3
2448 add %i1, %l1, %i1 ! increment the source by src offset
2450 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2452 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2453 prefetch [%o0+0x0], #one_read
2455 prefetch [%o0+0x40], #one_read
2457 ldda [%i1+0x0]%asi, %o2
2458 ldda [%i1+0x10]%asi, %o4
2460 stxa %o2, [%i0+0x0]%asi
2461 stxa %o3, [%i0+0x8]%asi
2462 stxa %o4, [%i0+0x10]%asi
2463 stxa %o5, [%i0+0x18]%asi
2465 ldda [%i1+0x20]%asi, %o2
2466 ldda [%i1+0x30]%asi, %o4
2468 stxa %o2, [%i0+0x20]%asi
2469 stxa %o3, [%i0+0x28]%asi
2470 stxa %o4, [%i0+0x30]%asi
2471 stxa %o5, [%i0+0x38]%asi
2475 subcc %i3, 0x40, %i3
2482 mov ASI_PNF, %asi ! restore %asi to default
2483 ! ASI_PRIMARY_NOFAULT value
2485 bz,pt %ncc, .blkexit
2488 ! Handle trailing bytes
2490 blu,pt %ncc, .residue
2493 ! Can we do some 8B ops
2499 ! Do 8byte ops as long as possible
2510 bz,pt %ncc, .blkexit
2531 bz,pt %ncc, .blkexit
2553 bz,pt %ncc, .blkexit
2561 bgu,pt %ncc, .residue
2567 restore %g5, %g0, %o0
2569 #endif /* NIAGARA2_IMPL */
2571 SET_SIZE(__align_cpy_1)