/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Added entry __align_cpy_1 is generally for use of the compilers.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 * N1 Flow :
 *
 * if (count < 17) {
 *	Do the byte copy
 *	Return destination address
 * }
 * if (count < 128) {
 *	Is source aligned on word boundary
 *	If no then align source on word boundary then goto .ald
 *	If yes goto .ald
 *	.ald:
 *	Is destination aligned on word boundary
 *	Depending on destination offset (last 2 bits of destination)
 *	copy data by shifting and merging.
 *	Copy residue bytes as byte copy
 *	Return destination address
 * } else {
 *	Align destination on block boundary
 *	Depending on the source offset (last 4 bits of source address) align
 *	the data and store to destination. Both the load and store are done
 *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *	For remaining count copy as much data in 8-byte chunks from source to
 *	destination.
 *	Followed by trailing copy using byte copy.
 *	Return saved destination address
 * }
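 *
 * Illustrative sketch (not from the original source): the N1 dispatch
 * above, expressed in C. byte_copy, word_copy and block_copy are
 * hypothetical stand-ins for the three assembly paths described.
 *
 *	void *
 *	n1_memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		if (n < 17)
 *			return (byte_copy(dst, src, n));
 *		if (n < 128)
 *			return (word_copy(dst, src, n));
 *		return (block_copy(dst, src, n));
 *	}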
 *
 * N2 Flow :
 *
 * if (count < 128) {
 *	if count < 3
 *		copy bytes; exit with dst addr
 *	if src & dst aligned on word boundary but not long word boundary,
 *		copy with ldw/stw; branch to finish_up
 *	if src & dst aligned on long word boundary
 *		copy with ldx/stx; branch to finish_up
 *	if src & dst not aligned and length <= 14
 *		copy bytes; exit with dst addr
 *	move enough bytes to get src to word boundary
 *	if dst now on word boundary
 * move_words:
 *		copy words; branch to finish_up
 *	if dst now on half word boundary
 *		load words, shift half words, store words; branch to finish_up
 *	if dst on byte 1
 *		load words, shift 3 bytes, store words; branch to finish_up
 *	if dst on byte 3
 *		load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *		copy bytes; exit with dst addr
 * } else {	More than 128 bytes
 *	move bytes until dst is on long word boundary
 *	if (src is on long word boundary) {
 *		if (count < 512) {
 * finish_long:		src/dst aligned on 8 bytes
 *			copy with ldx/stx in 8-way unrolled loop;
 *			copy final 0-63 bytes; exit with dst addr
 *		} else {	src/dst aligned; count > 512
 *			align dst on 64 byte boundary; use 8-way test for
 *			each of 8 possible src alignments relative to a 64
 *			byte boundary to select the 16-way unrolled loop
 *			to use for block load, fmovd, block-init-store,
 *			block-store, fmovd operations
 *			then go to finish_long.
 *		}
 *	} else {	src/dst not aligned on 8 bytes
 *		if src is word aligned and count < 512
 *			move words in 8-way unrolled loop
 *			move final 0-31 bytes; exit with dst addr
 *		if count < 512
 *			use alignaddr/faligndata combined with ldd/std in
 *			8-way unrolled loop to move data.
 *			go to unalign_done
 *		else
 *			setup alignaddr for faligndata instructions
 *			align dst on 64 byte boundary; use 8-way test for
 *			each of 8 possible src alignments to nearest long
 *			word relative to 64 byte boundary to select the
 *			8-way unrolled loop to use for block load, falign,
 *			fmovd, block-init-store, block-store loop
 *			(only use block-init-store when src/dst on 8 byte
 *			boundaries.)
 * unalign_done:
 *			move remaining bytes for unaligned cases.
 *			exit with dst addr.
 *	}
 * }
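 *
 * Illustrative sketch (not from the original source): the shift-and-merge
 * step the N2 flow uses when dst lands on a half-word boundary, in C
 * (big-endian, as on SPARC). The real code trims the count first so the
 * trailing read does not run past the source; names are hypothetical.
 *
 *	void
 *	shifted_word_copy_sketch(uint32_t *dst, const uint32_t *src,
 *	    size_t nwords)
 *	{
 *		uint32_t prev = *src++;
 *		while (nwords-- != 0) {
 *			uint32_t next = *src++;
 *			*dst++ = (prev << 16) | (next >> 16);
 *			prev = next;
 *		}
 *	}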
 *
 * Comment on N2 memmove and memcpy common code and block-store-init:
 * In the man page for memmove, it specifies that copying will take place
 * correctly between objects that overlap. For memcpy, behavior is
 * undefined for objects that overlap.
 *
 * In rare cases, some multi-threaded applications may attempt to examine
 * the copy destination buffer during the copy. Using the block-store-init
 * instruction allows those applications to observe zeros in some
 * cache lines of the destination buffer for narrow windows. But
 * block-store-init provides memory throughput advantages for many
 * common applications. To meet both needs, those applications which need
 * the destination buffer to retain meaning during the copy should use
 * memmove instead of memcpy. The memmove version duplicates the memcpy
 * algorithms except the memmove version does not use block-store-init
 * in those cases where memcpy does use block-store-init. Otherwise, when
 * memmove can determine the source and destination do not overlap,
 * memmove shares the memcpy code.
 */
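
/*
 * Illustrative sketch (not from the original source): the entry decision
 * memmove makes below (cmp/bgeu/bleu at ENTRY(memmove)), in C. Copying
 * runs forward whenever the regions cannot collide; forward_copy and
 * backward_copy are hypothetical stand-ins for the shared memcpy path
 * and the backward loop.
 *
 *	void *
 *	memmove_sketch(void *dst, const void *src, size_t n)
 *	{
 *		if ((uintptr_t)src >= (uintptr_t)dst ||
 *		    n <= (uintptr_t)dst - (uintptr_t)src)
 *			return (forward_copy(dst, src, n));
 *		return (backward_copy(dst, src, n));
 *	}
 */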

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>

/* documented name for primary block initializing store */
#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P

#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 32 */
#define	SMALL_MAX	128
#define	MED_UMAX	512	/* max copy for medium un-aligned case */
#define	MED_WMAX	512	/* max copy for medium word-aligned case */
#define	MED_MAX		512	/* max copy for medium longword-aligned case */

#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>

#else	/* NIAGARA2_IMPL */
/*
 * This define aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1	;\
	sllx	data2, lshift, data2	;\
	srlx	data3, rshift, tmp	;\
	or	data2, tmp, data2

/*
 * Align the data. Merge data1 and data2 into data1.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1
#endif	/* NIAGARA2_IMPL */
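
/*
 * Illustrative sketch (not from the original source): what ALIGN_DATA_EW
 * computes, in C. Each destination long word is assembled from two
 * adjacent aligned source words, with lshift + rshift == 64. ALIGN_DATA
 * applies the same merge to (data1, data2) and (data2, data3), keeping
 * data3 as the carry for the next iteration.
 *
 *	uint64_t
 *	align_data_ew_sketch(uint64_t d1, uint64_t d2,
 *	    unsigned lshift, unsigned rshift)
 *	{
 *		return ((d1 << lshift) | (d2 >> rshift));
 *	}
 */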

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to, use forward copy
	bgeu,pn	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu,pn	%ncc, .forcpy	! if size fits in the gap, no overlap: forward copy
	add	%o1, %o2, %o5	! get to end of source space

	! an overlapped copy that must be done "backwards"
.chksize:
	cmp	%o2, 8			! if less than 8 bytes, do byte copy
	blu,pt	%ncc, 2f		! else continue

	! Now size is bigger than 8
.dbalign:
	add	%o0, %o2, %g1		! get to end of dest space
	andcc	%g1, 7, %o3		! %o3 has bytes till dst 8 bytes aligned
	bz,a,pn	%ncc, .dbbck		! if dst already 8 byte aligned, skip the align loop
	andn	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	sub	%o2, %o3, %o2		! update o2 with new count

1:	dec	%o5			! decrement source
	ldub	[%o5], %g1		! load one byte
	deccc	%o3			! decrement count
	bgu,pt	%ncc, 1b		! if not done keep copying
	stb	%g1, [%o5+%o4]		! store one byte into dest
	andncc	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	bz,pn	%ncc, 2f		! if size < 8, move to byte copy

	! Now destination is 8 byte aligned
.dbbck:
	andcc	%o5, 7, %o0		! %o0 has src offset
	bz,a,pn	%ncc, .dbcopybc		! if src is aligned, do fast mem move
	sub	%o2, %o3, %o2		! residue bytes in %o2

.cpy_dbwdbc:				! alignment of src is needed
	sub	%o2, 8, %o2		! set size one loop ahead
	sll	%o0, 3, %g1		! %g1 is left shift
	mov	64, %g5			! init %g5 to be 64
	sub	%g5, %g1, %g5		! %g5 right shift = (64 - left shift)
	sub	%o5, %o0, %o5		! align the src at 8 bytes.
	add	%o4, %o0, %o4		! increase difference between src & dst
	ldx	[%o5], %o1		! load first 8 bytes
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		! subtract 8 from src
	ldx	[%o5], %o0		! load 8 bytes
	sllx	%o0, %g1, %o3		! shift loaded 8 bytes left into tmp reg
	or	%o1, %o3, %o3		! align data
	stx	%o3, [%o5+%o4]		! store 8 bytes
	subcc	%o2, 8, %o2		! subtract 8 bytes from size
	bg,pt	%ncc, 1b		! if size > 0 continue
	srlx	%o0, %g5, %o1		! move extra byte for the next use

	srl	%g1, 3, %o0		! restore %o0 value for alignment
	add	%o5, %o0, %o5		! restore src alignment
	sub	%o4, %o0, %o4		! restore difference between src & dest

	ba	2f			! branch to the trailing byte copy
	add	%o2, 8, %o2		! restore size value

.dbcopybc:				! alignment of src is not needed
1:	sub	%o5, 8, %o5		! subtract from src
	ldx	[%o5], %g1		! load 8 bytes
	subcc	%o3, 8, %o3		! subtract from size
	bgu,pt	%ncc, 1b		! if size is bigger than 0, continue
	stx	%g1, [%o5+%o4]		! store 8 bytes to destination

	ba	2f
	nop
.bcbyte:
1:	ldub	[%o5], %g1		! load one byte
	stb	%g1, [%o5+%o4]		! store one byte
2:	deccc	%o2			! decrement size
	bgeu,a,pt %ncc, 1b		! if size is >= 0 continue
	dec	%o5			! decrement from address

.exitbc:				! exit from backward copy
	retl
	add	%o5, %o4, %o0		! restore dest addr
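
/*
 * Illustrative sketch (not from the original source): the backward
 * shift-and-merge loop above (.cpy_dbwdbc), in C (big-endian, as on
 * SPARC). It walks from the high end, reading aligned 8-byte words and
 * merging neighbours into each destination word; like the assembly, it
 * pre-reads one aligned word above the region. Names are hypothetical.
 *
 *	void
 *	backward_merge_sketch(uint64_t *dst, const uint64_t *src,
 *	    size_t nwords, unsigned lshift)
 *	{
 *		unsigned rshift = 64 - lshift;
 *		uint64_t prev = src[nwords] >> rshift;
 *		while (nwords-- != 0) {
 *			dst[nwords] = (src[nwords] << lshift) | prev;
 *			prev = src[nwords] >> rshift;
 *		}
 *	}
 */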

#ifdef NIAGARA2_IMPL

	! Check to see if memmove is large aligned copy
	! If so, use special version of copy that avoids
	! use of block store init
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	blt,pn	%ncc, .mv_short		! merge with memcpy
	mov	%o0, %g1		! save %o0
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .mv_dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.mv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.mv_src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bleu,pt	%ncc, .medlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * The following memmove code mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization. See the memmove rationale in the block comment above.
 */
.mv_large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .mv_aligned_on_64
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%o2, %o3, %o2		! adjust remaining count
.mv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		! increment src ptr
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .mv_align_to_64
	add	%o0, 8, %o0		! increment dst ptr

.mv_aligned_on_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi, %o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .mv_align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_001
	nop
.mv_align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_010
	nop
.mv_align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_100
	nop
.mv_align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.mv_align_111:
	! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .mv_align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF mv_align_111

.mv_align_110:
	! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .mv_align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF mv_align_110

.mv_align_101:
	! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .mv_align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF mv_align_101

.mv_align_100:
	! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .mv_align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF mv_align_100

.mv_align_011:
	! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .mv_align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF mv_align_011

.mv_align_010:
	! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .mv_align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF mv_align_010

.mv_align_001:
	! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d14
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .mv_align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF mv_align_001

.mv_align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi, %d0
	stda	%d0, [%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi, %d0
	add	%o1, 128, %o1		! increment src
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .mv_align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	ba	.remain_stuff
	nop
	! END OF mv_align_000

#else	/* NIAGARA2_IMPL */
#endif	/* NIAGARA2_IMPL */

	SET_SIZE(memmove)

	ENTRY(memcpy)
	ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
	cmp	%o2, SMALL_MAX		! check for not small case
	bgeu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
.mv_short:
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallfin
	or	%o0, %o1, %o4		! prepare alignment check
	andcc	%o4, 0x3, %o5		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	cmp	%o2, SHORTCHECK
	ble,pt	%ncc, .smallrest
	andcc	%o1, 0x3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%o0]		! move a byte to align src
	inc	1, %o0
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%o0, 0x3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		! have to do bytes,
	stb	%o3, [%o0 + 1]		! don't know dst alignment
	inc	2, %o0
	dec	2, %o2

.aldst:	andcc	%o0, 0x3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	be,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%ncc, .w1cp
	inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	sll	%o4, 8, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 1b
	inc	4, %o0
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

2:	sll	%o4, 24, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 2b
	inc	4, %o0
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

3:	sll	%o4, 16, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 3b
	inc	4, %o0
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! %o3 is aligned word count
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	lduw	[%o1+%o0], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%o0]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %o0			! increment to address
	and	%o2, 3, %o2		! number of leftover bytes, if any

	! simple finish up byte copy, works with any alignment
7:
	add	%o1, %o0, %o1		! restore %o1
.smallrest:
	tst	%o2
	bz,pt	%ncc, .smallx
	cmp	%o2, 4
	blt,pt	%ncc, .smallleft3
	nop
	sub	%o2, 3, %o2
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallx
.smallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.smallx:
	retl
	mov	%g1, %o0		! restore %o0

.smallfin:
	tst	%o2
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	stw	%o3, [%o0-4]
	retl
	mov	%g1, %o0		! restore %o0

	! 8 or more bytes, src and dest start on word boundary
	! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
.smalllong:
	andcc	%o4, 0x7, %o5		! test for long alignment
	bnz,pt	%ncc, .smallwordx	! branch to word aligned case
	cmp	%o2, SHORT_LONG-7
	bge,a	%ncc, .medl64		! if we branch
	sub	%o2, 56, %o2		! adjust %o2 to -31 off count
	sub	%o1, %o0, %o1		! %o1 gets the difference
.small_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pt	%ncc, .small_long_l	! loop until done
	stx	%o3, [%o0-8]		! write word
	add	%o1, %o0, %o1		! restore %o1
	addcc	%o2, 7, %o2		! restore %o2 to correct count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt,pt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
	! src and dest start on word boundary
.smallword:
	subcc	%o2, 7, %o2		! adjust count
	bgu,pt	%ncc, .smalllong
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.medium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .large_align8_copy
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.medlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .medl63		! skip big loop if less than 64 bytes
.medl64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
	ldx	[%o1], %o4		! load
	subcc	%o2, 64, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 64 bytes
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		! load
	stx	%o4, [%o0+32]		! and store
	ldx	[%o1+40], %o3		! a block of 64 bytes
	add	%o1, 64, %o1		! increase src ptr by 64
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		! increase dst ptr by 64
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl64		! repeat if at least 64 bytes left
	stx	%o3, [%o0-8]
.medl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%ncc, .medl31		! to skip if 31 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load
	sub	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pt	%ncc, .medw7
	stx	%o4, [%o0-8]		! and store 8 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.src_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%ncc, .unalignsetup	! branch to skip if not word aligned
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
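
/*
 * Illustrative sketch (not from the original source): the 8-way unrolled
 * word copy the .medw32 loop below performs, in C. Unrolling amortizes
 * the branch and count updates over 32 bytes per iteration; dst and src
 * are uint32_t pointers and n counts bytes.
 *
 *	while (n >= 32) {
 *		dst[0] = src[0]; dst[1] = src[1];
 *		dst[2] = src[2]; dst[3] = src[3];
 *		dst[4] = src[4]; dst[5] = src[5];
 *		dst[6] = src[6]; dst[7] = src[7];
 *		src += 8; dst += 8;
 *		n -= 32;
 *	}
 */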
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bge,pt	%ncc, .unalignrejoin	! take the large unaligned path if too big
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw31		! skip big loop if fewer than 32 bytes
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.medw32:
	ld	[%o1], %o4		! move a block of 32 bytes
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	stw	%o3, [%o0+4]
	ld	[%o1+8], %o4
	stw	%o4, [%o0+8]
	ld	[%o1+12], %o3
	stw	%o3, [%o0+12]
	ld	[%o1+16], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o0+16]
	ld	[%o1+20], %o3
	add	%o1, 32, %o1		! increase src ptr by 32
	stw	%o3, [%o0+20]
	ld	[%o1-8], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left
	stw	%o3, [%o0-4]
.medw31:
	addcc	%o2, 31, %o2		! restore count
	bz,pt	%ncc, .smallexit	! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%ncc, .medw15
	nop
	ld	[%o1], %o4		! move a block of 16 bytes
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	stw	%o3, [%o0-4]
.medw15:
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz,pt	%ncc, .smallexit	! exit if finished
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pt	%ncc, .smallleft3	! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.smallleft3
	stw	%o4, [%o0-4]		! and store 4 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .aligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .aligned_to_16
	nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .aligned_to_32
	nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .aligned_to_64
	nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi, %o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_001
	nop
.align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_010
	nop
.align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_100
	nop
.align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.align_111:
	! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF align_111

.align_110:
	! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF align_110

.align_101:
	! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF align_101

.align_100:
	! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF align_100

.align_011:
	! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF align_011

.align_010:
	! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF align_010

.align_001:
	! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi, %d16		! block load
	fmovd	%d16, %d14
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi, %d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF align_001

.align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi, %d0
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi, %d0
	add	%o1, 128, %o1		! increment src
	stxa	%g0, [%o0]ASI_STBI_P	! block initializing store
	stda	%d0, [%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	! END OF align_000

.remain_stuff:
	mov	%o4, %asi		! restore %asi
	brnz	%g5, .medlong
	membar	#Sync
	ba	.medlong
	wr	%g5, %g0, %fprs

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.unalignsetup:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.unalignrejoin:
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	cmp	%o2, MED_UMAX		! check for medium unaligned limit
	bge,pt	%ncc, .unalign_large
	nop
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! ensure we don't load beyond
	bgt	.unalign_adjust		! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.unalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	ldd	[%o4], %d0
.unalign_loop:
	ldd	[%o4+8], %d2
	faligndata %d0, %d2, %d16
	ldd	[%o4+16], %d4
	std	%d16, [%o0]
	faligndata %d2, %d4, %d18
	ldd	[%o4+24], %d6
	std	%d18, [%o0+8]
	faligndata %d4, %d6, %d20
	ldd	[%o4+32], %d8
	std	%d20, [%o0+16]
	faligndata %d6, %d8, %d22
	ldd	[%o4+40], %d10
	std	%d22, [%o0+24]
	faligndata %d8, %d10, %d24
	ldd	[%o4+48], %d12
	std	%d24, [%o0+32]
	faligndata %d10, %d12, %d26
	ldd	[%o4+56], %d14
	std	%d26, [%o0+40]
	faligndata %d12, %d14, %d28
	ldd	[%o4+64], %d0
	std	%d28, [%o0+48]
	faligndata %d14, %d0, %d30
	add	%o4, BLOCK_SIZE, %o4
	std	%d30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	subcc	%o5, BLOCK_SIZE, %o5
	bgu,pt	%ncc, .unalign_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	nop

.unalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .unalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%ncc, .unalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%ncc, .unalignhalf
	nop
	! Src is word aligned
.unalignword:
	ld	[%o1], %o4		! load 4 bytes
	stw	%o4, [%o0]		! and store 4 bytes
	ld	[%o1+4], %o4		! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stw	%o4, [%o0+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .unalignword
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.unalignsrc
	nop

	! Src is half-word aligned
.unalignhalf:
	lduh	[%o1], %o4		! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	lduw	[%o1+2], %o4
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	lduh	[%o1+6], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignhalf
	add	%o0, 8, %o0
	ba	.unalignsrc
	nop

	! Src is byte aligned
.unalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.unalignbyte_loop:
	ldub	[%o1], %o4
	sllx	%o4, 56, %o5
	lduh	[%o1+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	ldub	[%o1+7], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+%o1]
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer
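
/*
 * Illustrative sketch (not from the original source): the gather step in
 * .unalignbyte_loop above, in C (big-endian, as on SPARC). With src on an
 * odd byte, each aligned 8-byte store is built from a byte, three
 * half-words, and a byte, so every load stays naturally aligned; load16
 * and store64 are hypothetical aligned 2- and 8-byte accesses.
 *
 *	uint64_t w;
 *	w  = (uint64_t)p[0] << 56;
 *	w |= (uint64_t)load16(p + 1) << 40;
 *	w |= (uint64_t)load16(p + 3) << 24;
 *	w |= (uint64_t)load16(p + 5) << 8;
 *	w |= (uint64_t)p[7];
 *	store64(q, w);
 */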

	! Destination is now block (64 byte) aligned
.unalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! ensure we don't load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	! Determine source alignment to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .unalign_1
	nop
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .unalign_01
	nop
	andcc	%o1, 0x08, %o3
	brz,a	%o3, .unalign_000
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_001
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_01:
	andcc	%o1, 0x08, %o3
	brnz,a	%o3, .unalign_011
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_010
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .unalign_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,a	%o3, .unalign_101
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_100
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .unalign_110
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1849 .unalign_111:
1850 ldd [%o4+56], %d14
1851 .unalign_111_loop:
1852 add %o4, 64, %o4
1853 ldda [%o4]ASI_BLK_P, %d16
1854 faligndata %d14, %d16, %d48
1855 faligndata %d16, %d18, %d50
1856 faligndata %d18, %d20, %d52
1857 faligndata %d20, %d22, %d54
1858 faligndata %d22, %d24, %d56
1859 faligndata %d24, %d26, %d58
1860 faligndata %d26, %d28, %d60
1861 faligndata %d28, %d30, %d62
1862 fmovd %d30, %d14
1863 stda %d48, [%o0]ASI_BLK_P
1864 subcc %o5, 64, %o5
1865 add %o0, 64, %o0
1866 bgu,pt %ncc, .unalign_111_loop
1867 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1868 ba .unalign_done
1869 membar #Sync
1871 .unalign_110:
1872 ldd [%o4+48], %d12
1873 ldd [%o4+56], %d14
1874 .unalign_110_loop:
1875 add %o4, 64, %o4
1876 ldda [%o4]ASI_BLK_P, %d16
1877 faligndata %d12, %d14, %d48
1878 faligndata %d14, %d16, %d50
1879 faligndata %d16, %d18, %d52
1880 faligndata %d18, %d20, %d54
1881 faligndata %d20, %d22, %d56
1882 faligndata %d22, %d24, %d58
1883 faligndata %d24, %d26, %d60
1884 faligndata %d26, %d28, %d62
1885 fmovd %d28, %d12
1886 fmovd %d30, %d14
1887 stda %d48, [%o0]ASI_BLK_P
1888 subcc %o5, 64, %o5
1889 add %o0, 64, %o0
1890 bgu,pt %ncc, .unalign_110_loop
1891 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1892 ba .unalign_done
1893 membar #Sync
1895 .unalign_101:
1896 ldd [%o4+40], %d10
1897 ldd [%o4+48], %d12
1898 ldd [%o4+56], %d14
1899 .unalign_101_loop:
1900 add %o4, 64, %o4
1901 ldda [%o4]ASI_BLK_P, %d16
1902 faligndata %d10, %d12, %d48
1903 faligndata %d12, %d14, %d50
1904 faligndata %d14, %d16, %d52
1905 faligndata %d16, %d18, %d54
1906 faligndata %d18, %d20, %d56
1907 faligndata %d20, %d22, %d58
1908 faligndata %d22, %d24, %d60
1909 faligndata %d24, %d26, %d62
1910 fmovd %d26, %d10
1911 fmovd %d28, %d12
1912 fmovd %d30, %d14
1913 stda %d48, [%o0]ASI_BLK_P
1914 subcc %o5, 64, %o5
1915 add %o0, 64, %o0
1916 bgu,pt %ncc, .unalign_101_loop
1917 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1918 ba .unalign_done
1919 membar #Sync
1921 .unalign_100:
1922 ldd [%o4+32], %d8
1923 ldd [%o4+40], %d10
1924 ldd [%o4+48], %d12
1925 ldd [%o4+56], %d14
1926 .unalign_100_loop:
1927 add %o4, 64, %o4
1928 ldda [%o4]ASI_BLK_P, %d16
1929 faligndata %d8, %d10, %d48
1930 faligndata %d10, %d12, %d50
1931 faligndata %d12, %d14, %d52
1932 faligndata %d14, %d16, %d54
1933 faligndata %d16, %d18, %d56
1934 faligndata %d18, %d20, %d58
1935 faligndata %d20, %d22, %d60
1936 faligndata %d22, %d24, %d62
1937 fmovd %d24, %d8
1938 fmovd %d26, %d10
1939 fmovd %d28, %d12
1940 fmovd %d30, %d14
1941 stda %d48, [%o0]ASI_BLK_P
1942 subcc %o5, 64, %o5
1943 add %o0, 64, %o0
1944 bgu,pt %ncc, .unalign_100_loop
1945 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1946 ba .unalign_done
1947 membar #Sync
1949 .unalign_011:
1950 ldd [%o4+24], %d6
1951 ldd [%o4+32], %d8
1952 ldd [%o4+40], %d10
1953 ldd [%o4+48], %d12
1954 ldd [%o4+56], %d14
1955 .unalign_011_loop:
1956 add %o4, 64, %o4
1957 ldda [%o4]ASI_BLK_P, %d16
1958 faligndata %d6, %d8, %d48
1959 faligndata %d8, %d10, %d50
1960 faligndata %d10, %d12, %d52
1961 faligndata %d12, %d14, %d54
1962 faligndata %d14, %d16, %d56
1963 faligndata %d16, %d18, %d58
1964 faligndata %d18, %d20, %d60
1965 faligndata %d20, %d22, %d62
1966 fmovd %d22, %d6
1967 fmovd %d24, %d8
1968 fmovd %d26, %d10
1969 fmovd %d28, %d12
1970 fmovd %d30, %d14
1971 stda %d48, [%o0]ASI_BLK_P
1972 subcc %o5, 64, %o5
1973 add %o0, 64, %o0
1974 bgu,pt %ncc, .unalign_011_loop
1975 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1976 ba .unalign_done
1977 membar #Sync
1979 .unalign_010:
1980 ldd [%o4+16], %d4
1981 ldd [%o4+24], %d6
1982 ldd [%o4+32], %d8
1983 ldd [%o4+40], %d10
1984 ldd [%o4+48], %d12
1985 ldd [%o4+56], %d14
1986 .unalign_010_loop:
1987 add %o4, 64, %o4
1988 ldda [%o4]ASI_BLK_P, %d16
1989 faligndata %d4, %d6, %d48
1990 faligndata %d6, %d8, %d50
1991 faligndata %d8, %d10, %d52
1992 faligndata %d10, %d12, %d54
1993 faligndata %d12, %d14, %d56
1994 faligndata %d14, %d16, %d58
1995 faligndata %d16, %d18, %d60
1996 faligndata %d18, %d20, %d62
1997 fmovd %d20, %d4
1998 fmovd %d22, %d6
1999 fmovd %d24, %d8
2000 fmovd %d26, %d10
2001 fmovd %d28, %d12
2002 fmovd %d30, %d14
2003 stda %d48, [%o0]ASI_BLK_P
2004 subcc %o5, 64, %o5
2005 add %o0, 64, %o0
2006 bgu,pt %ncc, .unalign_010_loop
2007 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2008 ba .unalign_done
2009 membar #Sync
2011 .unalign_001:
2012 ldd [%o4+8], %d2
2013 ldd [%o4+16], %d4
2014 ldd [%o4+24], %d6
2015 ldd [%o4+32], %d8
2016 ldd [%o4+40], %d10
2017 ldd [%o4+48], %d12
2018 ldd [%o4+56], %d14
2019 .unalign_001_loop:
2020 add %o4, 64, %o4
2021 ldda [%o4]ASI_BLK_P, %d16
2022 faligndata %d2, %d4, %d48
2023 faligndata %d4, %d6, %d50
2024 faligndata %d6, %d8, %d52
2025 faligndata %d8, %d10, %d54
2026 faligndata %d10, %d12, %d56
2027 faligndata %d12, %d14, %d58
2028 faligndata %d14, %d16, %d60
2029 faligndata %d16, %d18, %d62
2030 fmovd %d18, %d2
2031 fmovd %d20, %d4
2032 fmovd %d22, %d6
2033 fmovd %d24, %d8
2034 fmovd %d26, %d10
2035 fmovd %d28, %d12
2036 fmovd %d30, %d14
2037 stda %d48, [%o0]ASI_BLK_P
2038 subcc %o5, 64, %o5
2039 add %o0, 64, %o0
2040 bgu,pt %ncc, .unalign_001_loop
2041 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2042 ba .unalign_done
2043 membar #Sync
2045 .unalign_000:
2046 ldda [%o4]ASI_BLK_P, %d0
2047 .unalign_000_loop:
2048 add %o4, 64, %o4
2049 ldda [%o4]ASI_BLK_P, %d16
2050 faligndata %d0, %d2, %d48
2051 faligndata %d2, %d4, %d50
2052 faligndata %d4, %d6, %d52
2053 faligndata %d6, %d8, %d54
2054 faligndata %d8, %d10, %d56
2055 faligndata %d10, %d12, %d58
2056 faligndata %d12, %d14, %d60
2057 faligndata %d14, %d16, %d62
2058 fmovd %d16, %d0
2059 fmovd %d18, %d2
2060 fmovd %d20, %d4
2061 fmovd %d22, %d6
2062 fmovd %d24, %d8
2063 fmovd %d26, %d10
2064 fmovd %d28, %d12
2065 fmovd %d30, %d14
2066 stda %d48, [%o0]ASI_BLK_P
2067 subcc %o5, 64, %o5
2068 add %o0, 64, %o0
2069 bgu,pt %ncc, .unalign_000_loop
2070 prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2071 membar #Sync
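/*
 * All eight .unalign_* loops above share one pattern: fetch whole
 * 64-byte blocks from the source rounded down to an 8-byte boundary,
 * shift each adjacent pair of doublewords by the source misalignment
 * (faligndata, driven by the GSR.align field set up earlier), and
 * store an aligned block. A rough C sketch of that pattern, with the
 * GSR-based shift approximated by an explicit byte offset 'off'
 * (1 to 7; all names here are illustrative, not from this file):
 *
 *	void
 *	unalign_blocks(uint64_t *dst, const uint64_t *asrc,
 *	    size_t blks, unsigned off)
 *	{
 *		for (size_t b = 0; b < blks * 8; b++) {
 *			// big-endian merge of two neighbouring
 *			// doublewords, as faligndata produces;
 *			// reads one doubleword ahead, like the
 *			// pipelined ldda pair in the loops above
 *			dst[b] = (asrc[b] << (off * 8)) |
 *			    (asrc[b + 1] >> ((8 - off) * 8));
 *		}
 *	}
 *
 * The per-variant preloads (%d2 through %d14) prime this pipeline with
 * however many trailing doublewords of the first aligned block lie at
 * or after the true source start.
 */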
2073 .unalign_done:
2074 ! Handle trailing bytes, 64 to 127
2075 ! Dest long word aligned, Src not long word aligned
2076 cmp %o2, 15
2077 bleu %ncc, .unalign_short
2078 nop
2079 andn %o2, 0x7, %o5 ! %o5 is multiple of 8
2080 and %o2, 0x7, %o2 ! residue bytes in %o2
2081 add %o2, 8, %o2
2082 sub %o5, 8, %o5 ! ensure we don't load past end of src
2083 andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
2084 add %o1, %o5, %o1 ! advance %o1 to after multiple of 8
2085 ldd [%o4], %d0 ! fetch partial word
2086 .unalign_by8:
2087 ldd [%o4+8], %d2
2088 add %o4, 8, %o4
2089 faligndata %d0, %d2, %d16
2090 subcc %o5, 8, %o5
2091 std %d16, [%o0]
2092 fmovd %d2, %d0
2093 bgu,pt %ncc, .unalign_by8
2094 add %o0, 8, %o0
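/*
 * In C terms, the residue split above is (a sketch; 'count' is the
 * byte count remaining at .unalign_done, known here to be > 15):
 *
 *	size_t by8  = (count & ~(size_t)7) - 8;	// merged 8-byte stores
 *	size_t tail = (count & 7) + 8;		// left for byte copy
 *
 * Backing the 8-byte loop off by one doubleword ensures the trailing
 * ldd of each faligndata pair never reads past the end of the source;
 * the spare 8 bytes are handed to the byte-copy tail instead.
 */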
2096 .unalign_short:
2097 brnz %g5, .smallrest
2098 nop
2099 ba .smallrest
2100 wr %g5, %g0, %fprs
2101 #else /* NIAGARA2_IMPL */
2102 .forcpy:
2103 mov %o0, %g5 ! save dest address for return val
2104 cmp %o2, 17 ! for small counts copy bytes
2105 bleu,pt %ncc, .dbytecp
2106 nop
2108 cmp %o2, 0x80 ! for lengths of less than 128 bytes do not
2109 bleu,pn %ncc, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2111 /*
2112 * Make sure that the source and destination buffers are at least 64
2113 * bytes apart. If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P
2114 * asi to copy the data.
2115 */
2116 subcc %o1, %o0, %o3
2117 blu %ncc, .blkalgndst
2118 cmp %o3, 0x40 ! if src - dst >= 0x40
2119 bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P
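/*
 * Equivalent C for the gate above (a sketch; 'src' and 'dst' stand
 * for %o1 and %o0):
 *
 *	if ((uintptr_t)src < (uintptr_t)dst ||
 *	    (uintptr_t)src - (uintptr_t)dst >= 0x40)
 *		goto blkalgndst;	// block-init stores are safe
 *	// else fall through to .no_blkcpy
 *
 * The block-initializing ASI can allocate and initialize a whole
 * 64-byte line without fetching it first, so it must not be used when
 * the source lies fewer than 64 bytes ahead of the destination: the
 * store could clobber source data not yet read.
 */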
2120 .no_blkcpy:
2121 andcc %o1, 3, %o5 ! is src word aligned
2122 bz,pn %ncc, .aldst
2123 cmp %o5, 2 ! is src half-word aligned
2124 be,pt %ncc, .s2algn
2125 cmp %o5, 3 ! src is byte aligned
2126 .s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it
2127 inc 1, %o1
2128 stb %o3, [%g5] ! move a byte to align src
2129 inc 1, %g5
2130 bne,pt %ncc, .s2algn
2131 dec %o2
2132 b .ald ! now go align dest
2133 andcc %g5, 3, %o5
2135 .s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned
2136 inc 2, %o1
2137 srl %o3, 8, %o4
2138 stb %o4, [%g5] ! have to do bytes,
2139 stb %o3, [%g5 + 1] ! don't know dst alignment
2140 inc 2, %g5
2141 dec 2, %o2
2143 .aldst: andcc %g5, 3, %o5 ! align the destination address
2144 .ald: bz,pn %ncc, .w4cp
2145 cmp %o5, 2
2146 bz,pn %ncc, .w2cp
2147 cmp %o5, 3
2148 .w3cp: lduw [%o1], %o4
2149 inc 4, %o1
2150 srl %o4, 24, %o5
2151 stb %o5, [%g5]
2152 bne,pt %ncc, .w1cp
2153 inc %g5
2154 dec 1, %o2
2155 andn %o2, 3, %o3 ! o3 is aligned word count
2156 dec 4, %o3 ! avoid reading beyond tail of src
2157 sub %o1, %g5, %o1 ! o1 gets the difference
2159 1: sll %o4, 8, %g1 ! save residual bytes
2160 lduw [%o1+%g5], %o4
2161 deccc 4, %o3
2162 srl %o4, 24, %o5 ! merge with residual
2163 or %o5, %g1, %g1
2164 st %g1, [%g5]
2165 bnz,pt %ncc, 1b
2166 inc 4, %g5
2167 sub %o1, 3, %o1 ! used one byte of last word read
2168 and %o2, 3, %o2
2169 b 7f
2170 inc 4, %o2
2172 .w1cp: srl %o4, 8, %o5
2173 sth %o5, [%g5]
2174 inc 2, %g5
2175 dec 3, %o2
2176 andn %o2, 3, %o3 ! o3 is aligned word count
2177 dec 4, %o3 ! avoid reading beyond tail of src
2178 sub %o1, %g5, %o1 ! o1 gets the difference
2180 2: sll %o4, 24, %g1 ! save residual bytes
2181 lduw [%o1+%g5], %o4
2182 deccc 4, %o3
2183 srl %o4, 8, %o5 ! merge with residual
2184 or %o5, %g1, %g1
2185 st %g1, [%g5]
2186 bnz,pt %ncc, 2b
2187 inc 4, %g5
2188 sub %o1, 1, %o1 ! used three bytes of last word read
2189 and %o2, 3, %o2
2190 b 7f
2191 inc 4, %o2
2193 .w2cp: lduw [%o1], %o4
2194 inc 4, %o1
2195 srl %o4, 16, %o5
2196 sth %o5, [%g5]
2197 inc 2, %g5
2198 dec 2, %o2
2199 andn %o2, 3, %o3 ! o3 is aligned word count
2200 dec 4, %o3 ! avoid reading beyond tail of src
2201 sub %o1, %g5, %o1 ! o1 gets the difference
2203 3: sll %o4, 16, %g1 ! save residual bytes
2204 lduw [%o1+%g5], %o4
2205 deccc 4, %o3
2206 srl %o4, 16, %o5 ! merge with residual
2207 or %o5, %g1, %g1
2208 st %g1, [%g5]
2209 bnz,pt %ncc, 3b
2210 inc 4, %g5
2211 sub %o1, 2, %o1 ! used two bytes of last word read
2212 and %o2, 3, %o2
2213 b 7f
2214 inc 4, %o2
2216 .w4cp: andn %o2, 3, %o3 ! o3 is aligned word count
2217 sub %o1, %g5, %o1 ! o1 gets the difference
2219 1: lduw [%o1+%g5], %o4 ! read from address
2220 deccc 4, %o3 ! decrement count
2221 st %o4, [%g5] ! write at destination address
2222 bgu,pt %ncc, 1b
2223 inc 4, %g5 ! increment to address
2224 b 7f
2225 and %o2, 3, %o2 ! number of leftover bytes, if any
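/*
 * The three shifted loops above (.w3cp, .w1cp, .w2cp) follow a single
 * pattern: keep the not-yet-stored bytes of the last word read, fetch
 * the next aligned word, merge, store. A C sketch for a left shift of
 * ls bits (8, 16 or 24, with rs = 32 - ls; names are illustrative):
 *
 *	uint32_t prev = *src++;	// word holding the leading bytes
 *	while (nwords--) {
 *		uint32_t res = prev << ls;	// residual bytes
 *		prev = *src++;
 *		*dst++ = res | (prev >> rs);	// merge and store
 *	}
 */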
2228 ! differenced byte copy, works with any alignment
2230 .dbytecp:
2231 b 7f
2232 sub %o1, %g5, %o1 ! o1 gets the difference
2234 4: stb %o4, [%g5] ! write to address
2235 inc %g5 ! inc to address
2236 7: deccc %o2 ! decrement count
2237 bgeu,a,pt %ncc,4b ! loop till done
2238 ldub [%o1+%g5], %o4 ! read from address
2239 retl ! %o0 was preserved
2240 nop
2242 .blkalgndst:
2243 save %sp, -SA(MINFRAME), %sp
2245 ! Block (64 bytes) align the destination.
2246 andcc %i0, 0x3f, %i3 ! is dst block aligned
2247 bz %ncc, .chksrc ! dst already block aligned
2248 sub %i3, 0x40, %i3
2249 neg %i3 ! bytes till dst 64 bytes aligned
2250 sub %i2, %i3, %i2 ! update i2 with new count
2252 ! Based on source and destination alignment, do either
2253 ! an 8 byte, 4 byte, 2 byte or byte copy (C outline after these loops).
2255 ! Is dst & src 8B aligned
2256 or %i0, %i1, %o2
2257 andcc %o2, 0x7, %g0
2258 bz %ncc, .alewdcp
2259 nop
2261 ! Is dst & src 4B aligned
2262 andcc %o2, 0x3, %g0
2263 bz %ncc, .alwdcp
2264 nop
2266 ! Is dst & src 2B aligned
2267 andcc %o2, 0x1, %g0
2268 bz %ncc, .alhlfwdcp
2269 nop
2271 ! 1B aligned
2272 1: ldub [%i1], %o2
2273 stb %o2, [%i0]
2274 inc %i1
2275 deccc %i3
2276 bgu,pt %ncc, 1b
2277 inc %i0
2279 ba .chksrc
2280 nop
2282 ! dst & src 4B aligned
2283 .alwdcp:
2284 ld [%i1], %o2
2285 st %o2, [%i0]
2286 add %i1, 0x4, %i1
2287 subcc %i3, 0x4, %i3
2288 bgu,pt %ncc, .alwdcp
2289 add %i0, 0x4, %i0
2291 ba .chksrc
2292 nop
2294 ! dst & src 2B aligned
2295 .alhlfwdcp:
2296 lduh [%i1], %o2
2297 stuh %o2, [%i0]
2298 add %i1, 0x2, %i1
2299 subcc %i3, 0x2, %i3
2300 bgu,pt %ncc, .alhlfwdcp
2301 add %i0, 0x2, %i0
2303 ba .chksrc
2304 nop
2306 ! dst & src 8B aligned
2307 .alewdcp:
2308 ldx [%i1], %o2
2309 stx %o2, [%i0]
2310 add %i1, 0x8, %i1
2311 subcc %i3, 0x8, %i3
2312 bgu,pt %ncc, .alewdcp
2313 add %i0, 0x8, %i0
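/*
 * The C outline of the dispatch above (a sketch; 'n' is the byte count
 * needed to bring dst up to a 64-byte boundary):
 *
 *	switch (((uintptr_t)dst | (uintptr_t)src) & 7) {
 *	case 0:			// both 8-byte aligned
 *		copy n bytes with ldx/stx;	break;
 *	case 4:			// both 4-byte aligned
 *		copy n bytes with ld/st;	break;
 *	case 2: case 6:		// both 2-byte aligned
 *		copy n bytes with lduh/stuh;	break;
 *	default:		// odd; no common alignment
 *		copy n bytes with ldub/stb;	break;
 *	}
 */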
2315 ! Now destination is block (64 bytes) aligned
2316 .chksrc:
2317 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
2318 sub %i2, %i3, %i2 ! Residue bytes in %i2
2319 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2320 andcc %i1, 0xf, %l1 ! is src quadword aligned
2321 bz,pn %ncc, .blkcpy ! src offset in %l1
2322 nop
2323 cmp %l1, 0x8
2324 bgu %ncc, .cpy_upper_double
2325 nop
2326 blu %ncc, .cpy_lower_double
2327 nop
2329 ! Falls through when the source offset is equal to 8, i.e.
2330 ! the source is double word aligned.
2331 ! In this case no shift/merge of data is required
2332 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2333 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2334 prefetch [%o0+0x0], #one_read
2335 ldda [%i1+0x0]%asi, %o2
2336 loop0:
2337 ldda [%i1+0x10]%asi, %o4
2338 prefetch [%o0+0x40], #one_read
2340 stxa %o3, [%i0+0x0]%asi
2341 stxa %o4, [%i0+0x8]%asi
2343 ldda [%i1+0x20]%asi, %o2
2344 stxa %o5, [%i0+0x10]%asi
2345 stxa %o2, [%i0+0x18]%asi
2347 ldda [%i1+0x30]%asi, %o4
2348 stxa %o3, [%i0+0x20]%asi
2349 stxa %o4, [%i0+0x28]%asi
2351 ldda [%i1+0x40]%asi, %o2
2352 stxa %o5, [%i0+0x30]%asi
2353 stxa %o2, [%i0+0x38]%asi
2355 add %o0, 0x40, %o0
2356 add %i1, 0x40, %i1
2357 subcc %i3, 0x40, %i3
2358 bgu,pt %ncc, loop0
2359 add %i0, 0x40, %i0
2360 ba .blkdone
2361 add %i1, %l1, %i1 ! increment the source by src offset
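/*
 * loop0 above needs no merging: after backing %i1 down by 8, every
 * quad load is 16-byte aligned and each register already holds 8
 * aligned bytes. Stores simply trail the loads by one doubleword,
 * because the first register of the first quad holds the 8 bytes
 * preceding the real source. Sketch ('load_quad' is an illustrative
 * stand-in for ldda with ASI_BLK_INIT_ST_QUAD_LDD_P):
 *
 *	uint64_t c, q[2];
 *	load_quad(src, q);	// q[0] precedes the true source
 *	c = q[1];
 *	for (size_t i = 1; i <= nquads; i++) {
 *		load_quad(src + 16 * i, q);
 *		*dst++ = c;	// stores lag loads by 8 bytes
 *		*dst++ = q[0];
 *		c = q[1];
 *	}
 */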
2363 .cpy_lower_double:
2364 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2365 sll %l1, 3, %l2 ! %l2 left shift
2366 mov 0x40, %l3
2367 sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
2368 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2369 prefetch [%o0+0x0], #one_read
2370 ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has
2371 ! complete data
2372 loop1:
2373 ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read.
2374 ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4
2375 ! into %o2 and %o3
2376 prefetch [%o0+0x40], #one_read
2377 stxa %o2, [%i0+0x0]%asi
2378 stxa %o3, [%i0+0x8]%asi
2380 ldda [%i1+0x20]%asi, %o2
2381 ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and
2382 stxa %o4, [%i0+0x10]%asi ! %o4 from previous read
2383 stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5
2385 ! Repeat the same for next 32 bytes.
2387 ldda [%i1+0x30]%asi, %o4
2388 ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
2389 stxa %o2, [%i0+0x20]%asi
2390 stxa %o3, [%i0+0x28]%asi
2392 ldda [%i1+0x40]%asi, %o2
2393 ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
2394 stxa %o4, [%i0+0x30]%asi
2395 stxa %o5, [%i0+0x38]%asi
2397 add %o0, 0x40, %o0
2398 add %i1, 0x40, %i1
2399 subcc %i3, 0x40, %i3
2400 bgu,pt %ncc, loop1
2401 add %i0, 0x40, %i0
2402 ba .blkdone
2403 add %i1, %l1, %i1 ! increment the source by src offset
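/*
 * ALIGN_DATA merges three doublewords into two using the shift pair
 * computed above (ls = off * 8, rs = 64 - ls, where 'off' is the
 * source offset within the quad, 0 < off < 8). In C, a sketch of the
 * usual shift-and-or form of such a macro:
 *
 *	#define MERGE2(a, b, c, ls, rs)				\
 *		((a) = ((a) << (ls)) | ((b) >> (rs)),		\
 *		 (b) = ((b) << (ls)) | ((c) >> (rs)))
 *
 * Each stored pair is thus the previously loaded bytes shifted up,
 * completed from the quad just read.
 */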
2405 .cpy_upper_double:
2406 sub %i1, %l1, %i1 ! align the src at 16 bytes.
2407 mov 0x8, %l2
2408 sub %l1, %l2, %l2
2409 sll %l2, 3, %l2 ! %l2 left shift
2410 mov 0x40, %l3
2411 sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
2412 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2413 prefetch [%o0+0x0], #one_read
2414 ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and
2415 ! no data in %o2
2416 loop2:
2417 ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has
2418 ! partial
2419 ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5
2420 ! into %o3 and %o4
2421 prefetch [%o0+0x40], #one_read
2422 stxa %o3, [%i0+0x0]%asi
2423 stxa %o4, [%i0+0x8]%asi
2425 ldda [%i1+0x20]%asi, %o2
2426 ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with
2427 stxa %o5, [%i0+0x10]%asi ! %o5 from previous read
2428 stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2
2430 ! Repeat the same for next 32 bytes.
2432 ldda [%i1+0x30]%asi, %o4
2433 ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
2434 stxa %o3, [%i0+0x20]%asi
2435 stxa %o4, [%i0+0x28]%asi
2437 ldda [%i1+0x40]%asi, %o2
2438 ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
2439 stxa %o5, [%i0+0x30]%asi
2440 stxa %o2, [%i0+0x38]%asi
2442 add %o0, 0x40, %o0
2443 add %i1, 0x40, %i1
2444 subcc %i3, 0x40, %i3
2445 bgu,pt %ncc, loop2
2446 add %i0, 0x40, %i0
2447 ba .blkdone
2448 add %i1, %l1, %i1 ! increment the source by src offset
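/*
 * The upper-double case is the same merge with the shift pair rebased:
 * for a source offset 'off' with 8 < off < 16, the useful bytes begin
 * inside the second doubleword of the quad, so ls = (off - 8) * 8 and
 * rs = 64 - ls, and the merge chain starts one register later (%o3
 * rather than %o2). E.g. off = 0xc gives ls = 32, rs = 32.
 */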
2450 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2451 .blkcpy:
2452 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
2453 prefetch [%o0+0x0], #one_read
2455 prefetch [%o0+0x40], #one_read
2456 1:
2457 ldda [%i1+0x0]%asi, %o2
2458 ldda [%i1+0x10]%asi, %o4
2460 stxa %o2, [%i0+0x0]%asi
2461 stxa %o3, [%i0+0x8]%asi
2462 stxa %o4, [%i0+0x10]%asi
2463 stxa %o5, [%i0+0x18]%asi
2465 ldda [%i1+0x20]%asi, %o2
2466 ldda [%i1+0x30]%asi, %o4
2468 stxa %o2, [%i0+0x20]%asi
2469 stxa %o3, [%i0+0x28]%asi
2470 stxa %o4, [%i0+0x30]%asi
2471 stxa %o5, [%i0+0x38]%asi
2473 add %o0, 0x40, %o0
2474 add %i1, 0x40, %i1
2475 subcc %i3, 0x40, %i3
2476 bgu,pt %ncc, 1b
2477 add %i0, 0x40, %i0
2479 .blkdone:
2480 membar #Sync
2482 mov ASI_PNF, %asi ! restore %asi to default
2483 ! ASI_PRIMARY_NOFAULT value
2484 tst %i2
2485 bz,pt %ncc, .blkexit
2486 nop
2488 ! Handle trailing bytes
2489 cmp %i2, 0x8
2490 blu,pt %ncc, .residue
2491 nop
2493 ! Can we do some 8B ops
2494 or %i1, %i0, %o2
2495 andcc %o2, 0x7, %g0
2496 bnz %ncc, .last4
2497 nop
2499 ! Do 8byte ops as long as possible
2500 .last8:
2501 ldx [%i1], %o2
2502 stx %o2, [%i0]
2503 add %i1, 0x8, %i1
2504 sub %i2, 0x8, %i2
2505 cmp %i2, 0x8
2506 bgu,pt %ncc, .last8
2507 add %i0, 0x8, %i0
2509 tst %i2
2510 bz,pt %ncc, .blkexit
2511 nop
2513 ba .residue
2514 nop
2516 .last4:
2517 ! Can we do 4B ops
2518 andcc %o2, 0x3, %g0
2519 bnz %ncc, .last2
2520 nop
2521 1:
2522 ld [%i1], %o2
2523 st %o2, [%i0]
2524 add %i1, 0x4, %i1
2525 sub %i2, 0x4, %i2
2526 cmp %i2, 0x4
2527 bgu,pt %ncc, 1b
2528 add %i0, 0x4, %i0
2530 cmp %i2, 0
2531 bz,pt %ncc, .blkexit
2532 nop
2534 ba .residue
2535 nop
2537 .last2:
2538 ! Can we do 2B ops
2539 andcc %o2, 0x1, %g0
2540 bnz %ncc, .residue
2541 nop
2543 1:
2544 lduh [%i1], %o2
2545 stuh %o2, [%i0]
2546 add %i1, 0x2, %i1
2547 sub %i2, 0x2, %i2
2548 cmp %i2, 0x2
2549 bgu,pt %ncc, 1b
2550 add %i0, 0x2, %i0
2552 cmp %i2, 0
2553 bz,pt %ncc, .blkexit
2554 nop
2556 .residue:
2557 ldub [%i1], %o2
2558 stb %o2, [%i0]
2559 inc %i1
2560 deccc %i2
2561 bgu,pt %ncc, .residue
2562 inc %i0
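/*
 * The trailing-byte ladder above, in outline (a sketch; n is %i2 and
 * src/dst are %i1/%i0):
 *
 *	if (n >= 8 && (((uintptr_t)src | (uintptr_t)dst) & 7) == 0)
 *		copy 8 bytes at a time while more than 8 remain;
 *	else if (n >= 8 && ((src | dst) & 3) == 0)
 *		copy 4 bytes at a time while more than 4 remain;
 *	else if (n >= 8 && ((src | dst) & 1) == 0)
 *		copy 2 bytes at a time while more than 2 remain;
 *	copy whatever is left one byte at a time;
 */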
2564 .blkexit:
2567 restore %g5, %g0, %o0
2569 #endif /* NIAGARA2_IMPL */
2570 SET_SIZE(memcpy)
2571 SET_SIZE(__align_cpy_1)