 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>

#define	ICACHE_LINE_SIZE	64

#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#define	BSTORE_SIZE	256	/* min copy size for block store */
	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)
	cmp	%o1, %o0	! if from address is >= to, use forward copy
	bgeu	%ncc, .forcpy	! else use backward if overlapping
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size fits in the gap, copy forward
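
	! In C terms, a hedged sketch of the dispatch above; the helpers
	! forward_copy/backward_copy are hypothetical stand-ins for .forcpy
	! and the backward-copy code below:
	!
	!	void *
	!	memmove_sketch(void *s1, const void *s2, size_t n)
	!	{
	!		uintptr_t diff = (uintptr_t)s1 - (uintptr_t)s2;
	!
	!		if ((uintptr_t)s2 >= (uintptr_t)s1 || n <= diff)
	!			return (forward_copy(s1, s2, n));
	!		return (backward_copy(s1, s2, n));	/* high to low */
	!	}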

	!
	! an overlapped copy that must be done "backwards"
	!
	mov	%o0, %g1	! save dest address for return val
	add	%o1, %o2, %o1	! get to end of source space
	add	%o0, %o2, %o0	! get to end of destination space
	bgeu,pn	%ncc, .dbalign
	ldub	[%o1-1], %o3	! load last byte
	stb	%o3, [%o0-1]	! store last byte
	ldub	[%o1+2], %o3	! load 2nd from last byte
	stb	%o3, [%o0-2]	! store 2nd from last byte
	ldub	[%o1+1], %o3	! load 3rd from last byte
	stb	%o3, [%o0+1]	! store 3rd from last byte
	ldub	[%o1], %o3	! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]	! store 4th from last byte
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o2		! decrement count
	bgu,pt	%ncc, .byteloop	! loop until done
	stb	%o3, [%o0]	! write byte
	andcc	%o0, 7, %o5	! bytes till DST 8 byte aligned
	sub	%o2, %o5, %o2	! update count
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o5		! decrement count
	bgu,pt	%ncc, .dbalign1	! loop until done
	stb	%o3, [%o0]	! store a byte
	! check for src long word alignment
	andcc	%o1, 7, %g0	! chk src long word alignment

	! Following code is for overlapping copies where src and dest
	! are long word aligned
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2	! adjust length to allow cc test
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3	! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions.
	! So set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs	! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4	! load
	subcc	%o2, 64, %o2	! decrement length count
	std	%d4, [%o0-8]	! and store
	ldd	[%o1-16], %d2	! a block of 64 bytes
	sub	%o1, 64, %o1	! decrease src ptr by 64
	sub	%o0, 64, %o0	! decrease dst ptr by 64
	bgu,pt	%ncc, .dbmedl64	! repeat if at least 64 bytes left
	add	%o2, 63, %o2	! restore offset adjustment
	and	%o3, 0x4, %o3	! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs	! fprs = o3	restore fprs
	subcc	%o2, 31, %o2	! adjust length to allow cc test
	ble,pt	%ncc, .dbmedl31	! skip big loop if less than 32
	ldx	[%o1-8], %o4	! load
	subcc	%o2, 32, %o2	! decrement length count
	stx	%o4, [%o0-8]	! and store
	ldx	[%o1-16], %o3	! a block of 32 bytes
	sub	%o1, 32, %o1	! decrease src ptr by 32
	sub	%o0, 32, %o0	! decrease dst ptr by 32
	bgu,pt	%ncc, .dbmedl32	! repeat if at least 32 bytes left
	addcc	%o2, 16, %o2	! adjust remaining count
	ble,pt	%ncc, .dbmedl15	! skip if 15 or fewer bytes left
	ldx	[%o1-8], %o4	! load and store 16 bytes
	sub	%o1, 16, %o1	! decrease src ptr by 16
	sub	%o2, 16, %o2	! decrease count by 16
	sub	%o0, 16, %o0	! decrease dst ptr by 16
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .dbexit	! exit if finished
	blt,pt	%ncc, .dbremain	! skip if 7 or fewer bytes left
	ldx	[%o1-8], %o4	! load 8 bytes
	sub	%o1, 8, %o1	! decrease src ptr by 8
	stx	%o4, [%o0-8]	! and store 8 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	bnz	%ncc, .dbremain	! finish remainder if not done
	sub	%o0, 8, %o0	! decrease dst ptr by 8
	! Following code is for overlapping copies where src and dest
	! are not long word aligned
	rd	%fprs, %o3	! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs	! fprs.fef = 1
	alignaddr %o1, %g0, %o5	! align src
	ldd	[%o5], %d0	! get first 8 byte block
	andn	%o2, 7, %o4	! prepare src ptr for finishup code
	cmp	%o2, 4095	! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
	ldd	[%o5-8], %d2	! load 8 bytes
	ldd	[%o5-16], %d4	! load 8 bytes
	ldd	[%o5+40], %d6	! load 8 bytes
	ldd	[%o5+32], %d8	! load 8 bytes
	sub	%o2, 64, %o2	! 64 less bytes to copy
	ldd	[%o5+24], %d18	! load 8 bytes
	cmp	%o2, 64		! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28	! load 8 bytes
	ldd	[%o5+8], %d30	! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0	! load 8 bytes
	std	%d10, [%o0+56]	! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]	! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]	! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]	! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]	! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]	! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]	! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]	! store the current 8 bytes
	ldd	[%o5-8], %d2	! load 8 bytes
	ldd	[%o5-16], %d4	! load 8 bytes
	ldd	[%o5+8], %d6	! load 8 bytes
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0	! load 8 bytes
	sub	%o2, 32, %o2	! 32 less bytes to copy
	std	%d10, [%o0+24]	! store the current 8 bytes
	cmp	%o2, 32		! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]	! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]	! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]	! store the current 8 bytes
	cmp	%o2, 8		! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	sub	%o0, 8, %o0	! since we are at the end
				! when we first enter the loop
	sub	%o2, 8, %o2	! 8 less bytes to copy
	cmp	%o2, 8		! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]	! store the current 8 bytes
	and	%o3, 0x4, %o3	! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs	! fprs = o3	restore fprs
	ldub	[%o1-1], %o3	! load last byte
	stb	%o3, [%o0-1]	! store last byte
	ldub	[%o1+2], %o3	! load 2nd from last byte
	stb	%o3, [%o0-2]	! store 2nd from last byte
	ldub	[%o1+1], %o3	! load 3rd from last byte
	stb	%o3, [%o0+1]	! store 3rd from last byte
	ldub	[%o1], %o3	! load 4th from last byte
	stb	%o3, [%o0]	! store 4th from last byte
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o2		! decrement count
	bgu,pt	%ncc, .dbbyte	! loop until done
	stb	%o3, [%o0]	! write byte
	.align	ICACHE_LINE_SIZE
	! adjust instruction alignment
	nop			! Do not remove, these nops affect
	nop			! icache alignment and performance
	cmp	%o2, SMALL_MAX	! check for not small case
	bgu,pn	%ncc, .medium	! go to larger cases
	mov	%o0, %g1	! save %o0
	cmp	%o2, SHORTCOPY	! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3	! prepare alignment check
	andcc	%o3, 0x3, %g0	! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2	! adjust count to allow cc zero test
	ldub	[%o1], %o3	! read byte
	subcc	%o2, 4, %o2	! reduce count by 4
	stb	%o3, [%o0]	! write byte
	ldub	[%o1+1], %o3	! repeat for a total of 4 bytes
	add	%o1, 4, %o1	! advance SRC by 4
	add	%o0, 4, %o0	! advance DST by 4
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	add	%o2, 3, %o2	! restore count
	bz,pt	%ncc, .smallexit
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3	! load one byte
	deccc	%o2		! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]	! store one byte
	ldub	[%o1+1], %o3	! load second byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]	! store second byte
	ldub	[%o1+2], %o3	! load third byte
	stb	%o3, [%o0+2]	! store third byte
	mov	%g1, %o0	! restore %o0
	nop			! affects loop icache alignment
	lduw	[%o1], %o3	! read word
	subcc	%o2, 8, %o2	! update count
	stw	%o3, [%o0]	! write word
	add	%o1, 8, %o1	! update SRC
	lduw	[%o1-4], %o3	! read word
	add	%o0, 8, %o0	! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]	! write word
	addcc	%o2, 7, %o2	! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4		! check for 4 or more bytes left
	blt	.smallleft3	! if not, go to finish up
	bnz,pt	%ncc, .smallleft3
	mov	%g1, %o0	! restore %o0
	subcc	%o2, 4, %o2	! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3	! read word
	addcc	%o2, 3, %o2	! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]	! write word
	deccc	%o2		! reduce count for cc test
	ldub	[%o1+4], %o3	! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]	! store one byte
	ldub	[%o1+5], %o3	! load second byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]	! store second byte
	ldub	[%o1+6], %o3	! load third byte
	stb	%o3, [%o0+6]	! store third byte
	mov	%g1, %o0	! restore %o0
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
	sub	%o2, %o5, %o2	! update count

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

	andcc	%o1, 0x3, %g0	! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20
/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination are in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0	! test word alignment
	bz,pt	%ncc, .medlword	! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	subcc	%o2, 15, %o2	! adjust length to allow cc test
	ble,pt	%ncc, .medw15	! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
	subcc	%o2, 16, %o2	! decrement length count
	stw	%o4, [%o0]	! and store
	ld	[%o1+4], %o3	! a block of 16 bytes
	add	%o1, 16, %o1	! increase src ptr by 16
	add	%o0, 16, %o0	! increase dst ptr by 16
	bgu,pt	%ncc, .medw16	! repeat if at least 16 bytes left
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .medwexit	! exit if finished
	blt,pt	%ncc, .medw7	! skip if 7 or fewer bytes left
	ld	[%o1], %o4	! load 4 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	stw	%o4, [%o0]	! and store 4 bytes
	add	%o1, 8, %o1	! increase src ptr by 8
	ld	[%o1-4], %o3	! load 4 bytes
	add	%o0, 8, %o0	! increase dst ptr by 8
	stw	%o3, [%o0-4]	! and store 4 bytes
	bz	%ncc, .medwexit	! exit if finished
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3		! check for 4 bytes left
	ble,pt	%ncc, .medw3	! skip if 3 or fewer bytes left
	ld	[%o1], %o4	! load 4 bytes
	sub	%o2, 4, %o2	! decrease count by 4
	add	%o1, 4, %o1	! increase src ptr by 4
	stw	%o4, [%o0]	! and store 4 bytes
	add	%o0, 4, %o0	! increase dst ptr by 4
	tst	%o2		! check for zero bytes left
	bz	%ncc, .medwexit	! exit if finished
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2		! reduce count by one
	ldub	[%o1], %o3	! load one byte
	bz,pt	%ncc, .medwexit	! exit if last byte
	stb	%o3, [%o0]	! store one byte
	ldub	[%o1+1], %o3	! load second byte
	deccc	%o2		! reduce count by one
	bz,pt	%ncc, .medwexit	! exit if last byte
	stb	%o3, [%o0+1]	! store second byte
	ldub	[%o1+2], %o3	! load third byte
	stb	%o3, [%o0+2]	! store third byte
	mov	%g1, %o0	! restore %o0
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */
.medlword:				! long word aligned
	cmp	%o2, MED_MAX	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	subcc	%o2, 31, %o2	! adjust length to allow cc test
	ble,pt	%ncc, .medl31	! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
	ldx	[%o1], %o4	! load
	subcc	%o2, 32, %o2	! decrement length count
	stx	%o4, [%o0]	! and store
	ldx	[%o1+8], %o3	! a block of 32 bytes
	add	%o1, 32, %o1	! increase src ptr by 32
	add	%o0, 32, %o0	! increase dst ptr by 32
	bgu,pt	%ncc, .medl32	! repeat if at least 32 bytes left
	addcc	%o2, 16, %o2	! adjust remaining count
	ble,pt	%ncc, .medl15	! skip if 15 or fewer bytes left
	ldx	[%o1], %o4	! load and store 16 bytes
	add	%o1, 16, %o1	! increase src ptr by 16
	sub	%o2, 16, %o2	! decrease count by 16
	add	%o0, 16, %o0	! increase dst ptr by 16
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .medwexit	! exit if finished
	blt,pt	%ncc, .medw7	! skip if 7 or fewer bytes left
	ldx	[%o1], %o4	! load 8 bytes
	add	%o1, 8, %o1	! increase src ptr by 8
	stx	%o4, [%o0]	! and store 8 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	bz	%ncc, .medwexit	! exit if finished
	add	%o0, 8, %o0	! increase dst ptr by 8
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
	rd	%fprs, %o4	! check for unused FPU
	add	%o1, 8, %o1	! prepare to round SRC upward
	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!	     o3 is how much sooner we'll cross the alignment boundary
	!		in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!	     Let x denote a byte already copied to align DST
	!	     Let . and - denote bytes not yet copied
	!	     Let | denote double alignment boundaries
	!
	!	     DST:  ######xx|........|--------|..######   o2 = 18
	!
	! o3 = -3:   SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!
	! o3 =  0:   SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!
	! o3 = +1:   SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
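	!
	! A hedged C sketch of the medium-case count described above
	! (names hypothetical; n stands for o2):
	!
	!	ptrdiff_t
	!	main_loop_count(ptrdiff_t n, ptrdiff_t o3)
	!	{
	!		ptrdiff_t o5 = (o3 >= 0) ? -8 : 0;
	!
	!		o5 += n + o3;	/* subtract 8 from o2+o3 only if o3>=0 */
	!		return (o5 & ~(ptrdiff_t)7);	/* 8 byte aligned */
	!	}
	!
	! This reproduces the examples: n = 18 gives 8 for o3 = -3, 0, +1.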
	alignaddr %o1, %g0, %o1	! set GSR.ALIGN and align o1
	movrlz	%o3, %g0, %o5	! subtract 8 from o2+o3 only if o3>=0
	andn	%o5, 7, %o5	! 8 byte aligned count
	neg	%o0, %o5	! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
	brgez,a	%o3, .beginmedloop
	add	%o1, %o3, %o1	! back up o1
	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2
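
	! A conceptual C model of the VIS faligndata used throughout the
	! loops below (hedged sketch; this machine is big-endian and
	! "align" stands for GSR.ALIGN, 0..7):
	!
	!	uint64_t
	!	faligndata_model(uint64_t hi, uint64_t lo, unsigned align)
	!	{
	!		if (align == 0)		/* avoid shift by 64 */
	!			return (hi);
	!		return ((hi << (8 * align)) |
	!		    (lo >> (8 * (8 - align))));
	!	}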
	sub	%o2, %o5, %o2	! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	subcc	%o5, 8, %o5	! update local count
	add	%o1, 8, %o1	! update SRC
	faligndata %d0, %d2, %d4
	subcc	%o5, 8, %o5	! update local count
	add	%o1, 16, %o1	! update SRC
	faligndata %d2, %d0, %d6
	subcc	%o5, 8, %o5	! update local count
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0	! update DST
	faligndata %d0, %d2, %d4
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	bgu	%ncc, .large	! otherwise, less than 16 bytes left
/* This code will use partial stores. */

	and	%o3, 7, %o3	! Number of bytes needed to completely
				! fill %d0 with good (unwritten) data.
	subcc	%o2, 8, %o2	! update count (maybe too much)
	addcc	%o3, %o5, %o5	! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3	! update o3 (# bad bytes in %d0)
	alignaddr %o3, %g0, %g0	! set GSR.ALIGN
	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3	! last byte to be stored in [%o0+%o3]
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0	! update DST to last stored byte
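
	! A conceptual C model of the partial store above (hedged sketch):
	! stda with ASI_PST8_P writes only the bytes of the 8-byte source
	! whose bit is set in the mask register (%o5 here); on this
	! big-endian machine, mask bit 7 should select the lowest-address
	! byte.
	!
	!	void
	!	partial_store_model(uint8_t *dst, const uint8_t src[8],
	!	    unsigned mask)
	!	{
	!		for (int i = 0; i < 8; i++)
	!			if (mask & (0x80 >> i))
	!				dst[i] = src[i];
	!	}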
	andcc	%o3, 7, %o5	! Number of bytes needed to completely
				! fill %d0 with good (unwritten) data.
	sub	%o5, 8, %o3	! -(number of good bytes in %d0)
	bl,a	%ncc, 3f	! Not enough bytes to fill %d0
	add	%o1, %o3, %o1	! Back up %o1
	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2
	wr	%o4, %g0, %fprs	! fprs = o4	restore fprs

	.align	ICACHE_LINE_SIZE
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
	sethi	%hi(BSTORE_SIZE), %o5
	or	%o5, %lo(BSTORE_SIZE), %o5
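
	! In C terms, the decision being set up here is roughly (hedged
	! sketch; BSTORE_SIZE is the define near the top of this file):
	!
	!	int
	!	use_block_store(size_t n)
	!	{
	!		return (n >= BSTORE_SIZE);
	!	}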
	! %o0 I/O	DST is 64-byte aligned
	! %o1 I/O	8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O	already loaded with SRC data from [%o1-8]
	! %o2 I/O	count (number of bytes that need to be written)
	! %o3 I		Not written.  If zero, then SRC is double aligned.
	! %o4 I		Not written.  Holds fprs.
	! %o5 O		The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST
	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5	! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5	! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	bne	%ncc, 2f	! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f		! is SRC double aligned?
	add	%o5, %o2, %o5	! %o5 was already set to 0 or -8
	membar	#StoreLoad|#StoreStore
	andn	%o5, 7, %o5	! 8 byte aligned count

	! This is when there is exactly 1 block remaining and SRC is aligned
	ldd	[%o1 + 0x10], %f6
	ldd	[%o1 + 0x18], %f8
	ldd	[%o1 + 0x20], %f10
	ldd	[%o1 + 0x28], %f12
	ldd	[%o1 + 0x30], %f14
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	! two nops here cause the loop starting at 1f below to be
	! on a cache line boundary, improving performance

	! %o0 I/O	DST is 64-byte aligned
	! %o1 I/O	8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O	already loaded with SRC data from [%o1-8]
	! %o2 I/O	count (number of bytes that need to be written)
	! %o3 I		Not written.  If zero, then SRC is double aligned.
	! %o4 I		Not written.  Holds fprs.
	! %o5 O		The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST
	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5	! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5	! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches;
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	bne	%ncc, 2f	! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f		! is SRC double aligned?
	add	%o5, %o2, %o5	! %o5 was already set to 0 or -8
	membar	#StoreLoad|#StoreStore
	andn	%o5, 7, %o5	! 8 byte aligned count

	! This is when there is exactly 1 block remaining and SRC is aligned
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	ldd	[%o1 + 0x18], %f8
	ldd	[%o1 + 0x20], %f10
	ldd	[%o1 + 0x28], %f12
	ldd	[%o1 + 0x30], %f14
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
, 2f
! exactly
1 block remaining?
1017 add %o0
, BLOCK_SIZE
, %o0
! update
DST
1018 brz
,a %o3
, 3f
! is SRC double aligned?
1022 add %o5
, %o2
, %o5
! %o5 was already set to
0 or -8
1025 membar
#StoreLoad|#StoreStore
1028 andn
%o5
, 7, %o5
! 8 byte aligned count
1031 ! This is when there is exactly
1 block remaining
and SRC is aligned
1033 ldd
[%o1
+ 0x8], %f4
1034 ldd
[%o1
+ 0x10], %f6
1036 ldd
[%o1
+ 0x18], %f8
1038 ldd
[%o1
+ 0x20], %f10
1040 ldd
[%o1
+ 0x28], %f12
1042 ldd
[%o1
+ 0x30], %f14
1047 stda
%f32
, [%o0
]ASI_BLK_P
1048 membar
#StoreLoad|#StoreStore