/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 */
#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#define	ICACHE_LINE_SIZE	64

#define	ALIGNED8_FPCOPY_THRESHOLD	1024
#define	ALIGNED4_FPCOPY_THRESHOLD	1024
#define	BST_THRESHOLD		65536

#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */

#define	N_READS_STRONG	20
#define	N_WRITES_STRONG	22
	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)
	ENTRY(memmove)
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
	nop
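/*
 * The two compares above implement the overlap test.  Roughly, in C
 * (a sketch of ours, not code from this library):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static int
 *	must_copy_backward(void *dst, const void *src, size_t n)
 *	{
 *		// A forward copy is unsafe only when dst starts inside
 *		// the source run: src < dst and dst - src < n.
 *		return ((uintptr_t)src < (uintptr_t)dst &&
 *		    (uintptr_t)dst - (uintptr_t)src < n);
 *	}
 */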
	! an overlapped copy that must be done "backwards"

	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	bgeu,pn	%ncc, .dbalign
	nop
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1		! decrease src address by 4
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0		! decrease dst address by 4
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2		! decrease count by 4
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
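/*
 * The backward loops above are, in effect, the header's reference
 * program run from the ends of both buffers (a sketch of ours):
 *
 *	void *
 *	memmove_backward(void *s, const void *s0, size_t n)
 *	{
 *		char *s1 = (char *)s + n;
 *		const char *s2 = (const char *)s0 + n;
 *		while (n-- != 0)
 *			*--s1 = *--s2;
 *		return (s);
 *	}
 */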
.dbalign:
	prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte
	! check for src long word alignment
	andcc	%o1, 7, %g0		! chk src long word alignment

	! Following code is for overlapping copies where src and dest
	! are long word aligned

	! For SPARC64-VI, prefetch is effective for both integer and fp register
	! operations.  There are no benefits in using the fp registers for
	! aligned data copying.

	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! go copy remaining bytes
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	! Following code is for overlapping copies where src and dest
	! are not long word aligned

	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it.  Checking it requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
	nop
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		! move 64 bytes backward in src
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		! move 64 bytes backward in dst
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes
.dbmv32enter:
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
	sub	%o0, 32, %o0		! move 32 bytes backward in dst
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		! move 32 bytes backward in src
	ldd	[%o5+8], %d6		! load 8 bytes
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2, 32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
.dbmv8:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs
.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1		! decrease src address by 4
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0		! decrease dst address by 4
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2		! decrease count by 4
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align	ICACHE_LINE_SIZE
	ENTRY(memcpy)
	! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0
.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0
.medium:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	and	%o3, 7, %o3		! bytes till SRC 8 byte aligned

	sub	%o5, %o3, %o3		! -(bytes till SRC aligned after DST aligned)
					! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2		! update count

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
	andcc	%o1, 0x3, %g0		! test alignment
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write
/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
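/*
 * The word-aligned path below amounts to an unrolled word copy.  A C
 * sketch of the idea (ours; the function name is illustrative only):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void
 *	copy_words_unrolled(uint32_t *dst, const uint32_t *src, size_t nwords)
 *	{
 *		// Copy four words per iteration, then mop up the tail.
 *		while (nwords >= 4) {
 *			dst[0] = src[0];
 *			dst[1] = src[1];
 *			dst[2] = src[2];
 *			dst[3] = src[3];
 *			dst += 4; src += 4; nwords -= 4;
 *		}
 *		while (nwords-- != 0)
 *			*dst++ = *src++;
 *	}
 */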
	andcc	%o1, 0x7, %g0		! test word alignment
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
	cmp	%o2, ALIGNED4_FPCOPY_THRESHOLD	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	subcc	%o2, 15, %o2		! adjust length to allow cc test
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	nop
.medw16:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
 * bytes.
 */
.medlword:				! long word aligned
					! length > ALIGNED8_FPCOPY_THRESHOLD?
	cmp	%o2, ALIGNED8_FPCOPY_THRESHOLD
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	nop
.medl32:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!                in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!            Let x denote a byte already copied to align DST
	!            Let . and - denote bytes not yet copied
	!            Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
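/*
 * Our reading of the computation described above, as C (a sketch of
 * ours, not code from this library; names are illustrative):
 *
 *	#include <stddef.h>
 *
 *	// n    = bytes remaining (o2)
 *	// skew = how much sooner SRC crosses an 8-byte boundary than
 *	//        DST does (o3, in [-7, 7])
 *	static size_t
 *	main_loop_bytes(size_t n, long skew)
 *	{
 *		long o5 = (long)n + skew;
 *		if (skew >= 0)
 *			o5 -= 8;	// don't read past the end of SRC
 *		return ((size_t)o5 & ~(size_t)7); // 8-byte aligned count
 *	}
 */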
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
.beginmedloop:
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	faligndata %d2, %d0, %d6
	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0+8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

	faligndata %d0, %d2, %d4
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	bgu	%ncc, .large		! otherwise, less than 16 bytes left
	/* This code will use partial stores.  */

	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	alignaddr %o3, %g0, %g0		! set GSR.ALIGN

	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
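/*
 * ASI_PST8_P stores only the bytes selected by a mask register.  A
 * plain-C sketch of the effect (ours; the bit-to-byte ordering shown
 * is our assumption, and the real code uses a single stda):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	partial_store8(uint8_t *dst, const uint8_t src[8], uint8_t mask)
 *	{
 *		for (int i = 0; i < 8; i++)
 *			if (mask & (0x80 >> i))	// bit 7 <-> lowest address
 *				dst[i] = src[i];
 *	}
 */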
	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1

	ldda	[%o1]ASI_FL8_P, %d2
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
	retl
	mov	%g1, %o0
	.align	ICACHE_LINE_SIZE
.large:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.
	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
	set	BST_THRESHOLD, %o5
	prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read
	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
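/*
 * The loop below is software pipelined: each pass stores the 64-byte
 * block assembled on the previous pass (%f32-%f46) while loading and
 * aligning the next one.  Schematically, in C (a sketch of ours):
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *
 *	static void
 *	pipelined_copy(char *dst, const char *src, size_t nblocks)
 *	{
 *		char prev[64], cur[64];
 *
 *		memcpy(prev, src, 64);			// prologue
 *		for (size_t i = 1; i < nblocks; i++) {
 *			memcpy(cur, src + i * 64, 64);	// load current
 *			memcpy(dst + (i - 1) * 64, prev, 64); // store previous
 *			memcpy(prev, cur, 64);
 *		}
 *		memcpy(dst + (nblocks - 1) * 64, prev, 64); // epilogue
 *	}
 */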
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2
2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8

	membar	#StoreLoad|#StoreStore

	andn	%o5, 7, %o5		! 8 byte aligned count

	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	! two nops here causes loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST
	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped prefetch
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8

	membar	#StoreLoad|#StoreStore

	andn	%o5, 7, %o5		! 8 byte aligned count

	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	faligndata %f10, %f12, %f42
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore