/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"
/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 */
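
/*
 * For reference, a rough C sketch of the overlap-safe copy that the
 * memmove() entry point below implements (illustrative only; the real
 * routine is heavily tuned and moves data in much larger units):
 *
 *	void *
 *	memmove(void *s, const void *s0, size_t n)
 *	{
 *		char *s1 = s;
 *		const char *s2 = s0;
 *		if (s1 <= s2 || s1 >= s2 + n) {
 *			while (n-- != 0)
 *				*s1++ = *s2++;
 *		} else {
 *			s1 += n;
 *			s2 += n;
 *			while (n-- != 0)
 *				*--s1 = *--s2;
 *		}
 *		return (s);
 *	}
 */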
#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE		64
#define	FPRS_FEF		0x4

#define	SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define	BSTORE_SIZE	256	/* min copy size for block store */
#endif
	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! if src address >= dst address,
	bgeu	%ncc, .forcpy	! copying forward is always safe
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size <= difference, regions are disjoint
	nop
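
	!
	! Rough C equivalent of the dispatch above (illustrative sketch):
	! a backward copy is needed only when DST lands inside [SRC, SRC + len).
	!
	!	if (src >= dst || (size_t)(dst - src) >= len)
	!		goto forward_copy;		(.forcpy)
	!	else
	!		goto backward_copy;		(.ovbc)
	!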
	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1	! save dest address for return val
	add	%o1, %o2, %o1	! get to end of source space
	add	%o0, %o2, %o0	! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2
.byte4loop:
	ldub	[%o1-1], %o3	! load last byte
	stb	%o3, [%o0-1]	! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3	! load 2nd from last byte
	stb	%o3, [%o0-2]	! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3	! load 3rd from last byte
	stb	%o3, [%o0+1]	! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3	! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]	! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2
	bz,pt	%ncc, .exit
.byteloop:
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o2		! decrement count
	bgu,pt	%ncc, .byteloop	! loop until done
	stb	%o3, [%o0]	! write byte
.exit:
	retl
	mov	%g1, %o0
	.align	16
.dbalign:
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

	! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	!
	! Following code is for overlapping copies where src and dest
	! are long word aligned
	!
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2		! adjust length to allow cc test
					! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions,
	! so set it unconditionally.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs		! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4		! load
	subcc	%o2, 64, %o2		! decrement length count
	std	%d4, [%o0-8]		! and store
	ldd	[%o1-16], %d2		! a block of 64 bytes
	sub	%o1, 64, %o1		! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0		! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2		! restore offset adjustment
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs		! fprs = o3  restore fprs
.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! branch if more bytes remain
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl
	mov	%g1, %o0
	!
	! Following code is for overlapping copies where src and dest
	! are not long word aligned
	!
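	! The misaligned loops below rely on two VIS operations:
	! alignaddr, which latches the source's byte offset in GSR.ALIGN
	! and rounds the pointer down to an 8-byte boundary, and
	! faligndata, which merges two adjacent aligned doublewords into
	! one properly shifted result.  Roughly, in C (illustrative only;
	! "lower"/"upper" are the doublewords at the lower and higher
	! aligned addresses, "off" is the nonzero 1..7 byte offset
	! latched by alignaddr):
	!
	!	uint64_t
	!	faligndata(uint64_t lower, uint64_t upper)
	!	{
	!		return ((lower << (8 * off)) | (upper >> (64 - 8 * off)));
	!	}
	!
	! so each 8-byte store consumes bytes from two neighboring loads.
	!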
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions,
	! so set it unconditionally.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes
	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2, 32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3  restore fprs
.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0
	SET_SIZE(memmove)
	.align	ICACHE_LINE_SIZE
	ENTRY(memcpy)
	! adjust instruction alignment
	nop				! do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0
	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop				! delay slot
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align	16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3 > 0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count
1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
2:
	andcc	%o1, 0x3, %g0		! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20
/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination are in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0		! test word alignment
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 15, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */
	.align	16
.medlword:				! long word aligned
					! length > SMALL_MAX
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop
	.align	16
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0		! set GSR.MASK for bshuffle

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!            in SRC compared to in DST
	!
	! Examples: Let # denote bytes that should not be accessed
	!           Let x denote a byte already copied to align DST
	!           Let . and - denote bytes not yet copied
	!           Let | denote double alignment boundaries
	!
	!           DST:  ######xx|........|--------|..######   o2 = 18
	!                         o0
	!
	! o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                    o1
	!
	! o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                         o1
	!
	! o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                          o1
	!
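	! In C terms the computation below is roughly (illustrative only):
	!
	!	o5 = (o2 + o3 + ((o3 >= 0) ? -8 : 0)) & ~7;
	!
	! i.e. round the count down to a multiple of 8, giving up one
	! doubleword when SRC is overaligned so the main loop never reads
	! past the end of SRC.
	!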
	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
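
	! The loop above primes %d0 byte by byte; in rough C terms
	! (illustrative only):
	!
	!	while (src & 7)
	!		d0 = (d0 << 8) | *src++;
	!
	! ldda ...ASI_FL8_P loads a single byte into %d2, and bshuffle
	! (with GSR.MASK set via bmask above) merges it into %d0 from
	! the right.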
.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0
.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left
#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1
1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif
.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4  restore fprs
	retl
	mov	%g1, %o0
	.align	ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
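	!
	! In effect (illustrative C sketch):
	!
	!	if (len > BSTORE_SIZE)
	!		goto xlarge;	! block stores via ASI_BLK_P
	!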
	sethi	%hi(BSTORE_SIZE), %o5
	or	%o5, %lo(BSTORE_SIZE), %o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21	! executed in branch delay slot
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count
	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0
	.align	16
	! two nops here cause the loop starting at 1f below to fall
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches;
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count
	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)