! Extracted from a gitweb view of: unleashed/tickless.git
! usr/src/lib/libc/capabilities/sun4u-opl/common/memcpy.s
! blob d7fe66ef94bcba285e7511012f042bac7c67145a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
26 .file "memcpy.s"
29 * memcpy(s1, s2, len)
31 * Copy s2 to s1, always copy n bytes.
32 * Note: this C code does not work for overlapped copies.
33 * Memmove() and bcopy() do.
35 * Fast assembler language version of the following C-program for memcpy
36 * which represents the `standard' for the C-library.
38 * void *
39 * memcpy(void *s, const void *s0, size_t n)
40 * {
41 * if (n != 0) {
42 * char *s1 = s;
43 * const char *s2 = s0;
44 * do {
45 * *s1++ = *s2++;
46 * } while (--n != 0);
47 * }
48 * return (s);
49 * }
52 #include <sys/asm_linkage.h>
53 #include <sys/sun4asi.h>
54 #include <sys/trap.h>
56 #define ICACHE_LINE_SIZE 64
57 #define BLOCK_SIZE 64
58 #define FPRS_FEF 0x4
60 #define ALIGNED8_FPCOPY_THRESHOLD 1024
61 #define ALIGNED4_FPCOPY_THRESHOLD 1024
62 #define BST_THRESHOLD 65536
64 #define SHORTCOPY 3
65 #define SMALL_MAX 64
66 #define MEDIUM_MAX 255
67 #define MED_WMAX 256 /* max copy for medium word-aligned case */
69 #define N_READS_STRONG 20
70 #define N_WRITES_STRONG 22
73 ANSI_PRAGMA_WEAK(memmove,function)
74 ANSI_PRAGMA_WEAK(memcpy,function)
76 ENTRY(memmove)
77 prefetch [%o1], N_READS_STRONG
78 prefetch [%o0], N_WRITES_STRONG
79 cmp %o1, %o0 ! if from address is >= to use forward copy
80 bgeu %ncc, .forcpy ! else use backward if ...
81 sub %o0, %o1, %o4 ! get difference of two addresses
82 cmp %o2, %o4 ! compare size and difference of addresses
83 bleu %ncc, .forcpy ! if size is bigger, do overlapped copy
84 nop
87 ! an overlapped copy that must be done "backwards"
89 .ovbc:
90 mov %o0, %g1 ! save dest address for return val
91 add %o1, %o2, %o1 ! get to end of source space
92 add %o0, %o2, %o0 ! get to end of destination space
94 cmp %o2, 64
95 bgeu,pn %ncc, .dbalign
96 nop
97 cmp %o2, 4
98 blt,pn %ncc, .byte
99 sub %o2, 3, %o2
100 .byte4loop:
101 ldub [%o1-1], %o3 ! load last byte
102 stb %o3, [%o0-1] ! store last byte
103 sub %o1, 4, %o1
104 ldub [%o1+2], %o3 ! load 2nd from last byte
105 stb %o3, [%o0-2] ! store 2nd from last byte
106 sub %o0, 4, %o0
107 ldub [%o1+1], %o3 ! load 3rd from last byte
108 stb %o3, [%o0+1] ! store 3rd from last byte
109 subcc %o2, 4, %o2
110 ldub [%o1], %o3 ! load 4th from last byte
111 bgu,pt %ncc, .byte4loop
112 stb %o3, [%o0] ! store 4th from last byte
113 .byte:
114 addcc %o2, 3, %o2
115 bz,pt %ncc, .exit
116 .byteloop:
117 dec %o1 ! decrement src address
118 ldub [%o1], %o3 ! read a byte
119 dec %o0 ! decrement dst address
120 deccc %o2 ! decrement count
121 bgu,pt %ncc, .byteloop ! loop until done
122 stb %o3, [%o0] ! write byte
123 .exit:
124 retl
125 mov %g1, %o0
127 .align 16
128 .dbalign:
129 prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
130 prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
131 andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned
132 bz,pt %ncc, .dbmed
133 sub %o2, %o5, %o2 ! update count
134 .dbalign1:
135 dec %o1 ! decrement src address
136 ldub [%o1], %o3 ! read a byte
137 dec %o0 ! decrement dst address
138 deccc %o5 ! decrement count
139 bgu,pt %ncc, .dbalign1 ! loop until done
140 stb %o3, [%o0] ! store a byte
142 ! check for src long word alignment
143 .dbmed:
144 andcc %o1, 7, %g0 ! chk src long word alignment
145 bnz,pn %ncc, .dbbck
148 ! Following code is for overlapping copies where src and dest
149 ! are long word aligned
152 ! For SPARC64-VI, prefetch is effective for both integer and fp register
153 ! operations. There are no benefits in using the fp registers for
154 ! aligned data copying.
156 .dbmedl32enter:
157 subcc %o2, 31, %o2 ! adjust length to allow cc test
158 ! for end of loop
159 ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32
161 .dbmedl32:
162 ldx [%o1-8], %o4 ! load
163 prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
164 subcc %o2, 32, %o2 ! decrement length count
165 stx %o4, [%o0-8] ! and store
166 prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
167 ldx [%o1-16], %o3 ! a block of 32 bytes
168 sub %o1, 32, %o1 ! decrease src ptr by 32
169 stx %o3, [%o0-16]
170 ldx [%o1+8], %o4
171 sub %o0, 32, %o0 ! decrease dst ptr by 32
172 stx %o4, [%o0+8]
173 ldx [%o1], %o3
174 bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left
175 stx %o3, [%o0]
176 .dbmedl31:
177 addcc %o2, 16, %o2 ! adjust remaining count
178 ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left
179 nop !
180 ldx [%o1-8], %o4 ! load and store 16 bytes
181 sub %o1, 16, %o1 ! decrease src ptr by 16
182 stx %o4, [%o0-8] !
183 sub %o2, 16, %o2 ! decrease count by 16
184 ldx [%o1], %o3 !
185 sub %o0, 16, %o0 ! decrease dst ptr by 16
186 stx %o3, [%o0]
187 .dbmedl15:
188 addcc %o2, 15, %o2 ! restore count
189 bz,pt %ncc, .dbexit ! exit if finished
191 cmp %o2, 8
192 blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left
194 ldx [%o1-8], %o4 ! load 8 bytes
195 sub %o1, 8, %o1 ! decrease src ptr by 8
196 stx %o4, [%o0-8] ! and store 8 bytes
197 subcc %o2, 8, %o2 ! decrease count by 8
198 bnz %ncc, .dbremain ! exit if finished
199 sub %o0, 8, %o0 ! decrease dst ptr by 8
200 retl
201 mov %g1, %o0
204 ! Following code is for overlapping copies where src and dest
205 ! are not long word aligned
207 .align 16
208 .dbbck:
209 rd %fprs, %o3 ! o3 = fprs
211 ! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
212 ! So set it anyway, without checking.
213 wr %g0, 0x4, %fprs ! fprs.fef = 1
215 alignaddr %o1, %g0, %o5 ! align src
216 ldd [%o5], %d0 ! get first 8 byte block
217 andn %o2, 7, %o4 ! prepare src ptr for finishup code
218 cmp %o2, 32
219 blt,pn %ncc, .dbmv8
220 sub %o1, %o4, %o1 !
221 cmp %o2, 4095 ! check for short memmoves
222 blt,pn %ncc, .dbmv32enter ! go to no prefetch code
223 .dbmv64:
224 ldd [%o5-8], %d2 ! load 8 bytes
225 ldd [%o5-16], %d4 ! load 8 bytes
226 sub %o5, 64, %o5 !
227 ldd [%o5+40], %d6 ! load 8 bytes
228 sub %o0, 64, %o0 !
229 ldd [%o5+32], %d8 ! load 8 bytes
230 sub %o2, 64, %o2 ! 64 less bytes to copy
231 ldd [%o5+24], %d18 ! load 8 bytes
232 cmp %o2, 64 ! do we have < 64 bytes remaining
233 ldd [%o5+16], %d28 ! load 8 bytes
234 ldd [%o5+8], %d30 ! load 8 bytes
235 faligndata %d2, %d0, %d10 ! extract 8 bytes out
236 prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
237 ldd [%o5], %d0 ! load 8 bytes
238 std %d10, [%o0+56] ! store the current 8 bytes
239 faligndata %d4, %d2, %d12 ! extract 8 bytes out
240 prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
241 std %d12, [%o0+48] ! store the current 8 bytes
242 faligndata %d6, %d4, %d14 ! extract 8 bytes out
243 std %d14, [%o0+40] ! store the current 8 bytes
244 faligndata %d8, %d6, %d16 ! extract 8 bytes out
245 std %d16, [%o0+32] ! store the current 8 bytes
246 faligndata %d18, %d8, %d20 ! extract 8 bytes out
247 std %d20, [%o0+24] ! store the current 8 bytes
248 faligndata %d28, %d18, %d22 ! extract 8 bytes out
249 std %d22, [%o0+16] ! store the current 8 bytes
250 faligndata %d30, %d28, %d24 ! extract 8 bytes out
251 std %d24, [%o0+8] ! store the current 8 bytes
252 faligndata %d0, %d30, %d26 ! extract 8 bytes out
253 bgeu,pt %ncc, .dbmv64
254 std %d26, [%o0] ! store the current 8 bytes
256 cmp %o2, 32
257 blt,pn %ncc, .dbmvx
259 .dbmv32:
260 ldd [%o5-8], %d2 ! load 8 bytes
261 .dbmv32enter:
262 ldd [%o5-16], %d4 ! load 8 bytes
263 sub %o5, 32, %o5 !
264 ldd [%o5+8], %d6 ! load 8 bytes
265 sub %o0, 32, %o0 !
266 faligndata %d2, %d0, %d10 ! extract 8 bytes out
267 ldd [%o5], %d0 ! load 8 bytes
268 sub %o2,32, %o2 ! 32 less bytes to copy
269 std %d10, [%o0+24] ! store the current 8 bytes
270 cmp %o2, 32 ! do we have < 32 bytes remaining
271 faligndata %d4, %d2, %d12 ! extract 8 bytes out
272 std %d12, [%o0+16] ! store the current 8 bytes
273 faligndata %d6, %d4, %d14 ! extract 8 bytes out
274 std %d14, [%o0+8] ! store the current 8 bytes
275 faligndata %d0, %d6, %d16 ! extract 8 bytes out
276 bgeu,pt %ncc, .dbmv32
277 std %d16, [%o0] ! store the current 8 bytes
278 .dbmvx:
279 cmp %o2, 8 ! do we have < 8 bytes remaining
280 blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code
282 .dbmv8:
283 ldd [%o5-8], %d2
284 sub %o0, 8, %o0 ! since we are at the end
285 ! when we first enter the loop
286 sub %o2, 8, %o2 ! 8 less bytes to copy
287 sub %o5, 8, %o5
288 cmp %o2, 8 ! do we have < 8 bytes remaining
289 faligndata %d2, %d0, %d8 ! extract 8 bytes out
290 std %d8, [%o0] ! store the current 8 bytes
291 bgeu,pt %ncc, .dbmv8
292 fmovd %d2, %d0
293 .dbmvfinish:
294 and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0
295 tst %o2
296 bz,pt %ncc, .dbexit
297 wr %o3, %g0, %fprs ! fprs = o3 restore fprs
299 .dbremain:
300 cmp %o2, 4
301 blt,pn %ncc, .dbbyte
303 ldub [%o1-1], %o3 ! load last byte
304 stb %o3, [%o0-1] ! store last byte
305 sub %o1, 4, %o1
306 ldub [%o1+2], %o3 ! load 2nd from last byte
307 stb %o3, [%o0-2] ! store 2nd from last byte
308 sub %o0, 4, %o0
309 ldub [%o1+1], %o3 ! load 3rd from last byte
310 stb %o3, [%o0+1] ! store 3rd from last byte
311 subcc %o2, 4, %o2
312 ldub [%o1], %o3 ! load 4th from last byte
313 stb %o3, [%o0] ! store 4th from last byte
314 bz,pt %ncc, .dbexit
315 .dbbyte:
316 dec %o1 ! decrement src address
317 ldub [%o1], %o3 ! read a byte
318 dec %o0 ! decrement dst address
319 deccc %o2 ! decrement count
320 bgu,pt %ncc, .dbbyte ! loop until done
321 stb %o3, [%o0] ! write byte
322 .dbexit:
323 retl
324 mov %g1, %o0
325 SET_SIZE(memmove)
328 .align ICACHE_LINE_SIZE
329 ENTRY(memcpy)
330 ! adjust instruction alignment
331 nop ! Do not remove, these nops affect
332 nop ! icache alignment and performance
333 .forcpy:
334 prefetch [%o1], N_READS_STRONG
335 prefetch [%o0], N_WRITES_STRONG
336 cmp %o2, SMALL_MAX ! check for not small case
337 bgu,pn %ncc, .medium ! go to larger cases
338 mov %o0, %g1 ! save %o0
339 cmp %o2, SHORTCOPY ! check for really short case
340 ble,pt %ncc, .smallleft !
341 or %o0, %o1, %o3 ! prepare alignment check
342 andcc %o3, 0x3, %g0 ! test for alignment
343 bz,pt %ncc, .smallword ! branch to word aligned case
344 sub %o2, 3, %o2 ! adjust count to allow cc zero test
345 .smallnotalign4:
346 ldub [%o1], %o3 ! read byte
347 subcc %o2, 4, %o2 ! reduce count by 4
348 stb %o3, [%o0] ! write byte
349 ldub [%o1+1], %o3 ! repeat for a total of 4 bytes
350 add %o1, 4, %o1 ! advance SRC by 4
351 stb %o3, [%o0+1]
352 ldub [%o1-2], %o3
353 add %o0, 4, %o0 ! advance DST by 4
354 stb %o3, [%o0-2]
355 ldub [%o1-1], %o3
356 bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain
357 stb %o3, [%o0-1]
358 add %o2, 3, %o2 ! restore count
359 .smallleft:
360 tst %o2
361 bz,pt %ncc, .smallexit
363 .smallleft3: ! 1, 2, or 3 bytes remain
364 ldub [%o1], %o3 ! load one byte
365 deccc %o2 ! reduce count for cc test
366 bz,pt %ncc, .smallexit
367 stb %o3, [%o0] ! store one byte
368 ldub [%o1+1], %o3 ! load second byte
369 deccc %o2
370 bz,pt %ncc, .smallexit
371 stb %o3, [%o0+1] ! store second byte
372 ldub [%o1+2], %o3 ! load third byte
373 stb %o3, [%o0+2] ! store third byte
374 retl
375 mov %g1, %o0 ! restore %o0
377 .align 16
378 nop ! affects loop icache alignment
379 .smallwords:
380 lduw [%o1], %o3 ! read word
381 .smallwordx:
382 subcc %o2, 8, %o2 ! update count
383 stw %o3, [%o0] ! write word
384 add %o1, 8, %o1 ! update SRC
385 lduw [%o1-4], %o3 ! read word
386 add %o0, 8, %o0 ! update DST
387 bgu,pt %ncc, .smallwords ! loop until done
388 stw %o3, [%o0-4] ! write word
389 addcc %o2, 7, %o2 ! restore count
390 bz,pt %ncc, .smallexit ! check for completion
392 cmp %o2, 4 ! check for 4 or more bytes left
393 blt .smallleft3 ! if not, go to finish up
395 lduw [%o1], %o3
396 add %o1, 4, %o1
397 subcc %o2, 4, %o2
398 stw %o3, [%o0]
399 add %o0, 4, %o0
400 bnz,pt %ncc, .smallleft3
402 retl
403 mov %g1, %o0 ! restore %o0
405 .smallword:
406 subcc %o2, 4, %o2 ! update count
407 bgu,pt %ncc, .smallwordx
408 lduw [%o1], %o3 ! read word
409 addcc %o2, 3, %o2 ! restore count
410 bz,pt %ncc, .smallexit
411 stw %o3, [%o0] ! write word
412 deccc %o2 ! reduce count for cc test
413 ldub [%o1+4], %o3 ! load one byte
414 bz,pt %ncc, .smallexit
415 stb %o3, [%o0+4] ! store one byte
416 ldub [%o1+5], %o3 ! load second byte
417 deccc %o2
418 bz,pt %ncc, .smallexit
419 stb %o3, [%o0+5] ! store second byte
420 ldub [%o1+6], %o3 ! load third byte
421 stb %o3, [%o0+6] ! store third byte
422 .smallexit:
423 retl
424 mov %g1, %o0 ! restore %o0
425 .align 16
426 .medium:
427 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
428 prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
429 neg %o0, %o5
430 neg %o1, %o3
431 andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
432 and %o3, 7, %o3 ! bytes till SRC 8 byte aligned
434 bz %ncc, 2f
435 sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned)
436 ! o3={-7, -6, ... 7} o3>0 => SRC overaligned
438 sub %o2, %o5, %o2 ! update count
441 ldub [%o1], %o4
442 deccc %o5
443 inc %o1
444 stb %o4, [%o0]
445 bgu,pt %ncc, 1b
446 inc %o0
448 ! Now DST is 8-byte aligned. o0, o1, o2 are current.
451 andcc %o1, 0x3, %g0 ! test alignment
452 prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
453 bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases
454 ! if src, dst not aligned
455 prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write
458 * Handle all cases where src and dest are aligned on word
459 * or long word boundaries. Use unrolled loops for better
460 * performance. This option wins over standard large data
461 * move when source and destination is in cache for medium
462 * to short data moves.
464 andcc %o1, 0x7, %g0 ! test word alignment
465 prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
466 bz,pt %ncc, .medlword ! branch to long word aligned case
467 prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
468 cmp %o2, ALIGNED4_FPCOPY_THRESHOLD ! limit to store buffer size
469 bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
470 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
471 subcc %o2, 15, %o2 ! adjust length to allow cc test
472 prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
473 ! for end of loop
474 ble,pt %ncc, .medw15 ! skip big loop if less than 16
475 .empty
476 .medw16:
477 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
478 ld [%o1], %o4 ! load
479 subcc %o2, 16, %o2 ! decrement length count
480 prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
481 stw %o4, [%o0] ! and store
482 ld [%o1+4], %o3 ! a block of 16 bytes
483 add %o1, 16, %o1 ! increase src ptr by 16
484 stw %o3, [%o0+4]
485 ld [%o1-8], %o4
486 add %o0, 16, %o0 ! increase dst ptr by 16
487 stw %o4, [%o0-8]
488 ld [%o1-4], %o3
489 bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left
490 stw %o3, [%o0-4]
491 .medw15:
492 addcc %o2, 15, %o2 ! restore count
493 bz,pt %ncc, .medwexit ! exit if finished
495 cmp %o2, 8
496 blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
497 nop !
498 ld [%o1], %o4 ! load 4 bytes
499 subcc %o2, 8, %o2 ! decrease count by 8
500 stw %o4, [%o0] ! and store 4 bytes
501 add %o1, 8, %o1 ! increase src ptr by 8
502 ld [%o1-4], %o3 ! load 4 bytes
503 add %o0, 8, %o0 ! increase dst ptr by 8
504 stw %o3, [%o0-4] ! and store 4 bytes
505 bz %ncc, .medwexit ! exit if finished
507 .medw7: ! count is ge 1, less than 8
508 cmp %o2, 3 ! check for 4 bytes left
509 ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left
510 nop !
511 ld [%o1], %o4 ! load 4 bytes
512 sub %o2, 4, %o2 ! decrease count by 4
513 add %o1, 4, %o1 ! increase src ptr by 4
514 stw %o4, [%o0] ! and store 4 bytes
515 add %o0, 4, %o0 ! increase dst ptr by 4
516 tst %o2 ! check for zero bytes left
517 bz %ncc, .medwexit ! exit if finished
519 .medw3: ! count is known to be 1, 2, or 3
520 deccc %o2 ! reduce count by one
521 ldub [%o1], %o3 ! load one byte
522 bz,pt %ncc, .medwexit ! exit if last byte
523 stb %o3, [%o0] ! store one byte
524 ldub [%o1+1], %o3 ! load second byte
525 deccc %o2 ! reduce count by one
526 bz,pt %ncc, .medwexit ! exit if last byte
527 stb %o3, [%o0+1] ! store second byte
528 ldub [%o1+2], %o3 ! load third byte
529 stb %o3, [%o0+2] ! store third byte
530 .medwexit:
531 retl
532 mov %g1, %o0 ! restore %o0
535 * Special case for handling when src and dest are both long word aligned
536 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
537 * bytes.
540 .align 16
542 .medlword: ! long word aligned
543 ! length > ALIGNED8_FPCOPY_THRESHOLD
544 cmp %o2, ALIGNED8_FPCOPY_THRESHOLD
545 bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
546 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
547 prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
548 subcc %o2, 31, %o2 ! adjust length to allow cc test
549 ! for end of loop
550 ble,pt %ncc, .medl31 ! skip big loop if less than 32
551 .empty
552 .medl32:
553 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
554 ldx [%o1], %o4 ! load
555 subcc %o2, 32, %o2 ! decrement length count
556 prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read
557 stx %o4, [%o0] ! and store
558 ldx [%o1+8], %o3 ! a block of 32 bytes
559 add %o1, 32, %o1 ! increase src ptr by 32
560 stx %o3, [%o0+8]
561 ldx [%o1-16], %o4
562 add %o0, 32, %o0 ! increase dst ptr by 32
563 stx %o4, [%o0-16]
564 ldx [%o1-8], %o3
565 bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left
566 stx %o3, [%o0-8]
567 .medl31:
568 addcc %o2, 16, %o2 ! adjust remaining count
569 ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
570 nop !
571 ldx [%o1], %o4 ! load and store 16 bytes
572 add %o1, 16, %o1 ! increase src ptr by 16
573 stx %o4, [%o0] !
574 sub %o2, 16, %o2 ! decrease count by 16
575 ldx [%o1-8], %o3 !
576 add %o0, 16, %o0 ! increase dst ptr by 16
577 stx %o3, [%o0-8]
578 .medl15:
579 addcc %o2, 15, %o2 ! restore count
580 bz,pt %ncc, .medwexit ! exit if finished
582 cmp %o2, 8
583 blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
585 ldx [%o1], %o4 ! load 8 bytes
586 add %o1, 8, %o1 ! increase src ptr by 8
587 stx %o4, [%o0] ! and store 8 bytes
588 subcc %o2, 8, %o2 ! decrease count by 8
589 bz %ncc, .medwexit ! exit if finished
590 add %o0, 8, %o0 ! increase dst ptr by 8
591 ba .medw7
594 .align 16
598 .mediumsetup:
599 prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
600 prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
601 .mediumrejoin:
602 rd %fprs, %o4 ! check for unused FPU
604 add %o1, 8, %o1 ! prepare to round SRC upward
606 sethi %hi(0x1234567f), %o5 ! For GSR.MASK
607 or %o5, 0x67f, %o5
609 andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0
610 bz,a %ncc, 3f
611 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
613 cmp %o2, MEDIUM_MAX
614 bmask %o5, %g0, %g0
616 ! Compute o5 (number of bytes that need copying using the main loop).
617 ! First, compute for the medium case.
618 ! Then, if large case, o5 is replaced by count for block alignment.
619 ! Be careful not to read past end of SRC
620 ! Currently, o2 is the actual count remaining
621 ! o3 is how much sooner we'll cross the alignment boundary
622 ! in SRC compared to in DST
624 ! Examples: Let # denote bytes that should not be accessed
625 ! Let x denote a byte already copied to align DST
626 ! Let . and - denote bytes not yet copied
627 ! Let | denote double alignment boundaries
629 ! DST: ######xx|........|--------|..###### o2 = 18
630 ! o0
632 ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8
633 ! o1
635 ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8
636 ! o1
638 ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8
639 ! o1
641 or %g0, -8, %o5
642 alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1
644 movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0
645 add %o5, %o2, %o5
646 add %o5, %o3, %o5
648 bleu %ncc, 4f
649 andn %o5, 7, %o5 ! 8 byte aligned count
650 neg %o0, %o5 ! 'large' case
651 and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
653 brgez,a %o3, .beginmedloop
654 ldd [%o1-8], %d0
656 add %o1, %o3, %o1 ! back up o1
658 ldda [%o1]ASI_FL8_P, %d2
659 inc %o1
660 andcc %o1, 7, %g0
661 bnz %ncc, 5b
662 bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
664 .beginmedloop:
665 tst %o5
666 bz %ncc, .endmedloop
667 sub %o2, %o5, %o2 ! update count for later
669 ! Main loop to write out doubles. Note: o5 & 7 == 0
671 ldd [%o1], %d2
672 subcc %o5, 8, %o5 ! update local count
673 bz,pn %ncc, 1f
674 add %o1, 8, %o1 ! update SRC
676 .medloop:
677 faligndata %d0, %d2, %d4
678 ldd [%o1], %d0
679 subcc %o5, 8, %o5 ! update local count
680 add %o1, 16, %o1 ! update SRC
681 std %d4, [%o0]
682 bz,pn %ncc, 2f
683 faligndata %d2, %d0, %d6
684 ldd [%o1 - 8], %d2
685 subcc %o5, 8, %o5 ! update local count
686 std %d6, [%o0 + 8]
687 bnz,pt %ncc, .medloop
688 add %o0, 16, %o0 ! update DST
691 faligndata %d0, %d2, %d4
692 fmovd %d2, %d0
693 std %d4, [%o0]
694 ba .endmedloop
695 add %o0, 8, %o0
698 std %d6, [%o0 + 8]
699 sub %o1, 8, %o1
700 add %o0, 16, %o0
703 .endmedloop:
704 ! Currently, o1 is pointing to the next double-aligned byte in SRC
705 ! The 8 bytes starting at [o1-8] are available in d0
706 ! At least one, and possibly all, of these need to be written.
708 cmp %o2, BLOCK_SIZE
709 bgu %ncc, .large ! otherwise, less than 16 bytes left
711 #if 0
713 /* This code will use partial stores. */
715 mov %g0, %o5
716 and %o3, 7, %o3 ! Number of bytes needed to completely
717 ! fill %d0 with good (unwritten) data.
719 subcc %o2, 8, %o2 ! update count (maybe too much)
720 movl %ncc, %o2, %o5
721 addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0
722 sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0)
724 bz %ncc, 2f
725 alignaddr %o3, %g0, %g0 ! set GSR.ALIGN
728 deccc %o5
729 ldda [%o1]ASI_FL8_P, %d2
730 inc %o1
731 bgu %ncc, 1b
732 bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
735 not %o3
736 faligndata %d0, %d0, %d0 ! shift bytes to the left
737 and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3]
738 edge8n %g0, %o3, %o5
739 stda %d0, [%o0]%o5, ASI_PST8_P
740 brlez %o2, .mediumexit
741 add %o0, %o3, %o0 ! update DST to last stored byte
743 inc %o0
744 deccc %o2
745 ldub [%o1], %o3
746 stb %o3, [%o0]
747 bgu %ncc, 3b
748 inc %o1
750 #else
752 andcc %o3, 7, %o5 ! Number of bytes needed to completely
753 ! fill %d0 with good (unwritten) data.
754 bz %ncc, 2f
755 sub %o5, 8, %o3 ! -(number of good bytes in %d0)
756 cmp %o2, 8
757 bl,a %ncc, 3f ! Not enough bytes to fill %d0
758 add %o1, %o3, %o1 ! Back up %o1
761 deccc %o5
762 ldda [%o1]ASI_FL8_P, %d2
763 inc %o1
764 bgu %ncc, 1b
765 bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
768 subcc %o2, 8, %o2
769 std %d0, [%o0]
770 bz %ncc, .mediumexit
771 add %o0, 8, %o0
773 ldub [%o1], %o3
774 deccc %o2
775 inc %o1
776 stb %o3, [%o0]
777 bgu %ncc, 3b
778 inc %o0
779 #endif
781 .mediumexit:
782 wr %o4, %g0, %fprs ! fprs = o4 restore fprs
783 retl
784 mov %g1, %o0
787 .align ICACHE_LINE_SIZE
788 .large:
790 ! %o0 I/O DST is 64-byte aligned
791 ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
792 ! %d0 I/O already loaded with SRC data from [%o1-8]
793 ! %o2 I/O count (number of bytes that need to be written)
794 ! %o3 I Not written. If zero, then SRC is double aligned.
795 ! %o4 I Not written. Holds fprs.
796 ! %o5 O The number of doubles that remain to be written.
798 ! Load the rest of the current block
799 ! Recall that %o1 is further into SRC than %o0 is into DST
801 prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
802 prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
804 set BST_THRESHOLD, %o5
805 cmp %o2, %o5
806 bgu,pn %icc, .xlarge
807 prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read
809 ldd [%o1], %f2
810 ldd [%o1 + 0x8], %f4
811 faligndata %f0, %f2, %f32
812 ldd [%o1 + 0x10], %f6
813 faligndata %f2, %f4, %f34
814 ldd [%o1 + 0x18], %f8
815 faligndata %f4, %f6, %f36
816 ldd [%o1 + 0x20], %f10
817 or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
818 faligndata %f6, %f8, %f38
819 prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
820 ldd [%o1 + 0x28], %f12
821 movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed lter)
822 faligndata %f8, %f10, %f40
823 ldd [%o1 + 0x30], %f14
824 faligndata %f10, %f12, %f42
825 ldd [%o1 + 0x38], %f0
826 sub %o2, BLOCK_SIZE, %o2 ! update count
827 add %o1, BLOCK_SIZE, %o1 ! update SRC
829 ! Main loop. Write previous block. Load rest of current block.
830 ! Some bytes will be loaded that won't yet be written.
832 ldd [%o1], %f2
833 faligndata %f12, %f14, %f44
834 ldd [%o1 + 0x8], %f4
835 faligndata %f14, %f0, %f46
836 std %f32, [%o0]
837 std %f34, [%o0+8]
838 std %f36, [%o0+16]
839 std %f38, [%o0+24]
840 std %f40, [%o0+32]
841 std %f42, [%o0+40]
842 std %f44, [%o0+48]
843 std %f46, [%o0+56]
844 sub %o2, BLOCK_SIZE, %o2 ! update count
845 prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
846 add %o0, BLOCK_SIZE, %o0 ! update DST
847 ldd [%o1 + 0x10], %f6
848 faligndata %f0, %f2, %f32
849 ldd [%o1 + 0x18], %f8
850 faligndata %f2, %f4, %f34
851 ldd [%o1 + 0x20], %f10
852 faligndata %f4, %f6, %f36
853 ldd [%o1 + 0x28], %f12
854 faligndata %f6, %f8, %f38
855 ldd [%o1 + 0x30], %f14
856 faligndata %f8, %f10, %f40
857 ldd [%o1 + 0x38], %f0
858 faligndata %f10, %f12, %f42
859 prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
860 cmp %o2, BLOCK_SIZE + 8
861 prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
862 bgu,pt %ncc, 1b
863 add %o1, BLOCK_SIZE, %o1 ! update SRC
864 faligndata %f12, %f14, %f44
865 faligndata %f14, %f0, %f46
866 stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
867 cmp %o2, BLOCK_SIZE
868 bne %ncc, 2f ! exactly 1 block remaining?
869 add %o0, BLOCK_SIZE, %o0 ! update DST
870 brz,a %o3, 3f ! is SRC double aligned?
871 ldd [%o1], %f2
874 add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
875 add %o5, %o3, %o5
877 membar #StoreLoad|#StoreStore
879 ba .beginmedloop
880 andn %o5, 7, %o5 ! 8 byte aligned count
883 ! This is when there is exactly 1 block remaining and SRC is aligned
885 ldd [%o1 + 0x8], %f4
886 ldd [%o1 + 0x10], %f6
887 fsrc1 %f0, %f32
888 ldd [%o1 + 0x18], %f8
889 fsrc1 %f2, %f34
890 ldd [%o1 + 0x20], %f10
891 fsrc1 %f4, %f36
892 ldd [%o1 + 0x28], %f12
893 fsrc1 %f6, %f38
894 ldd [%o1 + 0x30], %f14
895 fsrc1 %f8, %f40
896 fsrc1 %f10, %f42
897 fsrc1 %f12, %f44
898 fsrc1 %f14, %f46
899 stda %f32, [%o0]ASI_BLK_P
900 membar #StoreLoad|#StoreStore
901 wr %o4, 0, %fprs
902 retl
903 mov %g1, %o0
906 .align 16
907 ! two nops here causes loop starting at 1f below to be
908 ! on a cache line boundary, improving performance
911 .xlarge:
912 ! %o0 I/O DST is 64-byte aligned
913 ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
914 ! %d0 I/O already loaded with SRC data from [%o1-8]
915 ! %o2 I/O count (number of bytes that need to be written)
916 ! %o3 I Not written. If zero, then SRC is double aligned.
917 ! %o4 I Not written. Holds fprs.
918 ! %o5 O The number of doubles that remain to be written.
920 ! Load the rest of the current block
921 ! Recall that %o1 is further into SRC than %o0 is into DST
923 ldd [%o1], %f2
924 ldd [%o1 + 0x8], %f4
925 faligndata %f0, %f2, %f32
926 ldd [%o1 + 0x10], %f6
927 faligndata %f2, %f4, %f34
928 ldd [%o1 + 0x18], %f8
929 faligndata %f4, %f6, %f36
930 ldd [%o1 + 0x20], %f10
931 or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
932 faligndata %f6, %f8, %f38
933 ldd [%o1 + 0x28], %f12
934 movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
935 prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
936 faligndata %f8, %f10, %f40
937 ldd [%o1 + 0x30], %f14
938 faligndata %f10, %f12, %f42
939 ldd [%o1 + 0x38], %f0
940 prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
941 sub %o2, BLOCK_SIZE, %o2 ! update count
942 add %o1, BLOCK_SIZE, %o1 ! update SRC
944 ! This point is 32-byte aligned since 24 instructions appear since
945 ! the previous alignment directive.
948 ! Main loop. Write previous block. Load rest of current block.
949 ! Some bytes will be loaded that won't yet be written.
951 ldd [%o1], %f2
952 faligndata %f12, %f14, %f44
953 ldd [%o1 + 0x8], %f4
954 faligndata %f14, %f0, %f46
955 stda %f32, [%o0]ASI_BLK_P
956 sub %o2, BLOCK_SIZE, %o2 ! update count
957 ldd [%o1 + 0x10], %f6
958 faligndata %f0, %f2, %f32
959 ldd [%o1 + 0x18], %f8
960 faligndata %f2, %f4, %f34
961 ldd [%o1 + 0x20], %f10
962 faligndata %f4, %f6, %f36
963 ldd [%o1 + 0x28], %f12
964 faligndata %f6, %f8, %f38
965 ldd [%o1 + 0x30], %f14
966 prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
967 faligndata %f8, %f10, %f40
968 ldd [%o1 + 0x38], %f0
969 faligndata %f10, %f12, %f42
970 prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
971 add %o0, BLOCK_SIZE, %o0 ! update DST
972 cmp %o2, BLOCK_SIZE + 8
973 ! second prefetch important to correct for occasional dropped
974 prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
975 bgu,pt %ncc, 1b
976 add %o1, BLOCK_SIZE, %o1 ! update SRC
978 faligndata %f12, %f14, %f44
979 faligndata %f14, %f0, %f46
980 stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
981 cmp %o2, BLOCK_SIZE
982 bne %ncc, 2f ! exactly 1 block remaining?
983 add %o0, BLOCK_SIZE, %o0 ! update DST
984 brz,a %o3, 3f ! is SRC double aligned?
985 ldd [%o1], %f2
988 add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
989 add %o5, %o3, %o5
991 membar #StoreLoad|#StoreStore
993 ba .beginmedloop
994 andn %o5, 7, %o5 ! 8 byte aligned count
997 ! This is when there is exactly 1 block remaining and SRC is aligned
999 ldd [%o1 + 0x8], %f4
1000 ldd [%o1 + 0x10], %f6
1001 fsrc1 %f0, %f32
1002 ldd [%o1 + 0x18], %f8
1003 fsrc1 %f2, %f34
1004 ldd [%o1 + 0x20], %f10
1005 fsrc1 %f4, %f36
1006 ldd [%o1 + 0x28], %f12
1007 fsrc1 %f6, %f38
1008 ldd [%o1 + 0x30], %f14
1009 fsrc1 %f8, %f40
1010 fsrc1 %f10, %f42
1011 fsrc1 %f12, %f44
1012 fsrc1 %f14, %f46
1013 stda %f32, [%o0]ASI_BLK_P
1014 membar #StoreLoad|#StoreStore
1015 wr %o4, 0, %fprs
1016 retl
1017 mov %g1, %o0
1019 SET_SIZE(memcpy)