/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-2.
   Copyright (C) 2007, 2008 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@davemloft.net)
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_BLK_P		0xf0
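
/* ASI_BLK_P gives ordinary 64-byte block loads and stores (ldda/stda).
   ASI_BLK_INIT_QUAD_LDD_P is the block-init variant: on Niagara class
   chips a store through it establishes the destination cache line
   without first fetching its old contents, which is what we want when
   the entire line is about to be overwritten.  */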
#define FPRS_FEF		0x04

#define VISEntryHalf \
	rd	%fprs, %o5; \
	wr	%g0, FPRS_FEF, %fprs

#define VISExitHalf \
	and	%o5, FPRS_FEF, %o5; \
	wr	%o5, 0x0, %fprs
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P

#define LOAD(type,addr,dest)	type [addr], dest
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#define STORE(type,src,addr)	type src, [addr]
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
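
/* The LOAD/STORE wrappers keep the instruction and ASI choices in one
   place.  The block copy loops below issue STORE_INIT(%g0, line) once
   per destination cache line: a single stxa of zero through the init
   ASI claims the line cheaply, and the STORE_BLK that follows then
   fills in all 64 bytes.  */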
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;
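
/* FREG_FROB funnels nine source doubles (one 64-byte block plus the
   first double of the next one) through faligndata, using the byte
   offset latched earlier by alignaddr, leaving 64 bytes of
   destination-aligned data in %f0-%f14.  */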
#define FREG_MOVE_1(x0) \
	fmovd		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12; \
	fmovd		%x7, %f14;
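
/* After each 64-byte iteration, FREG_MOVE_N slides the freshly loaded
   doubles from the upper half of the register file (%f16 and up) down
   to %f0 onwards, where the next FREG_FROB expects its carried-over
   input.  */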
#define FREG_LOAD_1(base, x0) \
	LOAD(ldd, base + 0x00, %x0)
#define FREG_LOAD_2(base, x0, x1) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1);
#define FREG_LOAD_3(base, x0, x1, x2) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2);
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4); \
	LOAD(ldd, base + 0x28, %x5);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4); \
	LOAD(ldd, base + 0x28, %x5); \
	LOAD(ldd, base + 0x30, %x6);
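
/* FREG_LOAD_N preloads the N doubles of the leading, partially
   consumed source block, so the first FREG_FROB of each unrolled loop
   below has a full window to realign from.  */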
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch
100:	/* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %g5
	cmp		%o2, 0
	be,pn		%XCC, 85f
218:	 or		%o0, %o1, %o3
	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
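
	/* Illustrative C sketch of that sizing logic (not part of the
	 * build; copy_fwd_bytes and block_copy are hypothetical helpers
	 * standing in for the assembly below):
	 *
	 *	if (len >= 4 * 64) {
	 *		size_t pad = -(uintptr_t) dst & 63;  // 0..63 bytes
	 *		copy_fwd_bytes (dst, src, pad);      // align dst
	 *		block_copy (dst + pad, src + pad,
	 *			    (len - pad) & ~(size_t) 63);
	 *		// any sub-64-byte tail is copied separately
	 *	}
	 */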
	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf.
	 */
	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)
	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	alignaddr	%o1, %g0, %g0
	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
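
	/* The source low bits (%g2) select one of the unrolled loops
	 * below: 110 preloads a whole aligned block for offsets in
	 * (0, 8), 120 preloads 7 doubles for [8, 16), and so on down
	 * to 180, which preloads a single double for [56, 64).  190
	 * handles the mutually aligned case with plain block moves.  */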
4:	/* 32 <= low bits < 48 */
5:	/* 0 < low bits < 32 */
6:	/* 0 < low bits < 16 */
	/* fall through for 0 < low bits < 8 */
110:	sub		%o4, 64, %g2
	LOAD_BLK(%g2, %f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
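
	/* Cases 120-180 below repeat this same loop shape: claim the
	 * destination line with STORE_INIT, load the next 64 source
	 * bytes into %f16-%f30, realign with FREG_FROB, write the line
	 * with a block store, slide the lookahead registers down, and
	 * prefetch ahead.  */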
120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%XCC, 195f
	 nop
190:
1:	STORE_INIT(%g0, %o4 + %g3)
	subcc		%g1, 64, %g1
	LOAD_BLK(%o4, %f0)
	STORE_BLK(%f0, %o4 + %g3)
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:	add		%o4, %g3, %o0
	membar		#Sync
	VISExitHalf
	/* %o2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
75:	/* 16 < len <= 64 */
1:	subcc		%o4, 0x10, %o4
	STORE(stx, %o5, %o1 + %o3)
	STORE(stx, %g1, %o1 + %o3)
73:	andcc		%o2, 0x8, %g0
	STORE(stx, %o5, %o1 + %o3)
1:	andcc		%o2, 0x4, %g0
	STORE(stw, %o5, %o1 + %o3)
	STORE(stb, %o5, %o1 + %o3)
80:	/* 0 < len <= 16 */
	STORE(stw, %g1, %o1 + %o3)
	STORE(stb, %g1, %o1 + %o3)
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx		[%src - offset - 0x20], %t0; \
	ldx		[%src - offset - 0x18], %t1; \
	ldx		[%src - offset - 0x10], %t2; \
	ldx		[%src - offset - 0x08], %t3; \
	stw		%t0, [%dst - offset - 0x1c]; \
	srlx		%t0, 32, %t0; \
	stw		%t0, [%dst - offset - 0x20]; \
	stw		%t1, [%dst - offset - 0x14]; \
	srlx		%t1, 32, %t1; \
	stw		%t1, [%dst - offset - 0x18]; \
	stw		%t2, [%dst - offset - 0x0c]; \
	srlx		%t2, 32, %t2; \
	stw		%t2, [%dst - offset - 0x10]; \
	stw		%t3, [%dst - offset - 0x04]; \
	srlx		%t3, 32, %t3; \
	stw		%t3, [%dst - offset - 0x08];
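
/* RMOVE_BIGCHUNK copies 32 bytes backwards when the destination is
   only word aligned: each 64-bit load is split into two 32-bit
   stores, with srlx exposing the high word.  */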
#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx		[%src - offset - 0x20], %t0; \
	ldx		[%src - offset - 0x18], %t1; \
	ldx		[%src - offset - 0x10], %t2; \
	ldx		[%src - offset - 0x08], %t3; \
	stx		%t0, [%dst - offset - 0x20]; \
	stx		%t1, [%dst - offset - 0x18]; \
	stx		%t2, [%dst - offset - 0x10]; \
	stx		%t3, [%dst - offset - 0x08]; \
	ldx		[%src - offset - 0x40], %t0; \
	ldx		[%src - offset - 0x38], %t1; \
	ldx		[%src - offset - 0x30], %t2; \
	ldx		[%src - offset - 0x28], %t3; \
	stx		%t0, [%dst - offset - 0x40]; \
	stx		%t1, [%dst - offset - 0x38]; \
	stx		%t2, [%dst - offset - 0x30]; \
	stx		%t3, [%dst - offset - 0x28];
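
/* RMOVE_BIGALIGNCHUNK moves 64 bytes backwards per invocation when
   source and destination are both doubleword aligned, using straight
   ldx/stx pairs.  */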
#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx		[%src + offset + 0x00], %t0; \
	ldx		[%src + offset + 0x08], %t1; \
	stw		%t0, [%dst + offset + 0x04]; \
	srlx		%t0, 32, %t2; \
	stw		%t2, [%dst + offset + 0x00]; \
	stw		%t1, [%dst + offset + 0x0c]; \
	srlx		%t1, 32, %t3; \
	stw		%t3, [%dst + offset + 0x08];
#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \
	ldx		[%src + offset + 0x00], %t0; \
	ldx		[%src + offset + 0x08], %t1; \
	stx		%t0, [%dst + offset + 0x00]; \
	stx		%t1, [%dst + offset + 0x08];
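
/* The LASTCHUNK variants mop up 16 bytes apiece once fewer than 0x80
   bytes remain; the memmove code below jumps into the middle of an
   unrolled run of them so that exactly as many execute as are still
   needed.  */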
228:	andcc		%o2, 1, %g0
1:	ldub		[%o1 - 1], %o5
2:	ldub		[%o1 - 1], %o5
220:	add		%o1, %o2, %o1
4:	lduh		[%o1 - 2], %g2
236:	be,a,pn		%XCC, 2f
5:	RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5)
235:	andcc		%o2, 0x70, %g6
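
	/* %g6 now holds the remaining length in 16-byte units; it is
	 * folded into %o5 so that the jmpl below enters the
	 * RMOVE_LASTCHUNK ladder at the right depth, one chunk per
	 * outstanding 16 bytes.  */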
	jmpl		%o5 + %lo(280f - 279b), %g0
	 sub		%o0, %g6, %o0
279:	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5)
280:	be,pt		%XCC, 281f
282:	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	jmpl		%o5 + %lo(284f - 283b), %g0
	 sub		%o0, %g6, %o0
283:	RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
284:	be,pt		%XCC, 285f
232:	ldub		[%o1 - 1], %g5
weak_alias (memcpy, __align_cpy_1)
weak_alias (memcpy, __align_cpy_2)
weak_alias (memcpy, __align_cpy_4)
weak_alias (memcpy, __align_cpy_8)
weak_alias (memcpy, __align_cpy_16)
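
/* The __align_cpy_* functions are the Sparc ABI entry points for
   copies whose pointers are known to be N-byte aligned.  This memcpy
   handles any alignment, so they can all simply alias it.  */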
libc_hidden_builtin_def (memcpy)
libc_hidden_builtin_def (memmove)