/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as
 * we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif
DECLARE_PER_CPU(struct exception_data, exception_data);
#define preserve_branch(label)	do {					\
	volatile int dummy = 0;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)
#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)
#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")
#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")
#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")
#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")
#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
#define PA_MEMCPY_OK		0
#define PA_MEMCPY_LOAD_ERROR	1
#define PA_MEMCPY_STORE_ERROR	2
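
/*
 * Illustrative sketch (never compiled) of how the guarded load/store
 * wrappers above are used throughout this file: the last argument names
 * an asm label that the exception table entry points at, the handler
 * defines that label with an empty asm statement, and preserve_branch()
 * keeps the otherwise-unreachable handler from being optimized away.
 * Modeled on copy_dstaligned() and pa_memcpy_internal() below; the names
 * here (copy_one_word, cow_*_exc) are hypothetical.
 */
#if 0	/* example only */
static noinline unsigned long copy_one_word(unsigned long dst, unsigned long src)
{
	unsigned int t;

	ldw(s_space, 0, src, t, cow_ldw_exc);	/* may fault on the load */
	stw(d_space, t, 0, dst, cow_stw_exc);	/* may fault on the store */

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return PA_MEMCPY_OK;

handle_load_error:
	__asm__ __volatile__ ("cow_ldw_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("cow_stw_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
#endif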
/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 */
static noinline unsigned long copy_dstaligned(unsigned long dst,
					unsigned long src, unsigned long len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	   aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return PA_MEMCPY_OK;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -=-1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -=-2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op. */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return PA_MEMCPY_OK;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
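
/*
 * For readers of copy_dstaligned() above: a simplified word-at-a-time C
 * model of the same idea (illustrative sketch only, never compiled; no
 * unrolling, no exception handling).  src is rounded down to a word
 * boundary and every destination word is merged from the tail of one
 * source word and the head of the next.
 */
#if 0	/* example only */
static void copy_dstaligned_model(unsigned int *dst, unsigned long src,
					unsigned long len /* in words */)
{
	unsigned int sh_1 = 8 * (src % sizeof(unsigned int));
	unsigned int sh_2 = 8 * sizeof(unsigned int) - sh_1;
	unsigned int *s = (unsigned int *)(src & -sizeof(unsigned int));
	unsigned long i;

	for (i = 0; i < len; i++)
		dst[i] = MERGE(s[i], sh_1, s[i + 1], sh_2);
}
#endif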
/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
 * In case of an access fault the faulty address can be read from the per_cpu
 * exception data struct. */
static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
					unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return PA_MEMCPY_OK;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination. */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
/* Returns 0 for success, otherwise returns the number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	unsigned long ret, fault_addr, reference;
	struct exception_data *d;

	ret = pa_memcpy_internal(dstp, srcp, len);
	if (likely(ret == PA_MEMCPY_OK))
		return 0;

	/* if a load or store fault occurred we can get the faulty addr */
	d = this_cpu_ptr(&exception_data);
	fault_addr = d->fault_addr;

	/* error in load or store? */
	if (ret == PA_MEMCPY_LOAD_ERROR)
		reference = (unsigned long) srcp;
	else
		reference = (unsigned long) dstp;

	DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
		ret, len, fault_addr, reference);

	if (fault_addr >= reference)
		return len - (fault_addr - reference);
	else
		return len;
}
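
/*
 * Worked example for pa_memcpy() above (hypothetical numbers): copying
 * len = 100 bytes from srcp = 0x1000, a load fault at fault_addr = 0x1040
 * gives reference = 0x1000, so 0x40 = 64 bytes are assumed to have been
 * copied and 100 - 64 = 36 bytes are reported back as not transferred.
 */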
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}

void * memcpy(void * dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
long probe_kernel_read(void *dst, const void *src, size_t size)
{
	unsigned long addr = (unsigned long)src;

	if (addr < PAGE_SIZE)
		return -EFAULT;

	/* check for I/O space F_EXTEND(0xfff00000) access as well? */

	return __probe_kernel_read(dst, src, size);
}
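
/*
 * Usage sketch (illustrative only, never compiled): probe_kernel_read()
 * lets callers peek at a possibly invalid kernel address without an
 * Oops; it returns 0 on success and -EFAULT on a faulting access (or,
 * here, for any address in the never-mapped low page).  The helper name
 * below is hypothetical.
 */
#if 0	/* example only */
static int peek_kernel_word(const void *addr, unsigned long *val)
{
	return probe_kernel_read(val, addr, sizeof(*val)) ? -EFAULT : 0;
}
#endif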