arch/parisc/lib/memcpy.c

   1 /*
   2  *    Optimized memory copy routines.
   3  *
   4  *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
   5  *
   6  *    This program is free software; you can redistribute it and/or modify
   7  *    it under the terms of the GNU General Public License as published by
   8  *    the Free Software Foundation; either version 2, or (at your option)
   9  *    any later version.
  10  *
  11  *    This program is distributed in the hope that it will be useful,
  12  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  *    GNU General Public License for more details.
  15  *
  16  *    You should have received a copy of the GNU General Public License
  17  *    along with this program; if not, write to the Free Software
  18  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19  *
  20  *    Portions derived from the GNU C Library
  21  *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
  22  *
  23  * Several strategies are used to get the best performance under various
  24  * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
  25  * fp regs (currently compiled out with #if 0). This is followed by loops that
  26  * copy 32 or 16 bytes at a time using general registers.  Unaligned copies are
  27  * handled either by aligning the destination and then using a shift-and-write
  28  * method, or in a few cases by falling back to a byte-at-a-time copy.
  29  *
  30  * I chose to implement this in C because it is easier to maintain and debug,
  31  * and in my experiments it appears that the C code generated by gcc (3.3/3.4
  32  * at the time of writing) is fairly optimal. Unfortunately some of the
  33  * semantics of the copy routine (exception handling) are difficult to express
  34  * in C, so we have to play some tricks to get it to work.
  35  *
  36  * All the loads and stores are done via explicit asm() code in order to use
  37  * the right space registers.
  38  *
  39  * Testing with various alignments and buffer sizes shows that this code is
  40  * often >10x faster than a simple byte-at-a-time copy, even for strangely
  41  * aligned operands. It is interesting to note that the glibc version
  42  * of memcpy (written in C) is actually quite fast already. This routine is
  43  * able to beat it by 30-40% for aligned copies because of the loop unrolling,
  44  * but in some cases the glibc version is still slightly faster. This lends
  45  * more credibility that gcc can generate very good code as long as we are
  46  * careful.
  47  *
  48  * TODO:
  49  * - cache prefetching needs more experimentation to get optimal settings
  50  * - try not to use the post-increment address modifiers; they create additional
  51  *   interlocks
  52  * - replace byte-copy loops with stybs sequences
  53  */
  54
/*
 * Build configuration.  s_space and d_space are the space-register names
 * that get pasted into the loads (source side) and stores (destination
 * side) emitted by the access macros in this file.
 */
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
/* Kernel build: loads go through %sr1 and stores through %sr2. */
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
/* Non-kernel build: both sides use %sr0, and the copy routine is built
 * under the name new2_copy. */
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif
  67
/*
 * Per-CPU exception record.  The fault paths in this file read its
 * fault_addr field to work out how many bytes were left uncopied.
 * NOTE(review): presumably filled in by the trap handler before it jumps
 * to the fixup label -- confirm against the fault-handling code.
 */
DECLARE_PER_CPU(struct exception_data, exception_data);
  69
  70 #define preserve_branch(label)  do {                                    \
  71         volatile int dummy;                                             \
  72         /* The following branch is never taken, it's just here to  */   \
  73         /* prevent gcc from optimizing away our exception code. */      \
  74         if (unlikely(dummy != dummy))                                   \
  75                 goto label;                                             \
  76 } while (0)
  77
  78 #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
  79 #define get_kernel_space() (0)
  80
/*
 * MERGE(w0, sh_1, w1, sh_2) - build one output word from two source words.
 *
 * Expands to a statement expression of type unsigned int.  It moves sh_2
 * into the shift-amount register (mtsar) and then issues one shrpw on the
 * w0/w1 pair using %sar as the shift count.
 *
 * sh_1 is accepted but not referenced by the expansion; presumably it is
 * kept so call sites match the glibc-derived MERGE(w0, sh_1, w1, sh_2) form.
 * NOTE(review): shrpw presumably extracts 32 bits from the 64-bit pair
 * w0:w1 shifted right by %sar -- confirm against the PA-RISC manual.
 */
#define MERGE(w0, sh_1, w1, sh_2)  ({                                   \
        unsigned int _r;                                                \
        asm volatile (                                                  \
        "mtsar %3\n"                                                    \
        "shrpw %1, %2, %%sar, %0\n"                                     \
        : "=r"(_r)                                                      \
        : "r"(w0), "r"(w1), "r"(sh_2)                                   \
        );                                                              \
        _r;                                                             \
})
/* Copies shorter than this many bytes skip all alignment handling and go
 * straight to the byte-at-a-time loop in pa_memcpy(). */
#define THRESHOLD       16
  92
  93 #ifdef DEBUG_MEMCPY
  94 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
  95 #else
  96 #define DPRINTF(fmt, args...)
  97 #endif
  98
/*
 * Auto-incrementing ("ai") access primitives.
 *
 * def_load_ai_insn and def_store_ai_insn each emit a single instruction
 * with the ",ma" completer: the access uses address _a in space register _s,
 * and _a is an in/out operand ("+r"), i.e. it is updated by the instruction
 * (the post-increment form mentioned in the file header).
 *
 *   _insn  instruction mnemonic (stringified into the asm text)
 *   _sz    displacement written into the instruction
 *   _tt    asm constraint string for the data operand _t
 *   _s     space register name (s_space or d_space)
 *   _a     address lvalue, modified by the instruction
 *   _t     data operand: output for loads, input for stores
 *   _e     label recorded in the exception table for this instruction
 *          (local label 1)
 *
 * "r8" is declared clobbered.  NOTE(review): presumably because the fault
 * fixup path uses it -- confirm against the exception handling code.
 */
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)     \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t), "+r"(_a)                             \
        :                                               \
        : "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)    \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : "+r"(_a)                                      \
        : _tt(_t)                                       \
        : "r8")

/*
 * Byte, word and double-word (FP register, "f" constraint) flavours.
 * Mind the argument order: loads take (space, addr, reg, fixup) while
 * stores take (space, reg, addr, fixup).
 */
#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
 121
/*
 * Fixed-offset access primitives.
 *
 * def_load_insn and def_store_insn emit one instruction that accesses
 * offset _o from address _a in space register _s.  Unlike the ",ma"
 * variants, _a is a plain input ("r") and is left unchanged.
 *
 *   _insn  instruction mnemonic (stringified into the asm text)
 *   _tt    asm constraint string for the data operand _t
 *   _s     space register name (s_space or d_space)
 *   _o     constant displacement (stringified into the asm text)
 *   _a     base address
 *   _t     data operand: output for loads, input for stores
 *   _e     label recorded in the exception table for this instruction
 *          (local label 1)
 *
 * "r8" is declared clobbered, as in the auto-increment variants.
 */
#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)         \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t)                                       \
        : "r"(_a)                                       \
        : "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)        \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        :                                               \
        : _tt(_t), "r"(_a)                              \
        : "r8")

/*
 * Word load/store at a constant offset.  Argument order differs between
 * the two: ldw(space, offset, addr, reg, fixup) versus
 * stw(space, reg, offset, addr, fixup).
 */
#define ldw(_s,_o,_a,_t,_e)     def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)     def_store_insn(stw,"r",_s,_t,_o,_a,_e)
 140
/*
 * Prefetch hooks.  With CONFIG_PREFETCH each one issues a single load from
 * addr whose target register is %r0: a word load through s_space for the
 * source side, a double-word load through d_space for the destination side.
 * Without CONFIG_PREFETCH both expand to empty statements.
 *
 * NOTE(review): presumably a load targeting %r0 acts as a cache prefetch
 * hint on this architecture -- confirm against the PA-RISC manual.
 * Every call site in this file is currently commented out or under #if 0.
 */
#ifdef  CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
        __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
        __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
 155
 156 /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 157  * per loop.  This code is derived from glibc.
 158  */
 159 static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len)
 160 {
 161         /* gcc complains that a2 and a3 may be uninitialized, but actually
 162          * they cannot be.  Initialize a2/a3 to shut gcc up.
 163          */
 164         register unsigned int a0, a1, a2 = 0, a3 = 0;
 165         int sh_1, sh_2;
 166         struct exception_data *d;
 167
 168         /* prefetch_src((const void *)src); */
 169
 170         /* Calculate how to shift a word read at the memory operation
 171            aligned srcp to make it aligned for copy.  */
 172         sh_1 = 8 * (src % sizeof(unsigned int));
 173         sh_2 = 8 * sizeof(unsigned int) - sh_1;
 174
 175         /* Make src aligned by rounding it down.  */
 176         src &= -sizeof(unsigned int);
 177
 178         switch (len % 4)
 179         {
 180                 case 2:
 181                         /* a1 = ((unsigned int *) src)[0];
 182                            a2 = ((unsigned int *) src)[1]; */
 183                         ldw(s_space, 0, src, a1, cda_ldw_exc);
 184                         ldw(s_space, 4, src, a2, cda_ldw_exc);
 185                         src -= 1 * sizeof(unsigned int);
 186                         dst -= 3 * sizeof(unsigned int);
 187                         len += 2;
 188                         goto do1;
 189                 case 3:
 190                         /* a0 = ((unsigned int *) src)[0];
 191                            a1 = ((unsigned int *) src)[1]; */
 192                         ldw(s_space, 0, src, a0, cda_ldw_exc);
 193                         ldw(s_space, 4, src, a1, cda_ldw_exc);
 194                         src -= 0 * sizeof(unsigned int);
 195                         dst -= 2 * sizeof(unsigned int);
 196                         len += 1;
 197                         goto do2;
 198                 case 0:
 199                         if (len == 0)
 200                                 return 0;
 201                         /* a3 = ((unsigned int *) src)[0];
 202                            a0 = ((unsigned int *) src)[1]; */
 203                         ldw(s_space, 0, src, a3, cda_ldw_exc);
 204                         ldw(s_space, 4, src, a0, cda_ldw_exc);
 205                         src -=-1 * sizeof(unsigned int);
 206                         dst -= 1 * sizeof(unsigned int);
 207                         len += 0;
 208                         goto do3;
 209                 case 1:
 210                         /* a2 = ((unsigned int *) src)[0];
 211                            a3 = ((unsigned int *) src)[1]; */
 212                         ldw(s_space, 0, src, a2, cda_ldw_exc);
 213                         ldw(s_space, 4, src, a3, cda_ldw_exc);
 214                         src -=-2 * sizeof(unsigned int);
 215                         dst -= 0 * sizeof(unsigned int);
 216                         len -= 1;
 217                         if (len == 0)
 218                                 goto do0;
 219                         goto do4;                       /* No-op.  */
 220         }
 221
 222         do
 223         {
 224                 /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
 225 do4:
 226                 /* a0 = ((unsigned int *) src)[0]; */
 227                 ldw(s_space, 0, src, a0, cda_ldw_exc);
 228                 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
 229                 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
 230 do3:
 231                 /* a1 = ((unsigned int *) src)[1]; */
 232                 ldw(s_space, 4, src, a1, cda_ldw_exc);
 233                 /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
 234                 stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
 235 do2:
 236                 /* a2 = ((unsigned int *) src)[2]; */
 237                 ldw(s_space, 8, src, a2, cda_ldw_exc);
 238                 /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
 239                 stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
 240 do1:
 241                 /* a3 = ((unsigned int *) src)[3]; */
 242                 ldw(s_space, 12, src, a3, cda_ldw_exc);
 243                 /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
 244                 stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
 245
 246                 src += 4 * sizeof(unsigned int);
 247                 dst += 4 * sizeof(unsigned int);
 248                 len -= 4;
 249         }
 250         while (len != 0);
 251
 252 do0:
 253         /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
 254         stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
 255
 256         preserve_branch(handle_load_error);
 257         preserve_branch(handle_store_error);
 258
 259         return 0;
 260
 261 handle_load_error:
 262         __asm__ __volatile__ ("cda_ldw_exc:\n");
 263         d = &__get_cpu_var(exception_data);
 264         DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
 265                 o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
 266         return o_len * 4 - d->fault_addr + o_src;
 267
 268 handle_store_error:
 269         __asm__ __volatile__ ("cda_stw_exc:\n");
 270         d = &__get_cpu_var(exception_data);
 271         DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
 272                 o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
 273         return o_len * 4 - d->fault_addr + o_dst;
 274 }
 275
 276
 277 /* Returns 0 for success, otherwise, returns number of bytes not transferred. */
 278 unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 279 {
 280         register unsigned long src, dst, t1, t2, t3;
 281         register unsigned char *pcs, *pcd;
 282         register unsigned int *pws, *pwd;
 283         register double *pds, *pdd;
 284         unsigned long ret = 0;
 285         unsigned long o_dst, o_src, o_len;
 286         struct exception_data *d;
 287
 288         src = (unsigned long)srcp;
 289         dst = (unsigned long)dstp;
 290         pcs = (unsigned char *)srcp;
 291         pcd = (unsigned char *)dstp;
 292
 293         o_dst = dst; o_src = src; o_len = len;
 294
 295         /* prefetch_src((const void *)srcp); */
 296
 297         if (len < THRESHOLD)
 298                 goto byte_copy;
 299
 300         /* Check alignment */
 301         t1 = (src ^ dst);
 302         if (unlikely(t1 & (sizeof(double)-1)))
 303                 goto unaligned_copy;
 304
 305         /* src and dst have same alignment. */
 306
 307         /* Copy bytes till we are double-aligned. */
 308         t2 = src & (sizeof(double) - 1);
 309         if (unlikely(t2 != 0)) {
 310                 t2 = sizeof(double) - t2;
 311                 while (t2 && len) {
 312                         /* *pcd++ = *pcs++; */
 313                         ldbma(s_space, pcs, t3, pmc_load_exc);
 314                         len--;
 315                         stbma(d_space, t3, pcd, pmc_store_exc);
 316                         t2--;
 317                 }
 318         }
 319
 320         pds = (double *)pcs;
 321         pdd = (double *)pcd;
 322
 323 #if 0
 324         /* Copy 8 doubles at a time */
 325         while (len >= 8*sizeof(double)) {
 326                 register double r1, r2, r3, r4, r5, r6, r7, r8;
 327                 /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
 328                 flddma(s_space, pds, r1, pmc_load_exc);
 329                 flddma(s_space, pds, r2, pmc_load_exc);
 330                 flddma(s_space, pds, r3, pmc_load_exc);
 331                 flddma(s_space, pds, r4, pmc_load_exc);
 332                 fstdma(d_space, r1, pdd, pmc_store_exc);
 333                 fstdma(d_space, r2, pdd, pmc_store_exc);
 334                 fstdma(d_space, r3, pdd, pmc_store_exc);
 335                 fstdma(d_space, r4, pdd, pmc_store_exc);
 336
 337 #if 0
 338                 if (L1_CACHE_BYTES <= 32)
 339                         prefetch_src((char *)pds + L1_CACHE_BYTES);
 340 #endif
 341                 flddma(s_space, pds, r5, pmc_load_exc);
 342                 flddma(s_space, pds, r6, pmc_load_exc);
 343                 flddma(s_space, pds, r7, pmc_load_exc);
 344                 flddma(s_space, pds, r8, pmc_load_exc);
 345                 fstdma(d_space, r5, pdd, pmc_store_exc);
 346                 fstdma(d_space, r6, pdd, pmc_store_exc);
 347                 fstdma(d_space, r7, pdd, pmc_store_exc);
 348                 fstdma(d_space, r8, pdd, pmc_store_exc);
 349                 len -= 8*sizeof(double);
 350         }
 351 #endif
 352
 353         pws = (unsigned int *)pds;
 354         pwd = (unsigned int *)pdd;
 355
 356 word_copy:
 357         while (len >= 8*sizeof(unsigned int)) {
 358                 register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
 359                 /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
 360                 ldwma(s_space, pws, r1, pmc_load_exc);
 361                 ldwma(s_space, pws, r2, pmc_load_exc);
 362                 ldwma(s_space, pws, r3, pmc_load_exc);
 363                 ldwma(s_space, pws, r4, pmc_load_exc);
 364                 stwma(d_space, r1, pwd, pmc_store_exc);
 365                 stwma(d_space, r2, pwd, pmc_store_exc);
 366                 stwma(d_space, r3, pwd, pmc_store_exc);
 367                 stwma(d_space, r4, pwd, pmc_store_exc);
 368
 369                 ldwma(s_space, pws, r5, pmc_load_exc);
 370                 ldwma(s_space, pws, r6, pmc_load_exc);
 371                 ldwma(s_space, pws, r7, pmc_load_exc);
 372                 ldwma(s_space, pws, r8, pmc_load_exc);
 373                 stwma(d_space, r5, pwd, pmc_store_exc);
 374                 stwma(d_space, r6, pwd, pmc_store_exc);
 375                 stwma(d_space, r7, pwd, pmc_store_exc);
 376                 stwma(d_space, r8, pwd, pmc_store_exc);
 377                 len -= 8*sizeof(unsigned int);
 378         }
 379
 380         while (len >= 4*sizeof(unsigned int)) {
 381                 register unsigned int r1,r2,r3,r4;
 382                 ldwma(s_space, pws, r1, pmc_load_exc);
 383                 ldwma(s_space, pws, r2, pmc_load_exc);
 384                 ldwma(s_space, pws, r3, pmc_load_exc);
 385                 ldwma(s_space, pws, r4, pmc_load_exc);
 386                 stwma(d_space, r1, pwd, pmc_store_exc);
 387                 stwma(d_space, r2, pwd, pmc_store_exc);
 388                 stwma(d_space, r3, pwd, pmc_store_exc);
 389                 stwma(d_space, r4, pwd, pmc_store_exc);
 390                 len -= 4*sizeof(unsigned int);
 391         }
 392
 393         pcs = (unsigned char *)pws;
 394         pcd = (unsigned char *)pwd;
 395
 396 byte_copy:
 397         while (len) {
 398                 /* *pcd++ = *pcs++; */
 399                 ldbma(s_space, pcs, t3, pmc_load_exc);
 400                 stbma(d_space, t3, pcd, pmc_store_exc);
 401                 len--;
 402         }
 403
 404         return 0;
 405
 406 unaligned_copy:
 407         /* possibly we are aligned on a word, but not on a double... */
 408         if (likely(t1 & (sizeof(unsigned int)-1)) == 0) {
 409                 t2 = src & (sizeof(unsigned int) - 1);
 410
 411                 if (unlikely(t2 != 0)) {
 412                         t2 = sizeof(unsigned int) - t2;
 413                         while (t2) {
 414                                 /* *pcd++ = *pcs++; */
 415                                 ldbma(s_space, pcs, t3, pmc_load_exc);
 416                                 stbma(d_space, t3, pcd, pmc_store_exc);
 417                                 len--;
 418                                 t2--;
 419                         }
 420                 }
 421
 422                 pws = (unsigned int *)pcs;
 423                 pwd = (unsigned int *)pcd;
 424                 goto word_copy;
 425         }
 426
 427         /* Align the destination.  */
 428         if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
 429                 t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
 430                 while (t2) {
 431                         /* *pcd++ = *pcs++; */
 432                         ldbma(s_space, pcs, t3, pmc_load_exc);
 433                         stbma(d_space, t3, pcd, pmc_store_exc);
 434                         len--;
 435                         t2--;
 436                 }
 437                 dst = (unsigned long)pcd;
 438                 src = (unsigned long)pcs;
 439         }
 440
 441         ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
 442                 o_dst, o_src, o_len);
 443         if (ret)
 444                 return ret;
 445
 446         pcs += (len & -sizeof(unsigned int));
 447         pcd += (len & -sizeof(unsigned int));
 448         len %= sizeof(unsigned int);
 449
 450         preserve_branch(handle_load_error);
 451         preserve_branch(handle_store_error);
 452
 453         goto byte_copy;
 454
 455 handle_load_error:
 456         __asm__ __volatile__ ("pmc_load_exc:\n");
 457         d = &__get_cpu_var(exception_data);
 458         DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
 459                 o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
 460         return o_len - d->fault_addr + o_src;
 461
 462 handle_store_error:
 463         __asm__ __volatile__ ("pmc_store_exc:\n");
 464         d = &__get_cpu_var(exception_data);
 465         DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
 466                 o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
 467         return o_len - d->fault_addr + o_dst;
 468 }
 469
 470 #ifdef __KERNEL__
 471 unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
 472 {
 473         mtsp(get_kernel_space(), 1);
 474         mtsp(get_user_space(), 2);
 475         return pa_memcpy((void __force *)dst, src, len);
 476 }
 477
 478 unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
 479 {
 480         mtsp(get_user_space(), 1);
 481         mtsp(get_kernel_space(), 2);
 482         return pa_memcpy(dst, (void __force *)src, len);
 483 }
 484
 485 unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
 486 {
 487         mtsp(get_user_space(), 1);
 488         mtsp(get_user_space(), 2);
 489         return pa_memcpy((void __force *)dst, (void __force *)src, len);
 490 }
 491
 492
 493 void * memcpy(void * dst,const void *src, size_t count)
 494 {
 495         mtsp(get_kernel_space(), 1);
 496         mtsp(get_kernel_space(), 2);
 497         pa_memcpy(dst, src, count);
 498         return dst;
 499 }
 500
/* Export the four kernel-build entry points defined above. */
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
 505 #endif