arch/mips/lib/memcpy.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  *
  13  * Mnemonic names for arguments to memcpy/__copy_user
  14  */
  15
  16 /*
  17  * Hack to resolve longstanding prefetch issue
  18  *
  19  * Prefetching may be fatal on some systems if we're prefetching beyond the
  20  * end of memory on some systems.  It's also a seriously bad idea on non
  21  * dma-coherent systems.
  22  */
  23 #if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
  24 #undef CONFIG_CPU_HAS_PREFETCH
  25 #endif
  26 #ifdef CONFIG_MIPS_MALTA
  27 #undef CONFIG_CPU_HAS_PREFETCH
  28 #endif
  29
  30 #include <asm/asm.h>
  31 #include <asm/asm-offsets.h>
  32 #include <asm/regdef.h>
  33
  34 #define dst a0
  35 #define src a1
  36 #define len a2
  37
  38 /*
  39  * Spec
  40  *
  41  * memcpy copies len bytes from src to dst and sets v0 to dst.
  42  * It assumes that
  43  *   - src and dst don't overlap
  44  *   - src is readable
  45  *   - dst is writable
  46  * memcpy uses the standard calling convention
  47  *
  48  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  49  * the number of uncopied bytes due to an exception caused by a read or write.
  50  * __copy_user assumes that src and dst don't overlap, and that the call is
  51  * implementing one of the following:
  52  *   copy_to_user
  53  *     - src is readable  (no exceptions when reading src)
  54  *   copy_from_user
  55  *     - dst is writable  (no exceptions when writing dst)
  56  * __copy_user uses a non-standard calling convention; see
  57  * include/asm-mips/uaccess.h
  58  *
  59  * When an exception happens on a load, the handler must
  60  # ensure that all of the destination buffer is overwritten to prevent
  61  * leaking information to user mode programs.
  62  */
  63
  64 /*
  65  * Implementation
  66  */
  67
  68 /*
  69  * The exception handler for loads requires that:
  70  *  1- AT contain the address of the byte just past the end of the source
  71  *     of the copy,
  72  *  2- src_entry <= src < AT, and
  73  *  3- (dst - src) == (dst_entry - src_entry),
  74  * The _entry suffix denotes values when __copy_user was called.
  75  *
  76  * (1) is set up up by uaccess.h and maintained by not writing AT in copy_user
  77  * (2) is met by incrementing src by the number of bytes copied
  78  * (3) is met by not doing loads between a pair of increments of dst and src
  79  *
  80  * The exception handlers for stores adjust len (if necessary) and return.
  81  * These handlers do not need to overwrite any data.
  82  *
  83  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  84  * they're not protected.
  85  */
  86
  87 #define EXC(inst_reg,addr,handler)              \
  88 9:      inst_reg, addr;                         \
  89         .section __ex_table,"a";                \
  90         PTR     9b, handler;                    \
  91         .previous
  92
  93 /*
  94  * Only on the 64-bit kernel we can made use of 64-bit registers.
  95  */
  96 #ifdef CONFIG_64BIT
  97 #define USE_DOUBLE
  98 #endif
  99
 100 #ifdef USE_DOUBLE
 101
 102 #define LOAD   ld
 103 #define LOADL  ldl
 104 #define LOADR  ldr
 105 #define STOREL sdl
 106 #define STORER sdr
 107 #define STORE  sd
 108 #define ADD    daddu
 109 #define SUB    dsubu
 110 #define SRL    dsrl
 111 #define SRA    dsra
 112 #define SLL    dsll
 113 #define SLLV   dsllv
 114 #define SRLV   dsrlv
 115 #define NBYTES 8
 116 #define LOG_NBYTES 3
 117
 118 /*
 119  * As we are sharing code base with the mips32 tree (which use the o32 ABI
 120  * register definitions). We need to redefine the register definitions from
 121  * the n64 ABI register naming to the o32 ABI register naming.
 122  */
 123 #undef t0
 124 #undef t1
 125 #undef t2
 126 #undef t3
 127 #define t0      $8
 128 #define t1      $9
 129 #define t2      $10
 130 #define t3      $11
 131 #define t4      $12
 132 #define t5      $13
 133 #define t6      $14
 134 #define t7      $15
 135
 136 #else
 137
 138 #define LOAD   lw
 139 #define LOADL  lwl
 140 #define LOADR  lwr
 141 #define STOREL swl
 142 #define STORER swr
 143 #define STORE  sw
 144 #define ADD    addu
 145 #define SUB    subu
 146 #define SRL    srl
 147 #define SLL    sll
 148 #define SRA    sra
 149 #define SLLV   sllv
 150 #define SRLV   srlv
 151 #define NBYTES 4
 152 #define LOG_NBYTES 2
 153
 154 #endif /* USE_DOUBLE */
 155
 156 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 157 #define LDFIRST LOADR
 158 #define LDREST  LOADL
 159 #define STFIRST STORER
 160 #define STREST  STOREL
 161 #define SHIFT_DISCARD SLLV
 162 #else
 163 #define LDFIRST LOADL
 164 #define LDREST  LOADR
 165 #define STFIRST STOREL
 166 #define STREST  STORER
 167 #define SHIFT_DISCARD SRLV
 168 #endif
 169
 170 #define FIRST(unit) ((unit)*NBYTES)
 171 #define REST(unit)  (FIRST(unit)+NBYTES-1)
 172 #define UNIT(unit)  FIRST(unit)
 173
 174 #define ADDRMASK (NBYTES-1)
 175
 176         .text
 177         .set    noreorder
 178         .set    noat
 179
 180 /*
 181  * A combined memcpy/__copy_user
 182  * __copy_user sets len to 0 for success; else to an upper bound of
 183  * the number of uncopied bytes.
 184  * memcpy sets v0 to dst.
 185  */
 186         .align  5
 187 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 188         move    v0, dst                         /* return value */
 189 __memcpy:
 190 FEXPORT(__copy_user)
 191         /*
 192          * Note: dst & src may be unaligned, len may be 0
 193          * Temps
 194          */
 195 #define rem t8
 196
 197         /*
 198          * The "issue break"s below are very approximate.
 199          * Issue delays for dcache fills will perturb the schedule, as will
 200          * load queue full replay traps, etc.
 201          *
 202          * If len < NBYTES use byte operations.
 203          */
 204         PREF(   0, 0(src) )
 205         PREF(   1, 0(dst) )
 206         sltu    t2, len, NBYTES
 207         and     t1, dst, ADDRMASK
 208         PREF(   0, 1*32(src) )
 209         PREF(   1, 1*32(dst) )
 210         bnez    t2, copy_bytes_checklen
 211          and    t0, src, ADDRMASK
 212         PREF(   0, 2*32(src) )
 213         PREF(   1, 2*32(dst) )
 214         bnez    t1, dst_unaligned
 215          nop
 216         bnez    t0, src_unaligned_dst_aligned
 217         /*
 218          * use delay slot for fall-through
 219          * src and dst are aligned; need to compute rem
 220          */
 221 both_aligned:
 222          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 223         beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 224          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 225         PREF(   0, 3*32(src) )
 226         PREF(   1, 3*32(dst) )
 227         .align  4
 228 1:
 229 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 230 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 231 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 232 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 233         SUB     len, len, 8*NBYTES
 234 EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 235 EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
 236 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
 237 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
 238 EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
 239 EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
 240         ADD     src, src, 8*NBYTES
 241         ADD     dst, dst, 8*NBYTES
 242 EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
 243 EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
 244 EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
 245 EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
 246 EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
 247 EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
 248         PREF(   0, 8*32(src) )
 249         PREF(   1, 8*32(dst) )
 250         bne     len, rem, 1b
 251          nop
 252
 253         /*
 254          * len == rem == the number of bytes left to copy < 8*NBYTES
 255          */
 256 cleanup_both_aligned:
 257         beqz    len, done
 258          sltu   t0, len, 4*NBYTES
 259         bnez    t0, less_than_4units
 260          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 261         /*
 262          * len >= 4*NBYTES
 263          */
 264 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 265 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 266 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 267 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 268         SUB     len, len, 4*NBYTES
 269         ADD     src, src, 4*NBYTES
 270 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 271 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 272 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 273 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 274         beqz    len, done
 275          ADD    dst, dst, 4*NBYTES
 276 less_than_4units:
 277         /*
 278          * rem = len % NBYTES
 279          */
 280         beq     rem, len, copy_bytes
 281          nop
 282 1:
 283 EXC(    LOAD    t0, 0(src),             l_exc)
 284         ADD     src, src, NBYTES
 285         SUB     len, len, NBYTES
 286 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 287         bne     rem, len, 1b
 288          ADD    dst, dst, NBYTES
 289
 290         /*
 291          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 292          * A loop would do only a byte at a time with possible branch
 293          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 294          * because can't assume read-access to dst.  Instead, use
 295          * STREST dst, which doesn't require read access to dst.
 296          *
 297          * This code should perform better than a simple loop on modern,
 298          * wide-issue mips processors because the code has fewer branches and
 299          * more instruction-level parallelism.
 300          */
 301 #define bits t2
 302         beqz    len, done
 303          ADD    t1, dst, len    # t1 is just past last byte of dst
 304         li      bits, 8*NBYTES
 305         SLL     rem, len, 3     # rem = number of bits to keep
 306 EXC(    LOAD    t0, 0(src),             l_exc)
 307         SUB     bits, bits, rem # bits = number of bits to discard
 308         SHIFT_DISCARD t0, t0, bits
 309 EXC(    STREST  t0, -1(t1),             s_exc)
 310         jr      ra
 311          move   len, zero
 312 dst_unaligned:
 313         /*
 314          * dst is unaligned
 315          * t0 = src & ADDRMASK
 316          * t1 = dst & ADDRMASK; T1 > 0
 317          * len >= NBYTES
 318          *
 319          * Copy enough bytes to align dst
 320          * Set match = (src and dst have same alignment)
 321          */
 322 #define match rem
 323 EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 324         ADD     t2, zero, NBYTES
 325 EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
 326         SUB     t2, t2, t1      # t2 = number of bytes copied
 327         xor     match, t0, t1
 328 EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
 329         beq     len, t2, done
 330          SUB    len, len, t2
 331         ADD     dst, dst, t2
 332         beqz    match, both_aligned
 333          ADD    src, src, t2
 334
 335 src_unaligned_dst_aligned:
 336         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 337         PREF(   0, 3*32(src) )
 338         beqz    t0, cleanup_src_unaligned
 339          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 340         PREF(   1, 3*32(dst) )
 341 1:
 342 /*
 343  * Avoid consecutive LD*'s to the same register since some mips
 344  * implementations can't issue them in the same cycle.
 345  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 346  * are to the same unit (unless src is aligned, but it's not).
 347  */
 348 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 349 EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 350         SUB     len, len, 4*NBYTES
 351 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 352 EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 353 EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 354 EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 355 EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 356 EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 357         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 358         ADD     src, src, 4*NBYTES
 359 #ifdef CONFIG_CPU_SB1
 360         nop                             # improves slotting
 361 #endif
 362 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 363 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 364 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 365 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 366         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 367         bne     len, rem, 1b
 368          ADD    dst, dst, 4*NBYTES
 369
 370 cleanup_src_unaligned:
 371         beqz    len, done
 372          and    rem, len, NBYTES-1  # rem = len % NBYTES
 373         beq     rem, len, copy_bytes
 374          nop
 375 1:
 376 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 377 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 378         ADD     src, src, NBYTES
 379         SUB     len, len, NBYTES
 380 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 381         bne     len, rem, 1b
 382          ADD    dst, dst, NBYTES
 383
 384 copy_bytes_checklen:
 385         beqz    len, done
 386          nop
 387 copy_bytes:
 388         /* 0 < len < NBYTES  */
 389 #define COPY_BYTE(N)                    \
 390 EXC(    lb      t0, N(src), l_exc);     \
 391         SUB     len, len, 1;            \
 392         beqz    len, done;              \
 393 EXC(     sb     t0, N(dst), s_exc_p1)
 394
 395         COPY_BYTE(0)
 396         COPY_BYTE(1)
 397 #ifdef USE_DOUBLE
 398         COPY_BYTE(2)
 399         COPY_BYTE(3)
 400         COPY_BYTE(4)
 401         COPY_BYTE(5)
 402 #endif
 403 EXC(    lb      t0, NBYTES-2(src), l_exc)
 404         SUB     len, len, 1
 405         jr      ra
 406 EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
 407 done:
 408         jr      ra
 409          nop
 410         END(memcpy)
 411
 412 l_exc_copy:
 413         /*
 414          * Copy bytes from src until faulting load address (or until a
 415          * lb faults)
 416          *
 417          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 418          * may be more than a byte beyond the last address.
 419          * Hence, the lb below may get an exception.
 420          *
 421          * Assumes src < THREAD_BUADDR($28)
 422          */
 423         LOAD    t0, TI_TASK($28)
 424          nop
 425         LOAD    t0, THREAD_BUADDR(t0)
 426 1:
 427 EXC(    lb      t1, 0(src),     l_exc)
 428         ADD     src, src, 1
 429         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 430         bne     src, t0, 1b
 431          ADD    dst, dst, 1
 432 l_exc:
 433         LOAD    t0, TI_TASK($28)
 434          nop
 435         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 436          nop
 437         SUB     len, AT, t0             # len number of uncopied bytes
 438         /*
 439          * Here's where we rely on src and dst being incremented in tandem,
 440          *   See (3) above.
 441          * dst += (fault addr - src) to put dst at first byte to clear
 442          */
 443         ADD     dst, t0                 # compute start address in a1
 444         SUB     dst, src
 445         /*
 446          * Clear len bytes starting at dst.  Can't call __bzero because it
 447          * might modify len.  An inefficient loop for these rare times...
 448          */
 449         beqz    len, done
 450          SUB    src, len, 1
 451 1:      sb      zero, 0(dst)
 452         ADD     dst, dst, 1
 453         bnez    src, 1b
 454          SUB    src, src, 1
 455         jr      ra
 456          nop
 457
 458
 459 #define SEXC(n)                         \
 460 s_exc_p ## n ## u:                      \
 461         jr      ra;                     \
 462          ADD    len, len, n*NBYTES
 463
 464 SEXC(8)
 465 SEXC(7)
 466 SEXC(6)
 467 SEXC(5)
 468 SEXC(4)
 469 SEXC(3)
 470 SEXC(2)
 471 SEXC(1)
 472
 473 s_exc_p1:
 474         jr      ra
 475          ADD    len, len, 1
 476 s_exc:
 477         jr      ra
 478          nop
 479
 480         .align  5
 481 LEAF(memmove)
 482         ADD     t0, a0, a2
 483         ADD     t1, a1, a2
 484         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
 485         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
 486         and     t0, t1
 487         beqz    t0, __memcpy
 488          move   v0, a0                          /* return value */
 489         beqz    a2, r_out
 490         END(memmove)
 491
 492         /* fall through to __rmemcpy */
 493 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 494          sltu   t0, a1, a0
 495         beqz    t0, r_end_bytes_up              # src >= dst
 496          nop
 497         ADD     a0, a2                          # dst = dst + len
 498         ADD     a1, a2                          # src = src + len
 499
 500 r_end_bytes:
 501         lb      t0, -1(a1)
 502         SUB     a2, a2, 0x1
 503         sb      t0, -1(a0)
 504         SUB     a1, a1, 0x1
 505         bnez    a2, r_end_bytes
 506          SUB    a0, a0, 0x1
 507
 508 r_out:
 509         jr      ra
 510          move   a2, zero
 511
 512 r_end_bytes_up:
 513         lb      t0, (a1)
 514         SUB     a2, a2, 0x1
 515         sb      t0, (a0)
 516         ADD     a1, a1, 0x1
 517         bnez    a2, r_end_bytes_up
 518          ADD    a0, a0, 0x1
 519
 520         jr      ra
 521          move   a2, zero
 522         END(__rmemcpy)