arch/mips/lib/memcpy.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  * Copyright (C) 2007  Maciej W. Rozycki
  13  *
  14  * Mnemonic names for arguments to memcpy/__copy_user
  15  */
  16
  17 /*
  18  * Hack to resolve longstanding prefetch issue
  19  *
  20  * Prefetching may be fatal on some systems if we prefetch beyond the end
  21  * of memory.  It is also a seriously bad idea on systems that are not
  22  * DMA-coherent.
  23  */
  24 #ifdef CONFIG_DMA_NONCOHERENT
  25 #undef CONFIG_CPU_HAS_PREFETCH
  26 #endif
  27 #ifdef CONFIG_MIPS_MALTA
  28 #undef CONFIG_CPU_HAS_PREFETCH
  29 #endif
  30
  31 #include <asm/asm.h>
  32 #include <asm/asm-offsets.h>
  33 #include <asm/regdef.h>
  34
  35 #define dst a0
  36 #define src a1
  37 #define len a2
  38
  39 /*
  40  * Spec
  41  *
  42  * memcpy copies len bytes from src to dst and sets v0 to dst.
  43  * It assumes that
  44  *   - src and dst don't overlap
  45  *   - src is readable
  46  *   - dst is writable
  47  * memcpy uses the standard calling convention
  48  *
  49  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  50  * the number of uncopied bytes due to an exception caused by a read or write.
  51  * __copy_user assumes that src and dst don't overlap, and that the call is
  52  * implementing one of the following:
  53  *   copy_to_user
  54  *     - src is readable  (no exceptions when reading src)
  55  *   copy_from_user
  56  *     - dst is writable  (no exceptions when writing dst)
  57  * __copy_user uses a non-standard calling convention; see
  58  * include/asm-mips/uaccess.h
  59  *
  60  * When an exception happens on a load, the handler must
  61  * ensure that all of the destination buffer is overwritten to prevent
  62  * leaking information to user mode programs.
  63  */
  64
  65 /*
  66  * Implementation
  67  */
  68
  69 /*
  70  * The exception handler for loads requires that:
  71  *  1- AT contain the address of the byte just past the end of the source
  72  *     of the copy,
  73  *  2- src_entry <= src < AT, and
  74  *  3- (dst - src) == (dst_entry - src_entry),
  75  * The _entry suffix denotes values when __copy_user was called.
  76  *
  77  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  78  * (2) is met by incrementing src by the number of bytes copied
  79  * (3) is met by not doing loads between a pair of increments of dst and src
  80  *
  81  * The exception handlers for stores adjust len (if necessary) and return.
  82  * These handlers do not need to overwrite any data.
  83  *
  84  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  85  * they're not protected.
  86  */
  87
  88 #define EXC(inst_reg,addr,handler)              \
  89 9:      inst_reg, addr; /* guarded access */    \
  90         .section __ex_table,"a";                \
  91         PTR     9b, handler; /* (insn, fixup) */ \
  92         .previous               /* resume the section that was active before */
  93
  94 /*
  95  * Only on the 64-bit kernel can we make use of 64-bit registers.
  96  */
  97 #ifdef CONFIG_64BIT
  98 #define USE_DOUBLE
  99 #endif
 100
 101 #ifdef USE_DOUBLE
 102
 103 #define LOAD   ld              /* load one whole, aligned unit */
 104 #define LOADL  ldl             /* left/right partial loads of a unit */
 105 #define LOADR  ldr
 106 #define STOREL sdl             /* left/right partial stores of a unit */
 107 #define STORER sdr
 108 #define STORE  sd              /* store one whole, aligned unit */
 109 #define ADD    daddu
 110 #define SUB    dsubu
 111 #define SRL    dsrl
 112 #define SRA    dsra
 113 #define SLL    dsll
 114 #define SLLV   dsllv
 115 #define SRLV   dsrlv
 116 #define NBYTES 8               /* bytes per unit */
 117 #define LOG_NBYTES 3           /* log2(NBYTES) */
 118
 119 /*
 120  * As we share a code base with the mips32 tree (which uses the o32 ABI
 121  * register definitions), we need to redefine the register definitions from
 122  * the n64 ABI register naming to the o32 ABI register naming.
 123  */
 124 #undef t0
 125 #undef t1
 126 #undef t2
 127 #undef t3
 128 #define t0      $8
 129 #define t1      $9
 130 #define t2      $10
 131 #define t3      $11
 132 #define t4      $12
 133 #define t5      $13
 134 #define t6      $14
 135 #define t7      $15
 136
 137 #else
 138
 139 #define LOAD   lw              /* load one whole, aligned unit */
 140 #define LOADL  lwl             /* left/right partial loads of a unit */
 141 #define LOADR  lwr
 142 #define STOREL swl             /* left/right partial stores of a unit */
 143 #define STORER swr
 144 #define STORE  sw              /* store one whole, aligned unit */
 145 #define ADD    addu
 146 #define SUB    subu
 147 #define SRL    srl
 148 #define SLL    sll
 149 #define SRA    sra
 150 #define SLLV   sllv
 151 #define SRLV   srlv
 152 #define NBYTES 4               /* bytes per unit */
 153 #define LOG_NBYTES 2           /* log2(NBYTES) */
 154
 155 #endif /* USE_DOUBLE */
 156
 157 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 158 #define LDFIRST LOADR          /* partial load issued at offset FIRST(unit) */
 159 #define LDREST  LOADL          /* partial load issued at offset REST(unit) */
 160 #define STFIRST STORER         /* partial store issued at offset FIRST(unit) */
 161 #define STREST  STOREL         /* partial store ending at the given byte */
 162 #define SHIFT_DISCARD SLLV     /* shift used to drop unwanted bytes before STREST */
 163 #else
 164 #define LDFIRST LOADL          /* partial load issued at offset FIRST(unit) */
 165 #define LDREST  LOADR          /* partial load issued at offset REST(unit) */
 166 #define STFIRST STOREL         /* partial store issued at offset FIRST(unit) */
 167 #define STREST  STORER         /* partial store ending at the given byte */
 168 #define SHIFT_DISCARD SRLV     /* shift used to drop unwanted bytes before STREST */
 169 #endif
 170
 171 #define FIRST(unit) ((unit)*NBYTES)        /* offset of the first byte of a unit */
 172 #define REST(unit)  (FIRST(unit)+NBYTES-1) /* offset of the last byte of a unit */
 173 #define UNIT(unit)  FIRST(unit)            /* offset of a whole unit */
 174
 175 #define ADDRMASK (NBYTES-1)                /* address bits below unit alignment */
 176
 177         .text
 178         .set    noreorder
 179 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 180         .set    noat
 181 #else
 182         .set    at=v1
 183 #endif
 184
 185 /*
 186  * t6 is used as a flag to note inatomic mode: this entry point sets it to 1
 187  * and joins the common body; .Ll_exc then skips zeroing the destination. */
 188 LEAF(__copy_user_inatomic)
 189         b       __copy_user_common
 190          li     t6, 1                   # delay slot: flag inatomic mode
 191         END(__copy_user_inatomic)
 192
 193 /*
 194  * A combined memcpy/__copy_user
 195  * __copy_user sets len to 0 for success; else to an upper bound of
 196  * the number of uncopied bytes.
 197  * memcpy sets v0 to dst.
 198  */
 199         .align  5
 200 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 201         move    v0, dst                         /* return value */
 202 .L__memcpy:                     /* memmove enters here when the buffers do not overlap */
 203 FEXPORT(__copy_user)
 204         li      t6, 0   /* not inatomic */
 205 __copy_user_common:             /* __copy_user_inatomic joins here with t6 = 1 */
 206         /*
 207          * Note: dst & src may be unaligned, len may be 0
 208          * Temps
 209          */
 210 #define rem t8                  /* bytes the current loop must leave uncopied */
 211
 212         R10KCBARRIER(0(ra))
 213         /*
 214          * The "issue break"s below are very approximate.
 215          * Issue delays for dcache fills will perturb the schedule, as will
 216          * load queue full replay traps, etc.
 217          *
 218          * If len < NBYTES use byte operations.
 219          */
 220         PREF(   0, 0(src) )
 221         PREF(   1, 0(dst) )
 222         sltu    t2, len, NBYTES                 # t2 = (len < NBYTES)
 223         and     t1, dst, ADDRMASK               # t1 = misalignment of dst
 224         PREF(   0, 1*32(src) )
 225         PREF(   1, 1*32(dst) )
 226         bnez    t2, .Lcopy_bytes_checklen
 227          and    t0, src, ADDRMASK               # delay slot: t0 = misalignment of src
 228         PREF(   0, 2*32(src) )
 229         PREF(   1, 2*32(dst) )
 230         bnez    t1, .Ldst_unaligned
 231          nop
 232         bnez    t0, .Lsrc_unaligned_dst_aligned # delay slot is the SRL at .Lboth_aligned
 233         /*
 234          * use delay slot for fall-through
 235          * src and dst are aligned; need to compute rem
 236          */
 237 .Lboth_aligned:                 /* src and dst are both unit-aligned from here on */
 238          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 239         beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
 240          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 241         PREF(   0, 3*32(src) )
 242         PREF(   1, 3*32(dst) )
 243         .align  4
 244 1:                              /* copy 8 units per iteration until len == rem */
 245         R10KCBARRIER(0(ra))
 246 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 247 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 248 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 249 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 250         SUB     len, len, 8*NBYTES      # store handlers add back what was not stored
 251 EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
 252 EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
 253 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p8u)
 254 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p7u)
 255 EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
 256 EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
 257         ADD     src, src, 8*NBYTES      # src and dst advance together, no load in between
 258         ADD     dst, dst, 8*NBYTES      # remaining stores use negative offsets from here
 259 EXC(    STORE   t2, UNIT(-6)(dst),      .Ls_exc_p6u)
 260 EXC(    STORE   t3, UNIT(-5)(dst),      .Ls_exc_p5u)
 261 EXC(    STORE   t4, UNIT(-4)(dst),      .Ls_exc_p4u)
 262 EXC(    STORE   t7, UNIT(-3)(dst),      .Ls_exc_p3u)
 263 EXC(    STORE   t0, UNIT(-2)(dst),      .Ls_exc_p2u)
 264 EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
 265         PREF(   0, 8*32(src) )
 266         PREF(   1, 8*32(dst) )
 267         bne     len, rem, 1b
 268          nop
 269
 270         /*
 271          * len == rem == the number of bytes left to copy < 8*NBYTES
 272          */
 273 .Lcleanup_both_aligned:
 274         beqz    len, .Ldone
 275          sltu   t0, len, 4*NBYTES       # delay slot: t0 = (len < 4*NBYTES)
 276         bnez    t0, .Lless_than_4units
 277          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 278         /*
 279          * len >= 4*NBYTES: copy one group of 4 units
 280          */
 281 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 282 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 283 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 284 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 285         SUB     len, len, 4*NBYTES
 286         ADD     src, src, 4*NBYTES
 287         R10KCBARRIER(0(ra))
 288 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
 289 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
 290 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
 291 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
 292         .set    reorder                         /* DADDI_WAR */
 293         ADD     dst, dst, 4*NBYTES
 294         beqz    len, .Ldone             # delay slot is left to the assembler here
 295         .set    noreorder
 296 .Lless_than_4units:
 297         /*
 298          * rem = len % NBYTES
 299          */
 300         beq     rem, len, .Lcopy_bytes  # len < NBYTES: no whole unit left
 301          nop
 302 1:                              /* copy one unit per iteration until len == rem */
 303         R10KCBARRIER(0(ra))
 304 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 305         ADD     src, src, NBYTES
 306         SUB     len, len, NBYTES
 307 EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
 308         .set    reorder                         /* DADDI_WAR */
 309         ADD     dst, dst, NBYTES
 310         bne     rem, len, 1b
 311         .set    noreorder
 312
 313         /*
 314          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 315          * A loop would do only a byte at a time with possible branch
 316          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 317          * because can't assume read-access to dst.  Instead, use
 318          * STREST dst, which doesn't require read access to dst.
 319          *
 320          * This code should perform better than a simple loop on modern,
 321          * wide-issue mips processors because the code has fewer branches and
 322          * more instruction-level parallelism.
 323          */
 324 #define bits t2
 325         beqz    len, .Ldone
 326          ADD    t1, dst, len    # t1 is just past last byte of dst
 327         li      bits, 8*NBYTES
 328         SLL     rem, len, 3     # rem = number of bits to keep
 329 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 330         SUB     bits, bits, rem # bits = number of bits to discard
 331         SHIFT_DISCARD t0, t0, bits      # drop the bytes that lie beyond len
 332 EXC(    STREST  t0, -1(t1),             .Ls_exc)
 333         jr      ra
 334          move   len, zero               # delay slot: nothing left uncopied
 335 .Ldst_unaligned:
 336         /*
 337          * dst is unaligned
 338          * t0 = src & ADDRMASK
 339          * t1 = dst & ADDRMASK; t1 > 0
 340          * len >= NBYTES
 341          *
 342          * Copy enough bytes to align dst
 343          * Set match = 0 iff src and dst have the same alignment
 344          */
 345 #define match rem
 346 EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
 347         ADD     t2, zero, NBYTES
 348 EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
 349         SUB     t2, t2, t1      # t2 = number of bytes copied
 350         xor     match, t0, t1   # zero when both misalignments are equal
 351         R10KCBARRIER(0(ra))
 352 EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
 353         beq     len, t2, .Ldone
 354          SUB    len, len, t2    # delay slot: executed on both paths
 355         ADD     dst, dst, t2    # dst is unit-aligned from here on
 356         beqz    match, .Lboth_aligned
 357          ADD    src, src, t2    # delay slot: executed on both paths
 358
 359 .Lsrc_unaligned_dst_aligned:
 360         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 361         PREF(   0, 3*32(src) )
 362         beqz    t0, .Lcleanup_src_unaligned
 363          and    rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
 364         PREF(   1, 3*32(dst) )
 365 1:                              /* copy 4 units per iteration until len == rem */
 366 /*
 367  * Avoid consecutive LD*'s to the same register since some mips
 368  * implementations can't issue them in the same cycle.
 369  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 370  * are to the same unit (unless src is aligned, but it's not).
 371  */
 372         R10KCBARRIER(0(ra))
 373 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 374 EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
 375         SUB     len, len, 4*NBYTES
 376 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 377 EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
 378 EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
 379 EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
 380 EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
 381 EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
 382         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 383         ADD     src, src, 4*NBYTES
 384 #ifdef CONFIG_CPU_SB1
 385         nop                             # improves slotting
 386 #endif
 387 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
 388 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
 389 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
 390 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
 391         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 392         .set    reorder                         /* DADDI_WAR */
 393         ADD     dst, dst, 4*NBYTES
 394         bne     len, rem, 1b
 395         .set    noreorder
 396
 397 .Lcleanup_src_unaligned:
 398         beqz    len, .Ldone
 399          and    rem, len, NBYTES-1  # rem = len % NBYTES
 400         beq     rem, len, .Lcopy_bytes  # len < NBYTES: no whole unit left
 401          nop
 402 1:                              /* copy one unit per iteration until len == rem */
 403         R10KCBARRIER(0(ra))
 404 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 405 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 406         ADD     src, src, NBYTES
 407         SUB     len, len, NBYTES
 408 EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
 409         .set    reorder                         /* DADDI_WAR */
 410         ADD     dst, dst, NBYTES
 411         bne     len, rem, 1b
 412         .set    noreorder
 413
 414 .Lcopy_bytes_checklen:          /* entered with len < NBYTES, len may be 0 */
 415         beqz    len, .Ldone
 416          nop
 417 .Lcopy_bytes:
 418         /* 0 < len < NBYTES; each sb below sits in a branch or jump delay slot */
 419         R10KCBARRIER(0(ra))
 420 #define COPY_BYTE(N)                    \
 421 EXC(    lb      t0, N(src), .Ll_exc);   \
 422         SUB     len, len, 1;            \
 423         beqz    len, .Ldone;            \
 424 EXC(     sb     t0, N(dst), .Ls_exc_p1)
 425
 426         COPY_BYTE(0)
 427         COPY_BYTE(1)
 428 #ifdef USE_DOUBLE
 429         COPY_BYTE(2)
 430         COPY_BYTE(3)
 431         COPY_BYTE(4)
 432         COPY_BYTE(5)
 433 #endif
 434 EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
 435         SUB     len, len, 1             # last byte that can remain, as len < NBYTES
 436         jr      ra
 437 EXC(     sb     t0, NBYTES-2(dst), .Ls_exc_p1)
 438 .Ldone:                         /* common return; len is left as the caller finds it */
 439         jr      ra
 440          nop
 441         END(memcpy)
 442
 443 .Ll_exc_copy:
 444         /*
 445          * Copy bytes from src until faulting load address (or until a
 446          * lb faults)
 447          *
 448          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 449          * may be more than a byte beyond the last address.
 450          * Hence, the lb below may get an exception.
 451          *
 452          * Assumes src < THREAD_BUADDR($28)
 453          */
 454         LOAD    t0, TI_TASK($28)        # presumably the task pointer of the thread_info in $28
 455          nop
 456         LOAD    t0, THREAD_BUADDR(t0)   # t0 = address at which the byte copy stops
 457 1:
 458 EXC(    lb      t1, 0(src),     .Ll_exc)
 459         ADD     src, src, 1
 460         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 461         .set    reorder                         /* DADDI_WAR */
 462         ADD     dst, dst, 1
 463         bne     src, t0, 1b     # falls through into .Ll_exc once src reaches t0
 464         .set    noreorder
 465 .Ll_exc:
 466         LOAD    t0, TI_TASK($28)
 467          nop
 468         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 469          nop
 470         SUB     len, AT, t0             # len number of uncopied bytes
 471         bnez    t6, .Ldone      /* Skip the zeroing part if inatomic */
 472         /*
 473          * Here's where we rely on src and dst being incremented in tandem,
 474          *   See (3) above.
 475          * dst += (fault addr - src) to put dst at first byte to clear
 476          */
 477         ADD     dst, t0                 # delay slot; compute start address in dst (a0)
 478         SUB     dst, src
 479         /*
 480          * Clear len bytes starting at dst.  Can't call __bzero because it
 481          * might modify len.  An inefficient loop for these rare times...
 482          */
 483         .set    reorder                         /* DADDI_WAR */
 484         SUB     src, len, 1             # src is reused as the loop counter
 485         beqz    len, .Ldone
 486         .set    noreorder
 487 1:      sb      zero, 0(dst)
 488         ADD     dst, dst, 1
 489 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 490         bnez    src, 1b
 491          SUB    src, src, 1
 492 #else
 493         .set    push
 494         .set    noat
 495         li      v1, 1
 496         bnez    src, 1b
 497          SUB    src, src, v1
 498         .set    pop
 499 #endif
 500         jr      ra                      # len still holds the uncopied byte count
 501          nop
 502
 503
 504 #define SEXC(n)                                                 \
 505         .set    reorder;                        /* DADDI_WAR */ \
 506 .Ls_exc_p ## n ## u:                                            \
 507         ADD     len, len, n*NBYTES; /* n units not yet stored */ \
 508         jr      ra;                                             \
 509         .set    noreorder
 510
 511 SEXC(8)                         /* one store fault handler per position in a group */
 512 SEXC(7)
 513 SEXC(6)
 514 SEXC(5)
 515 SEXC(4)
 516 SEXC(3)
 517 SEXC(2)
 518 SEXC(1)
 519
 520 .Ls_exc_p1:                     /* faulting byte store: len was already decremented */
 521         .set    reorder                         /* DADDI_WAR */
 522         ADD     len, len, 1
 523         jr      ra
 524         .set    noreorder
 525 .Ls_exc:                        /* faulting store that needs no adjustment of len */
 526         jr      ra
 527          nop
 528
 529         .align  5
 530 LEAF(memmove)                                   /* a0=dst a1=src a2=len */
 531         ADD     t0, a0, a2                      # t0 = dst + len
 532         ADD     t1, a1, a2                      # t1 = src + len
 533         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
 534         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
 535         and     t0, t1                          # nonzero only if the ranges overlap
 536         beqz    t0, .L__memcpy
 537          move   v0, a0                          /* return value */
 538         beqz    a2, .Lr_out                     # delay slot is the sltu that follows
 539         END(memmove)
 540
 541         /* fall through to __rmemcpy; its loops decrement a2 before testing it, so len must be nonzero */
 542 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 543          sltu   t0, a1, a0                      # t0 = (src < dst)
 544         beqz    t0, .Lr_end_bytes_up            # src >= dst
 545          nop
 546         ADD     a0, a2                          # dst = dst + len
 547         ADD     a1, a2                          # src = src + len
 548
 549 .Lr_end_bytes:                  /* src < dst: copy byte by byte from the end downwards */
 550         R10KCBARRIER(0(ra))
 551         lb      t0, -1(a1)
 552         SUB     a2, a2, 0x1
 553         sb      t0, -1(a0)
 554         SUB     a1, a1, 0x1
 555         .set    reorder                         /* DADDI_WAR */
 556         SUB     a0, a0, 0x1
 557         bnez    a2, .Lr_end_bytes
 558         .set    noreorder
 559
 560 .Lr_out:
 561         jr      ra
 562          move   a2, zero                        # delay slot: len = 0
 563
 564 .Lr_end_bytes_up:               /* src >= dst: copy byte by byte upwards */
 565         R10KCBARRIER(0(ra))
 566         lb      t0, (a1)
 567         SUB     a2, a2, 0x1
 568         sb      t0, (a0)
 569         ADD     a1, a1, 0x1
 570         .set    reorder                         /* DADDI_WAR */
 571         ADD     a0, a0, 0x1
 572         bnez    a2, .Lr_end_bytes_up
 573         .set    noreorder
 574
 575         jr      ra
 576          move   a2, zero                        # delay slot: len = 0
 577         END(__rmemcpy)