arch/mips/lib/csum_partial.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Quick'n'dirty IP checksum ...
   7  *
   8  * Copyright (C) 1998, 1999 Ralf Baechle
   9  * Copyright (C) 1999 Silicon Graphics, Inc.
  10  * Copyright (C) 2007  Maciej W. Rozycki
  11  */
  12 #include <linux/errno.h>
  13 #include <asm/asm.h>
  14 #include <asm/asm-offsets.h>
  15 #include <asm/regdef.h>
  16
  17 #ifdef CONFIG_64BIT
  18 /*
  19  * As we are sharing the code base with the mips32 tree (which uses the o32
  20  * ABI register definitions), we need to redefine the register definitions
  21  * from the n64 ABI register naming to the o32 ABI register naming.
      *
      * NOTE(review): t4..t7 are defined without a preceding #undef, so the
      * included register definitions presumably do not provide them in this
      * configuration -- confirm against asm/regdef.h.
  22  */
  23 #undef t0
  24 #undef t1
  25 #undef t2
  26 #undef t3
  27 #define t0      $8
  28 #define t1      $9
  29 #define t2      $10
  30 #define t3      $11
  31 #define t4      $12
  32 #define t5      $13
  33 #define t6      $14
  34 #define t7      $15
  35
  36 #define USE_DOUBLE              /* selects the 8-byte LOAD/ADD/NBYTES variants */
  37 #endif
  38
  39 #ifdef USE_DOUBLE
  40
  41 #define LOAD   ld               /* load one unit (NBYTES bytes) */
  42 #define LOAD32 lwu              /* load 32 bits, zero-extended */
  43 #define ADD    daddu            /* unit-wide add */
  44 #define NBYTES 8                /* bytes per unit */
  45
  46 #else
  47
  48 #define LOAD   lw               /* load one unit (NBYTES bytes) */
  49 #define LOAD32 lw               /* load 32 bits */
  50 #define ADD    addu             /* unit-wide add */
  51 #define NBYTES 4                /* bytes per unit */
  52
  53 #endif /* USE_DOUBLE */
  54
  55 #define UNIT(unit)  ((unit)*NBYTES)     /* byte offset of unit number 'unit' */
  56
     /*
      * ADDC(sum, reg):   sum += reg using the unit-wide ADD, then add the carry
      *                   out of that addition back into sum (end-around carry).
      * ADDC32(sum, reg): the same, but always with the 32-bit addu.
      *
      * Both macros overwrite v1 with the carry (0 or 1).  The carry is detected
      * with sltu: after the add, sum < reg exactly when the add wrapped.
      */
  57 #define ADDC(sum,reg)                                           \
  58         ADD     sum, reg;                                       \
  59         sltu    v1, sum, reg;                                   \
  60         ADD     sum, v1;                                        \
  61
  62 #define ADDC32(sum,reg)                                         \
  63         addu    sum, reg;                                       \
  64         sltu    v1, sum, reg;                                   \
  65         addu    sum, v1;                                        \
  66
  67 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
  68         LOAD    _t0, (offset + UNIT(0))(src);                   \
  69         LOAD    _t1, (offset + UNIT(1))(src);                   \
  70         LOAD    _t2, (offset + UNIT(2))(src);                   \
  71         LOAD    _t3, (offset + UNIT(3))(src);                   \
  72         ADDC(sum, _t0);                                         \
  73         ADDC(sum, _t1);                                         \
  74         ADDC(sum, _t2);                                         \
  75         ADDC(sum, _t3)
  76
     /*
      * CSUM_BIGCHUNK1 (above) loads four consecutive units starting at
      * offset(src) into the given temporaries and adds each one into sum with
      * ADDC, i.e. it covers 4*NBYTES bytes.
      *
      * CSUM_BIGCHUNK (below) always covers 0x20 bytes: a single CSUM_BIGCHUNK1
      * when a unit is 8 bytes, or two of them (at offset and offset + 0x10)
      * when a unit is 4 bytes.
      */
  77 #ifdef USE_DOUBLE
  78 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  79         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
  80 #else
  81 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  82         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
  83         CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
  84 #endif
  85
  86 /*
      * csum_partial: add up the bytes of a buffer with end-around carry and
      * combine the result with a caller-supplied partial checksum.
      *
  87  * a0: source address
  88  * a1: length of the area to checksum
  89  * a2: partial checksum
      *
      * Returns in v0 the buffer sum added to a2 with ADDC32.  With USE_DOUBLE
      * the running sum is first folded from 64 to 32 bits.  If the buffer
      * starts at an odd address, the two bytes of each halfword of the sum are
      * swapped before a2 is added.
      *
      * Modifies a0, a1, t0-t4, t7, t8 and v1.  The code runs under
      * ".set noreorder", so the instruction after each branch (indented by one
      * extra space) is its delay slot and executes whether or not the branch
      * is taken.
  90  */
  91
  92 #define src a0
  93 #define sum v0
  94
  95         .text
  96         .set    noreorder
  97         .align  5
  98 LEAF(csum_partial)
  99         move    sum, zero                       /* running sum = 0 */
 100         move    t7, zero                        /* t7 = "buffer starts odd" flag */
 101
 102         sltiu   t8, a1, 0x8
 103         bnez    t8, .Lsmall_csumcpy             /* < 8 bytes to checksum */
 104          move   t2, a1                          /* t2 = byte count used by .Lsmall_csumcpy */
 105
 106         andi    t7, src, 0x1                    /* odd buffer? */
 107
 108 .Lhword_align:
 109         beqz    t7, .Lword_align
 110          andi   t8, src, 0x2                    /* t8 = bit 1 of src */
 111
 112         lbu     t0, (src)                       /* leading byte at the odd address */
 113         LONG_SUBU       a1, a1, 0x1
 114 #ifdef __MIPSEL__
 115         sll     t0, t0, 8                       /* little endian: place it in bits 15..8 */
 116 #endif
 117         ADDC(sum, t0)
 118         PTR_ADDU        src, src, 0x1
 119         andi    t8, src, 0x2                    /* recompute for the advanced src */
 120
 121 .Lword_align:
 122         beqz    t8, .Ldword_align
 123          sltiu  t8, a1, 56                      /* t8 = (remaining length < 56) */
 124
 125         lhu     t0, (src)                       /* one halfword to reach 4-byte alignment */
 126         LONG_SUBU       a1, a1, 0x2
 127         ADDC(sum, t0)
 128         sltiu   t8, a1, 56                      /* recompute for the reduced length */
 129         PTR_ADDU        src, src, 0x2
 130
 131 .Ldword_align:
 132         bnez    t8, .Ldo_end_words              /* < 56 bytes: go straight to the word loop */
 133          move   t8, a1                          /* t8 = length; .Ldo_end_words loops t8 >> 2 times */
 134
 135         andi    t8, src, 0x4
 136         beqz    t8, .Lqword_align
 137          andi   t8, src, 0x8
 138
 139         LOAD32  t0, 0x00(src)                   /* one word to reach 8-byte alignment */
 140         LONG_SUBU       a1, a1, 0x4
 141         ADDC(sum, t0)
 142         PTR_ADDU        src, src, 0x4
 143         andi    t8, src, 0x8
 144
 145 .Lqword_align:
 146         beqz    t8, .Loword_align
 147          andi   t8, src, 0x10
 148
     /* eight bytes to reach 16-byte alignment */
 149 #ifdef USE_DOUBLE
 150         ld      t0, 0x00(src)
 151         LONG_SUBU       a1, a1, 0x8
 152         ADDC(sum, t0)
 153 #else
 154         lw      t0, 0x00(src)
 155         lw      t1, 0x04(src)
 156         LONG_SUBU       a1, a1, 0x8
 157         ADDC(sum, t0)
 158         ADDC(sum, t1)
 159 #endif
 160         PTR_ADDU        src, src, 0x8
 161         andi    t8, src, 0x10
 162
 163 .Loword_align:
 164         beqz    t8, .Lbegin_movement
 165          LONG_SRL       t8, a1, 0x7             /* t8 = number of 128-byte blocks */
 166
     /* sixteen bytes to reach 32-byte alignment */
 167 #ifdef USE_DOUBLE
 168         ld      t0, 0x00(src)
 169         ld      t1, 0x08(src)
 170         ADDC(sum, t0)
 171         ADDC(sum, t1)
 172 #else
 173         CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
 174 #endif
 175         LONG_SUBU       a1, a1, 0x10
 176         PTR_ADDU        src, src, 0x10
 177         LONG_SRL        t8, a1, 0x7             /* recompute the 128-byte block count */
 178
     /*
      * From here on a1 is no longer decremented; its low bits (0x40, 0x20,
      * 0x1c, 0x3) select which of the remaining chunk sizes still have to be
      * summed.
      */
 179 .Lbegin_movement:
 180         beqz    t8, 1f
 181          andi   t2, a1, 0x40                    /* t2 != 0: a 64-byte chunk follows */
 182
 183 .Lmove_128bytes:
 184         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 185         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 186         CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 187         CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 188         LONG_SUBU       t8, t8, 0x01
 189         .set    reorder                         /* DADDI_WAR */
 190         PTR_ADDU        src, src, 0x80
 191         bnez    t8, .Lmove_128bytes
 192         .set    noreorder
 193
 194 1:
 195         beqz    t2, 1f
 196          andi   t2, a1, 0x20                    /* t2 != 0: a 32-byte chunk follows */
 197
 198 .Lmove_64bytes:
 199         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 200         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 201         PTR_ADDU        src, src, 0x40
 202
 203 1:
 204         beqz    t2, .Ldo_end_words
 205          andi   t8, a1, 0x1c                    /* t8 = bytes left in whole words */
 206
 207 .Lmove_32bytes:
 208         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 209         andi    t8, a1, 0x1c
 210         PTR_ADDU        src, src, 0x20
 211
 212 .Ldo_end_words:
 213         beqz    t8, .Lsmall_csumcpy
 214          andi   t2, a1, 0x3                     /* t2 = trailing bytes after the words */
 215         LONG_SRL        t8, t8, 0x2             /* bytes -> word count */
 216
 217 .Lend_words:
 218         LOAD32  t0, (src)
 219         LONG_SUBU       t8, t8, 0x1
 220         ADDC(sum, t0)
 221         .set    reorder                         /* DADDI_WAR */
 222         PTR_ADDU        src, src, 0x4
 223         bnez    t8, .Lend_words
 224         .set    noreorder
 225
 226 /* unknown src alignment and < 8 bytes to go  */
 227 .Lsmall_csumcpy:
 228         move    a1, t2                          /* a1 = bytes still to sum */
 229
 230         andi    t0, a1, 4
 231         beqz    t0, 1f
 232          andi   t0, a1, 2                       /* t0 != 0: a halfword follows */
 233
 234         /* Still a full word to go  */
 235         ulw     t1, (src)
 236         PTR_ADDIU       src, 4
 237 #ifdef USE_DOUBLE
 238         dsll    t1, t1, 32                      /* clear lower 32bit */
 239 #endif
 240         ADDC(sum, t1)
 241
 242 1:      move    t1, zero                        /* t1 collects the last halfword and byte */
 243         beqz    t0, 1f
 244          andi   t0, a1, 1                       /* t0 != 0: a final byte follows */
 245
 246         /* Still a halfword to go  */
 247         ulhu    t1, (src)
 248         PTR_ADDIU       src, 2
 249
 250 1:      beqz    t0, 1f
 251          sll    t1, t1, 16                      /* always runs: halfword (or 0) to bits 31..16 */
 252
 253         lbu     t2, (src)
 254          nop
 255
 256 #ifdef __MIPSEB__
 257         sll     t2, t2, 8                       /* big endian: last byte goes to bits 15..8 */
 258 #endif
 259         or      t1, t2
 260
 261 1:      ADDC(sum, t1)
 262
 263         /* fold checksum */
     /* 64-bit sum -> 32 bits: upper half + lower half + carry of that addition */
 264 #ifdef USE_DOUBLE
 265         dsll32  v1, sum, 0                      /* v1 = lower half moved to the upper half */
 266         daddu   sum, v1                         /* upper half of sum = upper + lower */
 267         sltu    v1, sum, v1                     /* v1 = carry out of that addition */
 268         dsra32  sum, sum, 0                     /* bring the upper half down */
 269         addu    sum, v1                         /* add the carry back in */
 270 #endif
 271
 272         /* odd buffer alignment? */
 273 #ifdef CONFIG_CPU_MIPSR2
 274         wsbh    v1, sum                         /* v1 = sum with the bytes of each halfword swapped */
 275         movn    sum, v1, t7                     /* take it only if the buffer started odd */
 276 #else
 277         beqz    t7, 1f                  /* odd buffer alignment? */
 278          lui    v1, 0x00ff
 279         addu    v1, 0x00ff                      /* v1 = 0x00ff00ff */
 280         and     t0, sum, v1                     /* low byte of each halfword ... */
 281         sll     t0, t0, 8                       /* ... moved to the high byte position */
 282         srl     sum, sum, 8
 283         and     sum, sum, v1                    /* high byte of each halfword moved low */
 284         or      sum, sum, t0
 285 1:
 286 #endif
 287         .set    reorder
 288         /* Add the passed partial csum.  */
 289         ADDC32(sum, a2)
 290         jr      ra
 291         .set    noreorder
 292         END(csum_partial)
 293
 294
 295 /*
 296  * checksum and copy routines based on memcpy.S
 297  *
 298  *      csum_partial_copy_nocheck(src, dst, len, sum)
 299  *      __csum_partial_copy_user(src, dst, len, sum, errp)
 300  *
 301  * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 302  * functions in this file use the standard calling convention.
 303  */
 304
     /* Register roles for the checksum-and-copy routines below. */
 305 #define src a0                  /* source pointer, advanced as bytes are consumed */
 306 #define dst a1                  /* destination pointer */
 307 #define len a2                  /* bytes still to copy */
 308 #define psum a3                 /* caller's partial checksum, added in at the end */
 309 #define sum v0                  /* running sum and return value */
 310 #define odd t8                  /* non-zero when dst started at an odd address */
 311 #define errptr t9               /* where -EFAULT is stored on a fault */
 312
 313 /*
 314  * The exception handler for loads requires that:
 315  *  1- AT contain the address of the byte just past the end of the source
 316  *     of the copy,
 317  *  2- src_entry <= src < AT, and
 318  *  3- (dst - src) == (dst_entry - src_entry),
 319  * The _entry suffix denotes values when __copy_user was called.
 320  *
 321  * (1) is set up by __csum_partial_copy_user and maintained by
 322  *      not writing AT in __csum_partial_copy
 323  * (2) is met by incrementing src by the number of bytes copied
 324  * (3) is met by not doing loads between a pair of increments of dst and src
 325  *
 326  * The exception handlers for stores store -EFAULT to errptr and return.
 327  * These handlers do not need to overwrite any data.
 328  */
 329
     /*
      * EXC(inst_reg, addr, handler): emit the instruction "inst_reg, addr" at
      * local label 9 and record the pair (address of that instruction, handler)
      * in the __ex_table section, then switch back to the previous section.
      * The first two macro arguments are simply the instruction text split at
      * its comma, e.g. EXC( LOAD t0, UNIT(0)(src), .Ll_exc).
      */
 330 #define EXC(inst_reg,addr,handler)              \
 331 9:      inst_reg, addr;                         \
 332         .section __ex_table,"a";                \
 333         PTR     9b, handler;                    \
 334         .previous
 335
 336 #ifdef USE_DOUBLE
 337
     /* 8-byte units: doubleword forms of every operation used by the copy loop */
 338 #define LOAD   ld
 339 #define LOADL  ldl
 340 #define LOADR  ldr
 341 #define STOREL sdl
 342 #define STORER sdr
 343 #define STORE  sd
 344 #define ADD    daddu
 345 #define SUB    dsubu
 346 #define SRL    dsrl
 347 #define SLL    dsll
 348 #define SLLV   dsllv
 349 #define SRLV   dsrlv
 350 #define NBYTES 8
 351 #define LOG_NBYTES 3
 352
 353 #else
 354
     /* 4-byte units: word forms of the same operations */
 355 #define LOAD   lw
 356 #define LOADL  lwl
 357 #define LOADR  lwr
 358 #define STOREL swl
 359 #define STORER swr
 360 #define STORE  sw
 361 #define ADD    addu
 362 #define SUB    subu
 363 #define SRL    srl
 364 #define SLL    sll
 365 #define SLLV   sllv
 366 #define SRLV   srlv
 367 #define NBYTES 4
 368 #define LOG_NBYTES 2
 369
 370 #endif /* USE_DOUBLE */
 371
     /*
      * Endian-dependent names.  LDFIRST/STFIRST are used with the FIRST(n)
      * address of an unaligned unit and LDREST/STREST with its REST(n) address.
      * SHIFT_DISCARD shifts out the bytes of a unit that must not be summed and
      * SHIFT_DISCARD_REVERT shifts the kept bytes back to where they were.
      */
 372 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 373 #define LDFIRST LOADR
 374 #define LDREST  LOADL
 375 #define STFIRST STORER
 376 #define STREST  STOREL
 377 #define SHIFT_DISCARD SLLV
 378 #define SHIFT_DISCARD_REVERT SRLV
 379 #else
 380 #define LDFIRST LOADL
 381 #define LDREST  LOADR
 382 #define STFIRST STOREL
 383 #define STREST  STORER
 384 #define SHIFT_DISCARD SRLV
 385 #define SHIFT_DISCARD_REVERT SLLV
 386 #endif
 387
 388 #define FIRST(unit) ((unit)*NBYTES)             /* offset of the first byte of a unit */
 389 #define REST(unit)  (FIRST(unit)+NBYTES-1)      /* offset of the last byte of a unit */
 390
 391 #define ADDRMASK (NBYTES-1)                     /* low address bits: offset within a unit */
 392
     /*
      * The copy routine keeps the end-of-source address in AT, so the assembler
      * must not use AT as its own temporary.
      * NOTE(review): with CONFIG_CPU_DADDI_WORKAROUNDS v1 is handed to the
      * assembler instead, presumably because the workaround's expansions need
      * a temporary -- confirm.
      */
 393 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 394         .set    noat
 395 #else
 396         .set    at=v1
 397 #endif
 398
     /*
      * __csum_partial_copy_user(src, dst, len, sum, errp)
      * csum_partial_copy_nocheck(src, dst, len, sum)
      *
      * Copy len bytes from src (a0) to dst (a1) while summing them with
      * end-around carry.  The result returned in v0 is that sum (folded to 32
      * bits with USE_DOUBLE, halfword bytes swapped if dst started at an odd
      * address) added to the caller's partial sum in a3.
      *
      * __csum_partial_copy_user additionally records src + len in AT and takes
      * a fifth argument, an error pointer (a4 on 64-bit, the stack slot at
      * 16(sp) otherwise).  csum_partial_copy_nocheck enters below that setup.
      *
      * Faults:
      *  - on a load (.Ll_exc_copy / .Ll_exc): bytes before the faulting address
      *    are copied where possible, the rest of dst is zero-filled, -EFAULT is
      *    stored through errptr and the sum gathered so far is returned via the
      *    normal .Ldone path;
      *  - on a store (.Ls_exc): v0 = -1, -EFAULT is stored through errptr and
      *    the routine returns at once.
      *
      * The code runs under ".set noreorder"; the instruction after each branch
      * (indented by one extra space) is its delay slot.
      */
 399 LEAF(__csum_partial_copy_user)
 400         PTR_ADDU        AT, src, len    /* See (1) above. */
 401 #ifdef CONFIG_64BIT
 402         move    errptr, a4              /* fifth argument arrives in a register */
 403 #else
 404         lw      errptr, 16(sp)          /* fifth argument arrives on the stack */
 405 #endif
 406 FEXPORT(csum_partial_copy_nocheck)
 407         move    sum, zero
 408         move    odd, zero
 409         /*
 410          * Note: dst & src may be unaligned, len may be 0
 411          * Temps
 412          */
 413         /*
 414          * The "issue break"s below are very approximate.
 415          * Issue delays for dcache fills will perturb the schedule, as will
 416          * load queue full replay traps, etc.
 417          *
 418          * If len < NBYTES use byte operations.
 419          */
 420         sltu    t2, len, NBYTES
 421         and     t1, dst, ADDRMASK       /* t1 = offset of dst within a unit */
 422         bnez    t2, .Lcopy_bytes_checklen
 423          and    t0, src, ADDRMASK       /* t0 = offset of src within a unit */
 424         andi    odd, dst, 0x1                   /* odd buffer? */
 425         bnez    t1, .Ldst_unaligned
 426          nop
 427         bnez    t0, .Lsrc_unaligned_dst_aligned
 428         /*
 429          * use delay slot for fall-through
 430          * src and dst are aligned; need to compute rem
 431          */
 432 .Lboth_aligned:
 433          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 434         beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
 435          nop
 436         SUB     len, 8*NBYTES           # subtract here for bgez loop
 437         .align  4
     /* main loop: load 8 units, then store and sum them */
 438 1:
 439 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 440 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 441 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 442 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 443 EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
 444 EXC(    LOAD    t5, UNIT(5)(src),       .Ll_exc_copy)
 445 EXC(    LOAD    t6, UNIT(6)(src),       .Ll_exc_copy)
 446 EXC(    LOAD    t7, UNIT(7)(src),       .Ll_exc_copy)
 447         SUB     len, len, 8*NBYTES
 448         ADD     src, src, 8*NBYTES
 449 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 450         ADDC(sum, t0)
 451 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 452         ADDC(sum, t1)
 453 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 454         ADDC(sum, t2)
 455 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 456         ADDC(sum, t3)
 457 EXC(    STORE   t4, UNIT(4)(dst),       .Ls_exc)
 458         ADDC(sum, t4)
 459 EXC(    STORE   t5, UNIT(5)(dst),       .Ls_exc)
 460         ADDC(sum, t5)
 461 EXC(    STORE   t6, UNIT(6)(dst),       .Ls_exc)
 462         ADDC(sum, t6)
 463 EXC(    STORE   t7, UNIT(7)(dst),       .Ls_exc)
 464         ADDC(sum, t7)
 465         .set    reorder                         /* DADDI_WAR */
 466         ADD     dst, dst, 8*NBYTES
 467         bgez    len, 1b
 468         .set    noreorder
 469         ADD     len, 8*NBYTES           # revert len (see above)
 470
 471         /*
 472          * len == the number of bytes left to copy < 8*NBYTES
 473          */
 474 .Lcleanup_both_aligned:
 475 #define rem t7
 476         beqz    len, .Ldone
 477          sltu   t0, len, 4*NBYTES
 478         bnez    t0, .Lless_than_4units
 479          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 480         /*
 481          * len >= 4*NBYTES
 482          */
 483 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 484 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 485 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 486 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 487         SUB     len, len, 4*NBYTES
 488         ADD     src, src, 4*NBYTES
 489 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 490         ADDC(sum, t0)
 491 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 492         ADDC(sum, t1)
 493 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 494         ADDC(sum, t2)
 495 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 496         ADDC(sum, t3)
 497         .set    reorder                         /* DADDI_WAR */
 498         ADD     dst, dst, 4*NBYTES
 499         beqz    len, .Ldone
 500         .set    noreorder
 501 .Lless_than_4units:
 502         /*
 503          * rem = len % NBYTES
 504          */
 505         beq     rem, len, .Lcopy_bytes
 506          nop
     /* one unit per iteration until only rem bytes are left */
 507 1:
 508 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 509         ADD     src, src, NBYTES
 510         SUB     len, len, NBYTES
 511 EXC(    STORE   t0, 0(dst),             .Ls_exc)
 512         ADDC(sum, t0)
 513         .set    reorder                         /* DADDI_WAR */
 514         ADD     dst, dst, NBYTES
 515         bne     rem, len, 1b
 516         .set    noreorder
 517
 518         /*
 519          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 520          * A loop would do only a byte at a time with possible branch
 521          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 522          * because can't assume read-access to dst.  Instead, use
 523          * STREST dst, which doesn't require read access to dst.
 524          *
 525          * This code should perform better than a simple loop on modern,
 526          * wide-issue mips processors because the code has fewer branches and
 527          * more instruction-level parallelism.
 528          */
 529 #define bits t2
 530         beqz    len, .Ldone
 531          ADD    t1, dst, len    # t1 is just past last byte of dst
 532         li      bits, 8*NBYTES
 533         SLL     rem, len, 3     # rem = number of bits to keep
 534 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 535         SUB     bits, bits, rem # bits = number of bits to discard
 536         SHIFT_DISCARD t0, t0, bits
 537 EXC(    STREST  t0, -1(t1),             .Ls_exc)
 538         SHIFT_DISCARD_REVERT t0, t0, bits
 539         .set reorder
 540         ADDC(sum, t0)
 541         b       .Ldone
 542         .set noreorder
 543 .Ldst_unaligned:
 544         /*
 545          * dst is unaligned
 546          * t0 = src & ADDRMASK
 547          * t1 = dst & ADDRMASK; t1 > 0
 548          * len >= NBYTES
 549          *
 550          * Copy enough bytes to align dst
 551          * Set match = (src and dst have same alignment)
 552          */
 553 #define match rem
 554 EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
 555         ADD     t2, zero, NBYTES
 556 EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
 557         SUB     t2, t2, t1      # t2 = number of bytes copied
 558         xor     match, t0, t1
 559 EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
 560         SLL     t4, t1, 3               # t4 = number of bits to discard
 561         SHIFT_DISCARD t3, t3, t4
 562         /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
 563         ADDC(sum, t3)
 564         beq     len, t2, .Ldone
 565          SUB    len, len, t2
 566         ADD     dst, dst, t2
 567         beqz    match, .Lboth_aligned
 568          ADD    src, src, t2
 569
 570 .Lsrc_unaligned_dst_aligned:
 571         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 572         beqz    t0, .Lcleanup_src_unaligned
 573          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 574 1:
 575 /*
 576  * Avoid consecutive LD*'s to the same register since some mips
 577  * implementations can't issue them in the same cycle.
 578  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 579  * are to the same unit (unless src is aligned, but it's not).
 580  */
 581 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 582 EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
 583         SUB     len, len, 4*NBYTES
 584 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 585 EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
 586 EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
 587 EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
 588 EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
 589 EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
 590         ADD     src, src, 4*NBYTES
 591 #ifdef CONFIG_CPU_SB1
 592         nop                             # improves slotting
 593 #endif
 594 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 595         ADDC(sum, t0)
 596 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 597         ADDC(sum, t1)
 598 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 599         ADDC(sum, t2)
 600 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 601         ADDC(sum, t3)
 602         .set    reorder                         /* DADDI_WAR */
 603         ADD     dst, dst, 4*NBYTES
 604         bne     len, rem, 1b
 605         .set    noreorder
 606
 607 .Lcleanup_src_unaligned:
 608         beqz    len, .Ldone
 609          and    rem, len, NBYTES-1  # rem = len % NBYTES
 610         beq     rem, len, .Lcopy_bytes
 611          nop
     /* one unaligned unit per iteration until only rem bytes are left */
 612 1:
 613 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 614 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 615         ADD     src, src, NBYTES
 616         SUB     len, len, NBYTES
 617 EXC(    STORE   t0, 0(dst),             .Ls_exc)
 618         ADDC(sum, t0)
 619         .set    reorder                         /* DADDI_WAR */
 620         ADD     dst, dst, NBYTES
 621         bne     len, rem, 1b
 622         .set    noreorder
 623
 624 .Lcopy_bytes_checklen:
 625         beqz    len, .Ldone
 626          nop
 627 .Lcopy_bytes:
 628         /* 0 < len < NBYTES  */
     /*
      * Copy the last bytes one at a time and assemble them in t2 at the
      * position each byte would have inside a unit (t3 is the running shift),
      * so that a single ADDC at .Lcopy_bytes_done accounts for all of them.
      */
 629 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 630 #define SHIFT_START 0
 631 #define SHIFT_INC 8
 632 #else
 633 #define SHIFT_START 8*(NBYTES-1)
 634 #define SHIFT_INC -8
 635 #endif
 636         move    t2, zero        # partial word
 637         li      t3, SHIFT_START # shift
 638 /* use .Ll_exc_copy here to return correct sum on fault */
 639 #define COPY_BYTE(N)                    \
 640 EXC(    lbu     t0, N(src), .Ll_exc_copy);      \
 641         SUB     len, len, 1;            \
 642 EXC(    sb      t0, N(dst), .Ls_exc);   \
 643         SLLV    t0, t0, t3;             \
 644         addu    t3, SHIFT_INC;          \
 645         beqz    len, .Lcopy_bytes_done; \
 646          or     t2, t0
 647
 648         COPY_BYTE(0)
 649         COPY_BYTE(1)
 650 #ifdef USE_DOUBLE
 651         COPY_BYTE(2)
 652         COPY_BYTE(3)
 653         COPY_BYTE(4)
 654         COPY_BYTE(5)
 655 #endif
     /* last possible byte (index NBYTES-2): no length test needed afterwards */
 656 EXC(    lbu     t0, NBYTES-2(src), .Ll_exc_copy)
 657         SUB     len, len, 1
 658 EXC(    sb      t0, NBYTES-2(dst), .Ls_exc)
 659         SLLV    t0, t0, t3
 660         or      t2, t0
 661 .Lcopy_bytes_done:
 662         ADDC(sum, t2)
 663 .Ldone:
 664         /* fold checksum */
     /* 64-bit sum -> 32 bits: upper half + lower half + carry of that addition */
 665 #ifdef USE_DOUBLE
 666         dsll32  v1, sum, 0
 667         daddu   sum, v1
 668         sltu    v1, sum, v1
 669         dsra32  sum, sum, 0
 670         addu    sum, v1
 671 #endif
 672
     /* if dst started at an odd address, swap the two bytes of each halfword */
 673 #ifdef CONFIG_CPU_MIPSR2
 674         wsbh    v1, sum
 675         movn    sum, v1, odd
 676 #else
 677         beqz    odd, 1f                 /* odd buffer alignment? */
 678          lui    v1, 0x00ff
 679         addu    v1, 0x00ff              /* v1 = 0x00ff00ff */
 680         and     t0, sum, v1
 681         sll     t0, t0, 8
 682         srl     sum, sum, 8
 683         and     sum, sum, v1
 684         or      sum, sum, t0
 685 1:
 686 #endif
 687         .set reorder
 688         ADDC32(sum, psum)               /* add the caller's partial sum */
 689         jr      ra
 690         .set noreorder
 691
 692 .Ll_exc_copy:
 693         /*
 694          * Copy bytes from src until faulting load address (or until a
 695          * lb faults)
 696          *
 697          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 698          * may be more than a byte beyond the last address.
 699          * Hence, the lb below may get an exception.
 700          *
 701          * Assumes src < THREAD_BUADDR($28)
 702          */
 703         LOAD    t0, TI_TASK($28)
 704          li     t2, SHIFT_START         /* t2 = shift applied to the next byte summed */
 705         LOAD    t0, THREAD_BUADDR(t0)
 706 1:
 707 EXC(    lbu     t1, 0(src),     .Ll_exc)
 708         ADD     src, src, 1
 709         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 710         SLLV    t1, t1, t2
 711         addu    t2, SHIFT_INC
 712         ADDC(sum, t1)
 713         .set    reorder                         /* DADDI_WAR */
 714         ADD     dst, dst, 1
 715         bne     src, t0, 1b
 716         .set    noreorder
 717 .Ll_exc:
 718         LOAD    t0, TI_TASK($28)
 719          nop
 720         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 721          nop
 722         SUB     len, AT, t0             # len number of uncopied bytes
 723         /*
 724          * Here's where we rely on src and dst being incremented in tandem,
 725          *   See (3) above.
 726          * dst += (fault addr - src) to put dst at first byte to clear
 727          */
 728         ADD     dst, t0                 # compute start address in a1
 729         SUB     dst, src
 730         /*
 731          * Clear len bytes starting at dst.  Can't call __bzero because it
 732          * might modify len.  An inefficient loop for these rare times...
 733          */
 734         .set    reorder                         /* DADDI_WAR */
 735         SUB     src, len, 1             /* src is reused as the loop counter: len - 1 */
 736         beqz    len, .Ldone
 737         .set    noreorder
 738 1:      sb      zero, 0(dst)
 739         ADD     dst, dst, 1
 740         .set    push
 741         .set    noat
 742 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 743         bnez    src, 1b
 744          SUB    src, src, 1
 745 #else
 746         li      v1, 1
 747         bnez    src, 1b
 748          SUB    src, src, v1
 749 #endif
 750         li      v1, -EFAULT
 751         b       .Ldone
 752          sw     v1, (errptr)            /* delay slot: report the fault to the caller */
 753
 754 .Ls_exc:
 755         li      v0, -1 /* invalid checksum */
 756         li      v1, -EFAULT
 757         jr      ra
 758          sw     v1, (errptr)            /* delay slot: report the fault to the caller */
 759         .set    pop
 760         END(__csum_partial_copy_user)