arch/powerpc/lib/checksum_64.S

   1 /*
   2  * This file contains assembly-language implementations
   3  * of IP-style 1's complement checksum routines.
   4  *
   5  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6  *
   7  *  This program is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version
  10  *  2 of the License, or (at your option) any later version.
  11  *
  12  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
  13  */
  14
  15 #include <linux/sys.h>
  16 #include <asm/processor.h>
  17 #include <asm/errno.h>
  18 #include <asm/ppc_asm.h>
  19
  20 /*
  21  * Computes the checksum of a memory block at buff, length len,
  22  * and adds in "sum" (32-bit).
  23  *
  24  * __csum_partial(r3=buff, r4=len, r5=sum)
      *
      * Returns the 32-bit folded sum in r3.  The running sum lives in r0 as a
      * 64-bit value and carries are chained through XER[CA] by adde, so no
      * instruction that alters CA may be placed between the adde's.
      *
      * Clobbers r0, r4, r6, r7, r9-r12, ctr, cr0 and XER[CA].  r14-r16 are
      * saved in a temporary stack frame around the 64-byte unrolled loop
      * and restored before it is left.
  25  */
  26 _GLOBAL(__csum_partial)
  27         addic   r0,r5,0                 /* clear carry */
  28
  29         srdi.   r6,r4,3                 /* less than 8 bytes? */
  30         beq     .Lcsum_tail_word
  31
  32         /*
  33          * If only halfword aligned, align to a double word. Since odd
  34          * aligned addresses should be rare and they would require more
  35          * work to calculate the correct checksum, we ignore that case
  36          * and take the potential slowdown of unaligned loads.
  37          */
  38         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  39         beq     .Lcsum_aligned
  40
             /*
              * r6 is 1..3 here, so 4 - r6 halfwords bring r3 up to the next
              * 8-byte boundary.  len >= 8 was checked above and at most 6
              * bytes are consumed, so r4 cannot go negative.
              */
  41         li      r7,4
  42         sub     r6,r7,r6
  43         mtctr   r6
  44
  45 1:
  46         lhz     r6,0(r3)                /* align to doubleword */
  47         subi    r4,r4,2
  48         addi    r3,r3,2
  49         adde    r0,r0,r6
  50         bdnz    1b
  51
  52 .Lcsum_aligned:
  53         /*
  54          * We unroll the loop such that each iteration is 64 bytes with an
  55          * entry and exit limb of 64 bytes, meaning a minimum size of
  56          * 128 bytes.
  57          */
  58         srdi.   r6,r4,7
  59         beq     .Lcsum_tail_doublewords         /* len < 128 */
  60
             /* ctr = (len / 64) - 1: the last 64-byte chunk is the exit limb */
  61         srdi    r6,r4,6
  62         subi    r6,r6,1
  63         mtctr   r6
  64
  65         stdu    r1,-STACKFRAMESIZE(r1)
  66         std     r14,STK_REG(R14)(r1)
  67         std     r15,STK_REG(R15)(r1)
  68         std     r16,STK_REG(R16)(r1)
  69
             /* Entry limb: preload the first four doublewords of the chunk */
  70         ld      r6,0(r3)
  71         ld      r9,8(r3)
  72
  73         ld      r10,16(r3)
  74         ld      r11,24(r3)
  75
  76         /*
  77          * On POWER6 and POWER7 back to back addes take 2 cycles because of
  78          * the XER dependency. This means the fastest this loop can go is
  79          * 16 cycles per iteration. The scheduling of the loop below has
  80          * been shown to hit this on both POWER6 and POWER7.
  81          */
  82         .align 5
  83 2:
  84         adde    r0,r0,r6
  85         ld      r12,32(r3)
  86         ld      r14,40(r3)
  87
  88         adde    r0,r0,r9
  89         ld      r15,48(r3)
  90         ld      r16,56(r3)
  91         addi    r3,r3,64
  92
  93         adde    r0,r0,r10
  94
  95         adde    r0,r0,r11
  96
  97         adde    r0,r0,r12
  98
  99         adde    r0,r0,r14
 100
 101         adde    r0,r0,r15
 102         ld      r6,0(r3)
 103         ld      r9,8(r3)
 104
 105         adde    r0,r0,r16
 106         ld      r10,16(r3)
 107         ld      r11,24(r3)
 108         bdnz    2b
 109
 110
             /* Exit limb: sum the final 64-byte chunk, nothing left to preload */
 111         adde    r0,r0,r6
 112         ld      r12,32(r3)
 113         ld      r14,40(r3)
 114
 115         adde    r0,r0,r9
 116         ld      r15,48(r3)
 117         ld      r16,56(r3)
 118         addi    r3,r3,64
 119
 120         adde    r0,r0,r10
 121         adde    r0,r0,r11
 122         adde    r0,r0,r12
 123         adde    r0,r0,r14
 124         adde    r0,r0,r15
 125         adde    r0,r0,r16
 126
 127         ld      r14,STK_REG(R14)(r1)
 128         ld      r15,STK_REG(R15)(r1)
 129         ld      r16,STK_REG(R16)(r1)
 130         addi    r1,r1,STACKFRAMESIZE
 131
 132         andi.   r4,r4,63
 133
 134 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
 135         srdi.   r6,r4,3
 136         beq     .Lcsum_tail_word
 137
 138         mtctr   r6
 139 3:
 140         ld      r6,0(r3)
 141         addi    r3,r3,8
 142         adde    r0,r0,r6
 143         bdnz    3b
 144
 145         andi.   r4,r4,7
 146
 147 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
 148         srdi.   r6,r4,2
 149         beq     .Lcsum_tail_halfword
 150
 151         lwz     r6,0(r3)
 152         addi    r3,r3,4
 153         adde    r0,r0,r6
 154         subi    r4,r4,4
 155
 156 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
 157         srdi.   r6,r4,1
 158         beq     .Lcsum_tail_byte
 159
 160         lhz     r6,0(r3)
 161         addi    r3,r3,2
 162         adde    r0,r0,r6
 163         subi    r4,r4,2
 164
 165 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
 166         andi.   r6,r4,1
 167         beq     .Lcsum_finish
 168
             /*
              * The last byte is the first byte in memory of a 16-bit word
              * whose second byte is an implicit zero.  On big-endian that
              * makes it the high-order byte, so shift it up; on little-endian
              * it is the low-order byte and is added as loaded.
              */
 169         lbz     r6,0(r3)
     #ifdef __LITTLE_ENDIAN__
             adde    r0,r0,r6
     #else
 170         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 171         adde    r0,r0,r9
     #endif
 172
 173 .Lcsum_finish:
 174         addze   r0,r0                   /* add in final carry */
 175         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
             /*
              * r4 is r0 with its halves swapped, so the high word of r4 + r0
              * is hi + lo plus the carry out of the low-word addition.
              */
 176         add     r3,r4,r0
 177         srdi    r3,r3,32
 178         blr
 179
 180
 181         .macro srcnr
             /*
              * Used as a prefix on the same line as a load from the source
              * buffer issued while no temporary stack frame is live.  Label
              * 100 marks that instruction; the __ex_table entry pairs its
              * address with .Lsrc_error_nr, which reports the fault without
              * restoring r14-r16 ("nr" = no restore).  The table itself is
              * presumably consumed by the kernel's fault handling, which is
              * outside this file.
              */
 182 100:
 183         .section __ex_table,"a"
 184         .align 3
 185         .llong 100b,.Lsrc_error_nr
 186         .previous
 187         .endm
 188
 189         .macro source
             /*
              * Used as a prefix on the same line as a load from the source
              * buffer issued inside the 64-byte unrolled section, i.e. while
              * the temporary stack frame holding r14-r16 is live.  Label 150
              * marks that instruction; the __ex_table entry pairs its address
              * with .Lsrc_error, which restores r14-r16 and pops the frame
              * before reporting the fault.
              */
 190 150:
 191         .section __ex_table,"a"
 192         .align 3
 193         .llong 150b,.Lsrc_error
 194         .previous
 195         .endm
 196
 197         .macro dstnr
             /*
              * Used as a prefix on the same line as a store to the destination
              * buffer issued while no temporary stack frame is live.  Label
              * 200 marks that instruction; the __ex_table entry pairs its
              * address with .Ldest_error_nr, which reports the fault without
              * restoring r14-r16 ("nr" = no restore).
              */
 198 200:
 199         .section __ex_table,"a"
 200         .align 3
 201         .llong 200b,.Ldest_error_nr
 202         .previous
 203         .endm
 204
 205         .macro dest
             /*
              * Used as a prefix on the same line as a store to the destination
              * buffer issued inside the 64-byte unrolled section, i.e. while
              * the temporary stack frame holding r14-r16 is live.  Label 250
              * marks that instruction; the __ex_table entry pairs its address
              * with .Ldest_error, which restores r14-r16 and pops the frame
              * before reporting the fault.
              */
 206 250:
 207         .section __ex_table,"a"
 208         .align 3
 209         .llong 250b,.Ldest_error
 210         .previous
 211         .endm
 212
 213 /*
 214  * Computes the checksum of a memory block at src, length len,
 215  * and adds in "sum" (32-bit), while copying the block to dst.
 216  * If an access exception occurs on src or dst, it stores -EFAULT
 217  * to *src_err or *dst_err respectively. The caller must take any action
 218  * required in this case (zeroing memory, recalculating partial checksum etc).
 219  *
 220  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
      *
      * On success the 32-bit folded sum is returned in r3.  On a fault the
      * fixup code returns straight to the caller without setting r3, and the
      * -EFAULT store is skipped when the corresponding error pointer is NULL.
      *
      * The running sum lives in r0 as a 64-bit value and carries are chained
      * through XER[CA] by adde, so no instruction that alters CA may be
      * placed between the adde's.  r7 and r8 are left untouched so the fixup
      * code can still use them.
      *
      * Clobbers r0, r4-r6, r9-r12, ctr, cr0 and XER[CA].  r14-r16 are saved
      * in a temporary stack frame around the 64-byte unrolled loop; accesses
      * made while that frame is live use the source/dest fixups, all others
      * use srcnr/dstnr.
 221  */
 222 _GLOBAL(csum_partial_copy_generic)
 223         addic   r0,r6,0                 /* clear carry */
 224
 225         srdi.   r6,r5,3                 /* less than 8 bytes? */
 226         beq     .Lcopy_tail_word
 227
 228         /*
 229          * If only halfword aligned, align to a double word. Since odd
 230          * aligned addresses should be rare and they would require more
 231          * work to calculate the correct checksum, we ignore that case
 232          * and take the potential slowdown of unaligned loads.
 233          *
 234          * If the source and destination are relatively unaligned we only
 235          * align the source. This keeps things simple.
 236          */
 237         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
 238         beq     .Lcopy_aligned
 239
             /*
              * r6 is 1..3 here, so 4 - r6 halfwords bring r3 up to the next
              * 8-byte boundary.  len >= 8 was checked above and at most 6
              * bytes are consumed, so r5 cannot go negative.
              */
 240         li      r9,4
 241         sub     r6,r9,r6
 242         mtctr   r6
 243
 244 1:
 245 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
 246         subi    r5,r5,2
 247         addi    r3,r3,2
 248         adde    r0,r0,r6
 249 dstnr;  sth     r6,0(r4)
 250         addi    r4,r4,2
 251         bdnz    1b
 252
 253 .Lcopy_aligned:
 254         /*
 255          * We unroll the loop such that each iteration is 64 bytes with an
 256          * entry and exit limb of 64 bytes, meaning a minimum size of
 257          * 128 bytes.
 258          */
 259         srdi.   r6,r5,7
 260         beq     .Lcopy_tail_doublewords         /* len < 128 */
 261
             /* ctr = (len / 64) - 1: the last 64-byte chunk is the exit limb */
 262         srdi    r6,r5,6
 263         subi    r6,r6,1
 264         mtctr   r6
 265
             /* From here until the frame is popped, faults must use source/dest */
 266         stdu    r1,-STACKFRAMESIZE(r1)
 267         std     r14,STK_REG(R14)(r1)
 268         std     r15,STK_REG(R15)(r1)
 269         std     r16,STK_REG(R16)(r1)
 270
             /* Entry limb: preload the first four doublewords of the chunk */
 271 source; ld      r6,0(r3)
 272 source; ld      r9,8(r3)
 273
 274 source; ld      r10,16(r3)
 275 source; ld      r11,24(r3)
 276
 277         /*
 278          * On POWER6 and POWER7 back to back addes take 2 cycles because of
 279          * the XER dependency. This means the fastest this loop can go is
 280          * 16 cycles per iteration. The scheduling of the loop below has
 281          * been shown to hit this on both POWER6 and POWER7.
 282          */
 283         .align 5
 284 2:
 285         adde    r0,r0,r6
 286 source; ld      r12,32(r3)
 287 source; ld      r14,40(r3)
 288
 289         adde    r0,r0,r9
 290 source; ld      r15,48(r3)
 291 source; ld      r16,56(r3)
 292         addi    r3,r3,64
 293
 294         adde    r0,r0,r10
 295 dest;   std     r6,0(r4)
 296 dest;   std     r9,8(r4)
 297
 298         adde    r0,r0,r11
 299 dest;   std     r10,16(r4)
 300 dest;   std     r11,24(r4)
 301
 302         adde    r0,r0,r12
 303 dest;   std     r12,32(r4)
 304 dest;   std     r14,40(r4)
 305
 306         adde    r0,r0,r14
 307 dest;   std     r15,48(r4)
 308 dest;   std     r16,56(r4)
 309         addi    r4,r4,64
 310
 311         adde    r0,r0,r15
 312 source; ld      r6,0(r3)
 313 source; ld      r9,8(r3)
 314
 315         adde    r0,r0,r16
 316 source; ld      r10,16(r3)
 317 source; ld      r11,24(r3)
 318         bdnz    2b
 319
 320
             /* Exit limb: finish the final 64-byte chunk, nothing left to preload */
 321         adde    r0,r0,r6
 322 source; ld      r12,32(r3)
 323 source; ld      r14,40(r3)
 324
 325         adde    r0,r0,r9
 326 source; ld      r15,48(r3)
 327 source; ld      r16,56(r3)
 328         addi    r3,r3,64
 329
 330         adde    r0,r0,r10
 331 dest;   std     r6,0(r4)
 332 dest;   std     r9,8(r4)
 333
 334         adde    r0,r0,r11
 335 dest;   std     r10,16(r4)
 336 dest;   std     r11,24(r4)
 337
 338         adde    r0,r0,r12
 339 dest;   std     r12,32(r4)
 340 dest;   std     r14,40(r4)
 341
 342         adde    r0,r0,r14
 343 dest;   std     r15,48(r4)
 344 dest;   std     r16,56(r4)
 345         addi    r4,r4,64
 346
 347         adde    r0,r0,r15
 348         adde    r0,r0,r16
 349
 350         ld      r14,STK_REG(R14)(r1)
 351         ld      r15,STK_REG(R15)(r1)
 352         ld      r16,STK_REG(R16)(r1)
 353         addi    r1,r1,STACKFRAMESIZE
 354
 355         andi.   r5,r5,63
 356
 357 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
 358         srdi.   r6,r5,3
 359         beq     .Lcopy_tail_word
 360
 361         mtctr   r6
 362 3:
 363 srcnr;  ld      r6,0(r3)
 364         addi    r3,r3,8
 365         adde    r0,r0,r6
 366 dstnr;  std     r6,0(r4)
 367         addi    r4,r4,8
 368         bdnz    3b
 369
 370         andi.   r5,r5,7
 371
 372 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
 373         srdi.   r6,r5,2
 374         beq     .Lcopy_tail_halfword
 375
 376 srcnr;  lwz     r6,0(r3)
 377         addi    r3,r3,4
 378         adde    r0,r0,r6
 379 dstnr;  stw     r6,0(r4)
 380         addi    r4,r4,4
 381         subi    r5,r5,4
 382
 383 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
 384         srdi.   r6,r5,1
 385         beq     .Lcopy_tail_byte
 386
 387 srcnr;  lhz     r6,0(r3)
 388         addi    r3,r3,2
 389         adde    r0,r0,r6
 390 dstnr;  sth     r6,0(r4)
 391         addi    r4,r4,2
 392         subi    r5,r5,2
 393
 394 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
 395         andi.   r6,r5,1
 396         beq     .Lcopy_finish
 397
             /*
              * The last byte is the first byte in memory of a 16-bit word
              * whose second byte is an implicit zero.  On big-endian that
              * makes it the high-order byte, so shift it up; on little-endian
              * it is the low-order byte and is added as loaded.  The byte
              * stored to dst is the unshifted value in either case.
              */
 398 srcnr;  lbz     r6,0(r3)
     #ifdef __LITTLE_ENDIAN__
             adde    r0,r0,r6
     #else
 399         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 400         adde    r0,r0,r9
     #endif
 401 dstnr;  stb     r6,0(r4)
 402
 403 .Lcopy_finish:
 404         addze   r0,r0                   /* add in final carry */
 405         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
             /*
              * r4 is r0 with its halves swapped, so the high word of r4 + r0
              * is hi + lo plus the carry out of the low-word addition.
              */
 406         add     r3,r4,r0
 407         srdi    r3,r3,32
 408         blr
 409
             /*
              * Fixup for a faulting source load inside the unrolled section:
              * undo the temporary frame, then fall through to the common
              * source-fault reporting.
              */
 410 .Lsrc_error:
 411         ld      r14,STK_REG(R14)(r1)
 412         ld      r15,STK_REG(R15)(r1)
 413         ld      r16,STK_REG(R16)(r1)
 414         addi    r1,r1,STACKFRAMESIZE
 415 .Lsrc_error_nr:
 416         cmpdi   0,r7,0
 417         beqlr                           /* src_err == NULL: nothing to report */
 418         li      r6,-EFAULT
 419         stw     r6,0(r7)
 420         blr
 421
             /*
              * Fixup for a faulting destination store inside the unrolled
              * section: undo the temporary frame, then fall through to the
              * common destination-fault reporting.
              */
 422 .Ldest_error:
 423         ld      r14,STK_REG(R14)(r1)
 424         ld      r15,STK_REG(R15)(r1)
 425         ld      r16,STK_REG(R16)(r1)
 426         addi    r1,r1,STACKFRAMESIZE
 427 .Ldest_error_nr:
 428         cmpdi   0,r8,0
 429         beqlr                           /* dst_err == NULL: nothing to report */
 430         li      r6,-EFAULT
 431         stw     r6,0(r8)
 432         blr