arch/powerpc/lib/checksum_64.S

   1 /*
   2  * This file contains assembly-language implementations
   3  * of IP-style 1's complement checksum routines.
   4  *
   5  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6  *
   7  *  This program is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version
  10  *  2 of the License, or (at your option) any later version.
  11  *
  12  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
  13  */
  14
  15 #include <linux/sys.h>
  16 #include <asm/processor.h>
  17 #include <asm/errno.h>
  18 #include <asm/ppc_asm.h>
  19
  20 /*
  21  * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
  22  * len is in words and is always >= 5.
  23  *
  24  * In practice len == 5, but this is not guaranteed.  So this code does not
  25  * attempt to use doubleword instructions.
      *
      * The words are accumulated in r0 with carry, the accumulator is folded
      * from 64 to 32 and then to 16 bits, and the ones' complement of that
      * 16-bit value is returned in the low 16 bits of r3.
      *
      * Registers written: r0, r3, r4, r5, CTR, CR0 and the carry bit.
      *
      * NOTE: if len <= 2 the early "blelr-" returns with r3 still holding the
      * advanced buffer pointer rather than a checksum; the routine relies on
      * the len >= 5 contract stated above.
  26  */
  27 _GLOBAL(ip_fast_csum)
  28         lwz     r0,0(r3)        /* r0 = first word */
  29         lwzu    r5,4(r3)        /* r5 = second word; r3 advances to buf + 4 */
  30         addic.  r4,r4,-2        /* r4 = words still to add; CR0 reflects len - 2 */
  31         addc    r0,r0,r5        /* start the sum; this add (re)defines the carry bit */
  32         mtctr   r4              /* loop once per remaining word */
  33         blelr-                  /* return if len - 2 <= 0 (hinted as unlikely) */
  34 1:      lwzu    r4,4(r3)        /* next word, pre-incrementing r3 */
  35         adde    r0,r0,r4        /* accumulate with carry-in */
  36         bdnz    1b
  37         addze   r0,r0           /* add in final carry */
  38         rldicl  r4,r0,32,0      /* r4 = r0 with its two 32-bit halves swapped */
  39         add     r0,r0,r4        /* upper half = hi + lo + carry out of the lower half */
  40         srdi    r0,r0,32        /* keep that 32-bit folded sum */
  41         rlwinm  r3,r0,16,0,31   /* same trick at 16 bits: swap the two halfwords */
  42         add     r3,r0,r3        /* upper halfword of the low word = folded 16-bit sum */
  43         not     r3,r3           /* ones' complement */
  44         srwi    r3,r3,16        /* move the result down into the low 16 bits */
  45         blr
  46
  47 /*
  48  * Computes the checksum of a memory block at buff, length len,
  49  * and adds in "sum" (32-bit).
  50  *
  51  * csum_partial(r3=buff, r4=len, r5=sum)
      *
      * The running sum lives in r0 and is built with a chain of adde
      * instructions, so the carry bit is live from the addic at entry until
      * the addze at .Lcsum_finish; nothing in between may alter it.
      *
      * Returns in r3 the 64-bit accumulator folded to 32 bits (the value is
      * not complemented here).
      *
      * Registers written: r0, r3, r4, r6, r7, r9-r12, CTR, CR0 and the carry
      * bit.  r14-r16 are used only inside the unrolled loop and are saved to
      * and restored from a temporary stack frame around it.
  52  */
  53 _GLOBAL(csum_partial)
  54         addic   r0,r5,0                 /* r0 = sum; adding 0 also clears carry */
  55
  56         srdi.   r6,r4,3                 /* less than 8 bytes? */
  57         beq     .Lcsum_tail_word
  58
  59         /*
  60          * If only halfword aligned, align to a double word. Since odd
  61          * aligned addresses should be rare and they would require more
  62          * work to calculate the correct checksum, we ignore that case
  63          * and take the potential slowdown of unaligned loads.
  64          */
  65         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  66         beq     .Lcsum_aligned          /* address bits 1 and 2 both clear: no fix-up */
  67
  68         li      r7,4
  69         sub     r6,r7,r6                /* r6 = 4 - r6 halfwords up to the next doubleword */
  70         mtctr   r6
  71
  72 1:
  73         lhz     r6,0(r3)                /* align to doubleword */
  74         subi    r4,r4,2                 /* len -= 2 */
  75         addi    r3,r3,2                 /* buff += 2 */
  76         adde    r0,r0,r6
  77         bdnz    1b
  78
  79 .Lcsum_aligned:
  80         /*
  81          * We unroll the loop such that each iteration is 64 bytes with an
  82          * entry and exit limb of 64 bytes, meaning a minimum size of
  83          * 128 bytes.
  84          */
  85         srdi.   r6,r4,7
  86         beq     .Lcsum_tail_doublewords         /* len < 128 */
  87
  88         srdi    r6,r4,6                 /* r6 = number of whole 64-byte blocks */
  89         subi    r6,r6,1                 /* the last block is summed by the exit limb */
  90         mtctr   r6
  91
  92         stdu    r1,-STACKFRAMESIZE(r1)  /* open a frame to hold r14-r16 */
  93         std     r14,STK_REG(R14)(r1)
  94         std     r15,STK_REG(R15)(r1)
  95         std     r16,STK_REG(R16)(r1)
  96
  97         ld      r6,0(r3)                /* entry limb: preload the first 32 bytes */
  98         ld      r9,8(r3)
  99
 100         ld      r10,16(r3)
 101         ld      r11,24(r3)
 102
 103         /*
 104          * On POWER6 and POWER7 back to back addes take 2 cycles because of
 105          * the XER dependency. This means the fastest this loop can go is
 106          * 16 cycles per iteration. The scheduling of the loop below has
 107          * been shown to hit this on both POWER6 and POWER7.
 108          */
 109         .align 5
 110 2:
 111         adde    r0,r0,r6
 112         ld      r12,32(r3)              /* load the second 32 bytes of this block */
 113         ld      r14,40(r3)
 114
 115         adde    r0,r0,r9
 116         ld      r15,48(r3)
 117         ld      r16,56(r3)
 118         addi    r3,r3,64                /* r3 now points at the next block */
 119
 120         adde    r0,r0,r10
 121
 122         adde    r0,r0,r11
 123
 124         adde    r0,r0,r12
 125
 126         adde    r0,r0,r14
 127
 128         adde    r0,r0,r15
 129         ld      r6,0(r3)                /* preload the first 32 bytes of the next block */
 130         ld      r9,8(r3)
 131
 132         adde    r0,r0,r16
 133         ld      r10,16(r3)
 134         ld      r11,24(r3)
 135         bdnz    2b
 136
 137
 138         adde    r0,r0,r6                /* exit limb: last block, nothing further preloaded */
 139         ld      r12,32(r3)
 140         ld      r14,40(r3)
 141
 142         adde    r0,r0,r9
 143         ld      r15,48(r3)
 144         ld      r16,56(r3)
 145         addi    r3,r3,64
 146
 147         adde    r0,r0,r10
 148         adde    r0,r0,r11
 149         adde    r0,r0,r12
 150         adde    r0,r0,r14
 151         adde    r0,r0,r15
 152         adde    r0,r0,r16
 153
 154         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame */
 155         ld      r15,STK_REG(R15)(r1)
 156         ld      r16,STK_REG(R16)(r1)
 157         addi    r1,r1,STACKFRAMESIZE
 158
 159         andi.   r4,r4,63                /* r4 = bytes left after the 64-byte blocks */
 160
 161 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
 162         srdi.   r6,r4,3                 /* r6 = whole doublewords remaining */
 163         beq     .Lcsum_tail_word
 164
 165         mtctr   r6
 166 3:
 167         ld      r6,0(r3)
 168         addi    r3,r3,8
 169         adde    r0,r0,r6
 170         bdnz    3b
 171
 172         andi.   r4,r4,7                 /* r4 = bytes left after the doublewords */
 173
 174 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
 175         srdi.   r6,r4,2                 /* at least 4 bytes left? */
 176         beq     .Lcsum_tail_halfword
 177
 178         lwz     r6,0(r3)
 179         addi    r3,r3,4
 180         adde    r0,r0,r6
 181         subi    r4,r4,4
 182
 183 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
 184         srdi.   r6,r4,1                 /* at least 2 bytes left? */
 185         beq     .Lcsum_tail_byte
 186
 187         lhz     r6,0(r3)
 188         addi    r3,r3,2
 189         adde    r0,r0,r6
 190         subi    r4,r4,2
 191
 192 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
 193         andi.   r6,r4,1
 194         beq     .Lcsum_finish
 195
 196         lbz     r6,0(r3)
 197         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 198         adde    r0,r0,r9
 199
 200 .Lcsum_finish:
 201         addze   r0,r0                   /* add in final carry */
 202         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 203         add     r3,r4,r0                /* upper half = hi + lo + carry out of the lower half */
 204         srdi    r3,r3,32                /* return the 32-bit folded sum in r3 */
 205         blr
 206
 207
 208         .macro srcnr                    /* tag a source access made while no stack frame is live */
 209 100:                                    /* address of the instruction that follows the macro use */
 210         .section __ex_table,"a"
 211         .align 3
 212         .llong 100b,.Lsrc_error_nr      /* table entry: tagged address, handler that skips the frame unwind */
 213         .previous                       /* return to the section the macro was used in */
 214         .endm
 215
 216         .macro source                   /* tag a source access made while r14-r16 are saved on the stack */
 217 150:                                    /* address of the instruction that follows the macro use */
 218         .section __ex_table,"a"
 219         .align 3
 220         .llong 150b,.Lsrc_error         /* table entry: tagged address, handler that first restores r14-r16 */
 221         .previous                       /* return to the section the macro was used in */
 222         .endm
 223
 224         .macro dstnr                    /* tag a destination access made while no stack frame is live */
 225 200:                                    /* address of the instruction that follows the macro use */
 226         .section __ex_table,"a"
 227         .align 3
 228         .llong 200b,.Ldest_error_nr     /* table entry: tagged address, handler that skips the frame unwind */
 229         .previous                       /* return to the section the macro was used in */
 230         .endm
 231
 232         .macro dest                     /* tag a destination access made while r14-r16 are saved on the stack */
 233 250:                                    /* address of the instruction that follows the macro use */
 234         .section __ex_table,"a"
 235         .align 3
 236         .llong 250b,.Ldest_error        /* table entry: tagged address, handler that first restores r14-r16 */
 237         .previous                       /* return to the section the macro was used in */
 238         .endm
 239
 240 /*
 241  * Computes the checksum of a memory block at src, length len,
 242  * and adds in "sum" (32-bit), while copying the block to dst.
 243  * If an access exception occurs on src or dst, it stores -EFAULT
 244  * to *src_err or *dst_err respectively. The caller must take any action
 245  * required in this case (zeroing memory, recalculating partial checksum etc).
 246  *
 247  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
      *
      * The running sum lives in r0 and is built with a chain of adde
      * instructions, so the carry bit is live from the addic at entry until
      * the addze at .Lcopy_finish.
      *
      * On the normal path r3 returns the accumulator folded to 32 bits (not
      * complemented).  The fault handlers at the end do not set r3, so after
      * a fault the return value is not meaningful and the caller has to look
      * at *src_err / *dst_err.  A NULL src_err or dst_err is tolerated: the
      * handler then returns without storing anything.
      *
      * Every load from src and store to dst is tagged with one of the
      * srcnr/source/dstnr/dest macros.  The "source"/"dest" forms are used
      * only between the stdu that opens the temporary stack frame and the
      * addi that closes it, because their handlers restore r14-r16 and pop
      * that frame; everywhere else the "nr" forms are used.
 248  */
 249 _GLOBAL(csum_partial_copy_generic)
 250         addic   r0,r6,0                 /* r0 = sum; adding 0 also clears carry */
 251
 252         srdi.   r6,r5,3                 /* less than 8 bytes? */
 253         beq     .Lcopy_tail_word
 254
 255         /*
 256          * If only halfword aligned, align to a double word. Since odd
 257          * aligned addresses should be rare and they would require more
 258          * work to calculate the correct checksum, we ignore that case
 259          * and take the potential slowdown of unaligned loads.
 260          *
 261          * If the source and destination are relatively unaligned we only
 262          * align the source. This keeps things simple.
 263          */
 264         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
 265         beq     .Lcopy_aligned          /* address bits 1 and 2 both clear: no fix-up */
 266
 267         li      r9,4
 268         sub     r6,r9,r6                /* r6 = 4 - r6 halfwords up to the next doubleword */
 269         mtctr   r6
 270
 271 1:
 272 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
 273         subi    r5,r5,2                 /* len -= 2 */
 274         addi    r3,r3,2                 /* src += 2 */
 275         adde    r0,r0,r6
 276 dstnr;  sth     r6,0(r4)                /* copy the halfword just summed */
 277         addi    r4,r4,2                 /* dst += 2 */
 278         bdnz    1b
 279
 280 .Lcopy_aligned:
 281         /*
 282          * We unroll the loop such that each iteration is 64 bytes with an
 283          * entry and exit limb of 64 bytes, meaning a minimum size of
 284          * 128 bytes.
 285          */
 286         srdi.   r6,r5,7
 287         beq     .Lcopy_tail_doublewords         /* len < 128 */
 288
 289         srdi    r6,r5,6                 /* r6 = number of whole 64-byte blocks */
 290         subi    r6,r6,1                 /* the last block is handled by the exit limb */
 291         mtctr   r6
 292
 293         stdu    r1,-STACKFRAMESIZE(r1)  /* open a frame to hold r14-r16 */
 294         std     r14,STK_REG(R14)(r1)
 295         std     r15,STK_REG(R15)(r1)
 296         std     r16,STK_REG(R16)(r1)
 297
 298 source; ld      r6,0(r3)                /* entry limb: preload the first 32 bytes */
 299 source; ld      r9,8(r3)
 300
 301 source; ld      r10,16(r3)
 302 source; ld      r11,24(r3)
 303
 304         /*
 305          * On POWER6 and POWER7 back to back addes take 2 cycles because of
 306          * the XER dependency. This means the fastest this loop can go is
 307          * 16 cycles per iteration. The scheduling of the loop below has
 308          * been shown to hit this on both POWER6 and POWER7.
 309          */
 310         .align 5
 311 2:
 312         adde    r0,r0,r6
 313 source; ld      r12,32(r3)              /* load the second 32 bytes of this block */
 314 source; ld      r14,40(r3)
 315
 316         adde    r0,r0,r9
 317 source; ld      r15,48(r3)
 318 source; ld      r16,56(r3)
 319         addi    r3,r3,64                /* src now points at the next block */
 320
 321         adde    r0,r0,r10
 322 dest;   std     r6,0(r4)                /* store the block while the adde chain drains */
 323 dest;   std     r9,8(r4)
 324
 325         adde    r0,r0,r11
 326 dest;   std     r10,16(r4)
 327 dest;   std     r11,24(r4)
 328
 329         adde    r0,r0,r12
 330 dest;   std     r12,32(r4)
 331 dest;   std     r14,40(r4)
 332
 333         adde    r0,r0,r14
 334 dest;   std     r15,48(r4)
 335 dest;   std     r16,56(r4)
 336         addi    r4,r4,64                /* dst += 64 */
 337
 338         adde    r0,r0,r15
 339 source; ld      r6,0(r3)                /* preload the first 32 bytes of the next block */
 340 source; ld      r9,8(r3)
 341
 342         adde    r0,r0,r16
 343 source; ld      r10,16(r3)
 344 source; ld      r11,24(r3)
 345         bdnz    2b
 346
 347
 348         adde    r0,r0,r6                /* exit limb: last block, nothing further preloaded */
 349 source; ld      r12,32(r3)
 350 source; ld      r14,40(r3)
 351
 352         adde    r0,r0,r9
 353 source; ld      r15,48(r3)
 354 source; ld      r16,56(r3)
 355         addi    r3,r3,64
 356
 357         adde    r0,r0,r10
 358 dest;   std     r6,0(r4)
 359 dest;   std     r9,8(r4)
 360
 361         adde    r0,r0,r11
 362 dest;   std     r10,16(r4)
 363 dest;   std     r11,24(r4)
 364
 365         adde    r0,r0,r12
 366 dest;   std     r12,32(r4)
 367 dest;   std     r14,40(r4)
 368
 369         adde    r0,r0,r14
 370 dest;   std     r15,48(r4)
 371 dest;   std     r16,56(r4)
 372         addi    r4,r4,64
 373
 374         adde    r0,r0,r15
 375         adde    r0,r0,r16
 376
 377         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame */
 378         ld      r15,STK_REG(R15)(r1)
 379         ld      r16,STK_REG(R16)(r1)
 380         addi    r1,r1,STACKFRAMESIZE
 381
 382         andi.   r5,r5,63                /* r5 = bytes left after the 64-byte blocks */
 383
 384 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
 385         srdi.   r6,r5,3                 /* r6 = whole doublewords remaining */
 386         beq     .Lcopy_tail_word
 387
 388         mtctr   r6
 389 3:
 390 srcnr;  ld      r6,0(r3)
 391         addi    r3,r3,8
 392         adde    r0,r0,r6
 393 dstnr;  std     r6,0(r4)
 394         addi    r4,r4,8
 395         bdnz    3b
 396
 397         andi.   r5,r5,7                 /* r5 = bytes left after the doublewords */
 398
 399 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
 400         srdi.   r6,r5,2                 /* at least 4 bytes left? */
 401         beq     .Lcopy_tail_halfword
 402
 403 srcnr;  lwz     r6,0(r3)
 404         addi    r3,r3,4
 405         adde    r0,r0,r6
 406 dstnr;  stw     r6,0(r4)
 407         addi    r4,r4,4
 408         subi    r5,r5,4
 409
 410 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
 411         srdi.   r6,r5,1                 /* at least 2 bytes left? */
 412         beq     .Lcopy_tail_byte
 413
 414 srcnr;  lhz     r6,0(r3)
 415         addi    r3,r3,2
 416         adde    r0,r0,r6
 417 dstnr;  sth     r6,0(r4)
 418         addi    r4,r4,2
 419         subi    r5,r5,2
 420
 421 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
 422         andi.   r6,r5,1
 423         beq     .Lcopy_finish
 424
 425 srcnr;  lbz     r6,0(r3)
 426         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 427         adde    r0,r0,r9                /* sum the padded value ... */
 428 dstnr;  stb     r6,0(r4)                /* ... but copy the unshifted byte */
 429
 430 .Lcopy_finish:
 431         addze   r0,r0                   /* add in final carry */
 432         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 433         add     r3,r4,r0                /* upper half = hi + lo + carry out of the lower half */
 434         srdi    r3,r3,32                /* return the 32-bit folded sum in r3 */
 435         blr
 436
 437 .Lsrc_error:                            /* source fault while the stack frame was live */
 438         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame first */
 439         ld      r15,STK_REG(R15)(r1)
 440         ld      r16,STK_REG(R16)(r1)
 441         addi    r1,r1,STACKFRAMESIZE
 442 .Lsrc_error_nr:                         /* source fault with no frame to unwind */
 443         cmpdi   0,r7,0                  /* src_err == NULL? */
 444         beqlr                           /* then return without reporting */
 445         li      r6,-EFAULT
 446         stw     r6,0(r7)                /* *src_err = -EFAULT (32-bit store) */
 447         blr
 448
 449 .Ldest_error:                           /* destination fault while the stack frame was live */
 450         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame first */
 451         ld      r15,STK_REG(R15)(r1)
 452         ld      r16,STK_REG(R16)(r1)
 453         addi    r1,r1,STACKFRAMESIZE
 454 .Ldest_error_nr:                        /* destination fault with no frame to unwind */
 455         cmpdi   0,r8,0                  /* dst_err == NULL? */
 456         beqlr                           /* then return without reporting */
 457         li      r6,-EFAULT
 458         stw     r6,0(r8)                /* *dst_err = -EFAULT (32-bit store) */
 459         blr