arch/powerpc/lib/checksum_64.S

   1 /*
   2  * This file contains assembly-language implementations
   3  * of IP-style 1's complement checksum routines.
   4  *
   5  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6  *
   7  *  This program is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version
  10  *  2 of the License, or (at your option) any later version.
  11  *
  12  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
  13  */
  14
  15 #include <linux/sys.h>
  16 #include <asm/processor.h>
  17 #include <asm/errno.h>
  18 #include <asm/ppc_asm.h>
  19
  20 /*
  21  * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
  22  * len is in words and is always >= 5.
  23  *
  24  * In practice len == 5, but this is not guaranteed.  So this code does not
  25  * attempt to use doubleword instructions.
      *
      * The 32-bit words are zero-extended by lwz and accumulated in the 64-bit
      * register r0 with add-with-carry.  The total is then folded 64 -> 32 -> 16
      * bits and complemented.
      *
      * Returns in r3 the complemented 16-bit checksum (upper bits zero).
      * Uses r0, r4, r5, CTR, CR0 and the carry bit as scratch.
      *
      * NOTE(review): if len - 2 <= 0 the blelr- below returns early and r3 still
      * holds the advanced buffer pointer, not a checksum.  The len >= 5 contract
      * above means this path is not expected to be taken.
  26  */
  27 _GLOBAL(ip_fast_csum)
  28         lwz     r0,0(r3)        /* r0 = word 0 */
  29         lwzu    r5,4(r3)        /* r5 = word 1, r3 advanced by 4 */
  30         addic.  r4,r4,-2        /* r4 = words left after the first two, CR0 = r4 vs 0 */
  31         addc    r0,r0,r5        /* start the sum, this (re)sets the carry for the adde chain */
  32         mtctr   r4
  33         blelr-                  /* nothing left to loop over (uses CR0 from addic.) */
  34 1:      lwzu    r4,4(r3)        /* next word, r4 is free now that CTR holds the count */
  35         adde    r0,r0,r4
  36         bdnz    1b
  37         addze   r0,r0           /* add in final carry */
  38         rldicl  r4,r0,32,0      /* fold two 32-bit halves together */
  39         add     r0,r0,r4        /* upper word = hi + lo + carry out of the lower word */
  40         srdi    r0,r0,32        /* r0 = 32-bit folded sum */
  41         rlwinm  r3,r0,16,0,31   /* fold two halves together */
  42         add     r3,r0,r3        /* upper halfword of the low word = hi16 + lo16 + carry */
  43         not     r3,r3           /* complement the sum */
  44         srwi    r3,r3,16        /* keep only that upper halfword as the result */
  45         blr
  46
  47 /*
  48  * Compute checksum of TCP or UDP pseudo-header:
  49  *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
  50  * No real gain trying to do this specially for 64 bit, but
  51  * the 32 bit addition may spill into the upper bits of
  52  * the doubleword so we still must fold it down from 64.
      *
      * Returns in r3 the complemented 16-bit checksum (upper bits zero).
      * Uses r0, r4, r5 and the carry bit as scratch.
      *
      * NOTE(review): there is no addze after the last adde, so a carry out of
      * the full 64-bit sum would be dropped.  Presumably every argument is a
      * zero-extended 32-bit (or narrower) value so that cannot happen -- confirm
      * against the C prototype and callers.
  53  */
  54 _GLOBAL(csum_tcpudp_magic)
  55         rlwimi  r5,r6,16,0,15   /* put proto in upper half of len */
  56         addc    r0,r3,r4        /* add 4 32-bit words together */
  57         adde    r0,r0,r5        /* + (proto << 16 | len) */
  58         adde    r0,r0,r7        /* + incoming sum */
  59         rldicl  r4,r0,32,0      /* fold 64 bit value */
  60         add     r0,r4,r0        /* upper word = hi + lo + carry out of the lower word */
  61         srdi    r0,r0,32        /* r0 = 32-bit folded sum */
  62         rlwinm  r3,r0,16,0,31   /* fold two halves together */
  63         add     r3,r0,r3        /* upper halfword of the low word = hi16 + lo16 + carry */
  64         not     r3,r3           /* complement the sum */
  65         srwi    r3,r3,16        /* keep only that upper halfword as the result */
  66         blr
  67
  68 /*
  69  * Computes the checksum of a memory block at buff, length len,
  70  * and adds in "sum" (32-bit).
  71  *
  72  * csum_partial(r3=buff, r4=len, r5=sum)
      *
      * The block is summed into r0 with 64-bit add-with-carry (adde), widest
      * loads first: an optional halfword alignment prologue, an unrolled loop
      * that consumes 64 bytes per iteration, then doubleword, word, halfword
      * and byte tails.  The carry bit stays live from one adde to the next, so
      * only instructions that leave XER[CA] untouched sit between them.
      *
      * Returns in r3 the sum folded down to 32 bits.  It is neither complemented
      * nor folded to 16 bits here.
      *
      * r14-r16 are used as extra load targets by the unrolled loop.  They are
      * saved in a temporary stack frame and restored before the tails run.
      * STACKFRAMESIZE and STK_REG are defined outside this listing.
  73  */
  74 _GLOBAL(csum_partial)
  75         addic   r0,r5,0                 /* r0 = sum, and clear carry */
  76
  77         srdi.   r6,r4,3                 /* less than 8 bytes? */
  78         beq     .Lcsum_tail_word
  79
  80         /*
  81          * If only halfword aligned, align to a double word. Since odd
  82          * aligned addresses should be rare and they would require more
  83          * work to calculate the correct checksum, we ignore that case
  84          * and take the potential slowdown of unaligned loads.
  85          */
  86         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  87         beq     .Lcsum_aligned
  88
  89         li      r7,4
  90         sub     r6,r7,r6                /* 4 - r6 halfwords reach 8-byte alignment */
  91         mtctr   r6
  92
  93 1:
  94         lhz     r6,0(r3)                /* align to doubleword */
  95         subi    r4,r4,2
  96         addi    r3,r3,2
  97         adde    r0,r0,r6
  98         bdnz    1b
  99
 100 .Lcsum_aligned:
 101         /*
 102          * We unroll the loop such that each iteration is 64 bytes with an
 103          * entry and exit limb of 64 bytes, meaning a minimum size of
 104          * 128 bytes.
 105          */
 106         srdi.   r6,r4,7
 107         beq     .Lcsum_tail_doublewords         /* len < 128 */
 108
 109         srdi    r6,r4,6                 /* number of whole 64-byte blocks */
 110         subi    r6,r6,1                 /* the exit limb handles the last block */
 111         mtctr   r6
 112
 113         stdu    r1,-STACKFRAMESIZE(r1)  /* temporary frame to hold r14-r16 */
 114         std     r14,STK_REG(R14)(r1)
 115         std     r15,STK_REG(R15)(r1)
 116         std     r16,STK_REG(R16)(r1)
 117
 118         ld      r6,0(r3)                /* entry limb: preload the first 32 bytes */
 119         ld      r9,8(r3)
 120
 121         ld      r10,16(r3)
 122         ld      r11,24(r3)
 123
 124         /*
 125          * On POWER6 and POWER7 back to back addes take 2 cycles because of
 126          * the XER dependency. This means the fastest this loop can go is
 127          * 16 cycles per iteration. The scheduling of the loop below has
 128          * been shown to hit this on both POWER6 and POWER7.
 129          */
 130         .align 5
 131 2:
 132         adde    r0,r0,r6
 133         ld      r12,32(r3)              /* second 32 bytes of the current block */
 134         ld      r14,40(r3)
 135
 136         adde    r0,r0,r9
 137         ld      r15,48(r3)
 138         ld      r16,56(r3)
 139         addi    r3,r3,64                /* r3 now points at the next block */
 140
 141         adde    r0,r0,r10
 142
 143         adde    r0,r0,r11
 144
 145         adde    r0,r0,r12
 146
 147         adde    r0,r0,r14
 148
 149         adde    r0,r0,r15
 150         ld      r6,0(r3)                /* preload the first 32 bytes of the next block */
 151         ld      r9,8(r3)
 152
 153         adde    r0,r0,r16
 154         ld      r10,16(r3)
 155         ld      r11,24(r3)
 156         bdnz    2b
 157
 158
 159         adde    r0,r0,r6                /* exit limb: last block, nothing further is preloaded */
 160         ld      r12,32(r3)
 161         ld      r14,40(r3)
 162
 163         adde    r0,r0,r9
 164         ld      r15,48(r3)
 165         ld      r16,56(r3)
 166         addi    r3,r3,64
 167
 168         adde    r0,r0,r10
 169         adde    r0,r0,r11
 170         adde    r0,r0,r12
 171         adde    r0,r0,r14
 172         adde    r0,r0,r15
 173         adde    r0,r0,r16
 174
 175         ld      r14,STK_REG(R14)(r1)    /* restore saved registers and drop the frame */
 176         ld      r15,STK_REG(R15)(r1)
 177         ld      r16,STK_REG(R16)(r1)
 178         addi    r1,r1,STACKFRAMESIZE
 179
 180         andi.   r4,r4,63                /* bytes left after the 64-byte blocks */
 181
 182 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
 183         srdi.   r6,r4,3
 184         beq     .Lcsum_tail_word
 185
 186         mtctr   r6
 187 3:
 188         ld      r6,0(r3)
 189         addi    r3,r3,8
 190         adde    r0,r0,r6
 191         bdnz    3b
 192
 193         andi.   r4,r4,7                 /* bytes left after the doublewords */
 194
 195 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
 196         srdi.   r6,r4,2
 197         beq     .Lcsum_tail_halfword
 198
 199         lwz     r6,0(r3)
 200         addi    r3,r3,4
 201         adde    r0,r0,r6
 202         subi    r4,r4,4
 203
 204 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
 205         srdi.   r6,r4,1
 206         beq     .Lcsum_tail_byte
 207
 208         lhz     r6,0(r3)
 209         addi    r3,r3,2
 210         adde    r0,r0,r6
 211         subi    r4,r4,2
 212
 213 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
 214         andi.   r6,r4,1
 215         beq     .Lcsum_finish
 216
 217         lbz     r6,0(r3)
             /*
              * NOTE(review): the shift puts the byte where the first byte of a
              * big-endian halfword load would land.  Confirm before building
              * this code for a little-endian target.
              */
 218         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 219         adde    r0,r0,r9
 220
 221 .Lcsum_finish:
 222         addze   r0,r0                   /* add in final carry */
 223         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 224         add     r3,r4,r0                /* upper word = hi + lo + carry out of the lower word */
 225         srdi    r3,r3,32                /* return the 32-bit folded sum */
 226         blr
 227
 228
 229         .macro srcnr
             /*
              * Marks the instruction that follows as a source access made while
              * no temporary stack frame is live ("nr" = no restore).  Label 100 is
              * placed at the current address and the pair (100b, .Lsrc_error_nr)
              * is appended to the __ex_table section.
              */
 230 100:
 231         .section __ex_table,"a"
 232         .align 3
 233         .llong 100b,.Lsrc_error_nr
 234         .previous
 235         .endm
 236
 237         .macro source
             /*
              * Marks the instruction that follows as a source access made inside
              * the unrolled loop, where r14-r16 are saved in a temporary stack
              * frame.  Label 150 is placed at the current address and the pair
              * (150b, .Lsrc_error) is appended to the __ex_table section.
              */
 238 150:
 239         .section __ex_table,"a"
 240         .align 3
 241         .llong 150b,.Lsrc_error
 242         .previous
 243         .endm
 244
 245         .macro dstnr
             /*
              * Marks the instruction that follows as a destination access made
              * while no temporary stack frame is live ("nr" = no restore).  Label
              * 200 is placed at the current address and the pair
              * (200b, .Ldest_error_nr) is appended to the __ex_table section.
              */
 246 200:
 247         .section __ex_table,"a"
 248         .align 3
 249         .llong 200b,.Ldest_error_nr
 250         .previous
 251         .endm
 252
 253         .macro dest
             /*
              * Marks the instruction that follows as a destination access made
              * inside the unrolled loop, where r14-r16 are saved in a temporary
              * stack frame.  Label 250 is placed at the current address and the
              * pair (250b, .Ldest_error) is appended to the __ex_table section.
              */
 254 250:
 255         .section __ex_table,"a"
 256         .align 3
 257         .llong 250b,.Ldest_error
 258         .previous
 259         .endm
 260
 261 /*
 262  * Computes the checksum of a memory block at src, length len,
 263  * and adds in "sum" (32-bit), while copying the block to dst.
 264  * If an access exception occurs on src or dst, it stores -EFAULT
 265  * to *src_err or *dst_err respectively. The caller must take any action
 266  * required in this case (zeroing memory, recalculating partial checksum etc).
 267  *
 268  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
      *
      * Same summing scheme as csum_partial, with a store to dst paired with each
      * load from src.  Every load and store is tagged with one of the
      * srcnr/source/dstnr/dest macros.  The source and dest forms are used only
      * between the stdu that creates the temporary frame and the addi that drops
      * it, so their fixups restore r14-r16 and pop the frame.  The nr forms are
      * used everywhere else.
      *
      * r7 and r8 are left untouched so the fixup code can read them.
      *
      * On the normal path, returns in r3 the sum folded down to 32 bits, neither
      * complemented nor folded to 16 bits.  The fixup paths return without
      * writing a checksum to r3.
 269  */
 270 _GLOBAL(csum_partial_copy_generic)
 271         addic   r0,r6,0                 /* r0 = sum, and clear carry */
 272
 273         srdi.   r6,r5,3                 /* less than 8 bytes? */
 274         beq     .Lcopy_tail_word
 275
 276         /*
 277          * If only halfword aligned, align to a double word. Since odd
 278          * aligned addresses should be rare and they would require more
 279          * work to calculate the correct checksum, we ignore that case
 280          * and take the potential slowdown of unaligned loads.
 281          *
 282          * If the source and destination are relatively unaligned we only
 283          * align the source. This keeps things simple.
 284          */
 285         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
 286         beq     .Lcopy_aligned
 287
 288         li      r9,4
 289         sub     r6,r9,r6                /* 4 - r6 halfwords reach 8-byte alignment */
 290         mtctr   r6
 291
 292 1:
 293 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
 294         subi    r5,r5,2
 295         addi    r3,r3,2
 296         adde    r0,r0,r6
 297 dstnr;  sth     r6,0(r4)
 298         addi    r4,r4,2
 299         bdnz    1b
 300
 301 .Lcopy_aligned:
 302         /*
 303          * We unroll the loop such that each iteration is 64 bytes with an
 304          * entry and exit limb of 64 bytes, meaning a minimum size of
 305          * 128 bytes.
 306          */
 307         srdi.   r6,r5,7
 308         beq     .Lcopy_tail_doublewords         /* len < 128 */
 309
 310         srdi    r6,r5,6                 /* number of whole 64-byte blocks */
 311         subi    r6,r6,1                 /* the exit limb handles the last block */
 312         mtctr   r6
 313
 314         stdu    r1,-STACKFRAMESIZE(r1)  /* temporary frame to hold r14-r16 */
 315         std     r14,STK_REG(R14)(r1)
 316         std     r15,STK_REG(R15)(r1)
 317         std     r16,STK_REG(R16)(r1)
 318
 319 source; ld      r6,0(r3)                /* entry limb: preload the first 32 bytes */
 320 source; ld      r9,8(r3)
 321
 322 source; ld      r10,16(r3)
 323 source; ld      r11,24(r3)
 324
 325         /*
 326          * On POWER6 and POWER7 back to back addes take 2 cycles because of
 327          * the XER dependency. This means the fastest this loop can go is
 328          * 16 cycles per iteration. The scheduling of the loop below has
 329          * been shown to hit this on both POWER6 and POWER7.
 330          */
 331         .align 5
 332 2:
 333         adde    r0,r0,r6
 334 source; ld      r12,32(r3)              /* second 32 bytes of the current block */
 335 source; ld      r14,40(r3)
 336
 337         adde    r0,r0,r9
 338 source; ld      r15,48(r3)
 339 source; ld      r16,56(r3)
 340         addi    r3,r3,64                /* r3 now points at the next source block */
 341
 342         adde    r0,r0,r10
 343 dest;   std     r6,0(r4)                /* write the current block out to dst */
 344 dest;   std     r9,8(r4)
 345
 346         adde    r0,r0,r11
 347 dest;   std     r10,16(r4)
 348 dest;   std     r11,24(r4)
 349
 350         adde    r0,r0,r12
 351 dest;   std     r12,32(r4)
 352 dest;   std     r14,40(r4)
 353
 354         adde    r0,r0,r14
 355 dest;   std     r15,48(r4)
 356 dest;   std     r16,56(r4)
 357         addi    r4,r4,64
 358
 359         adde    r0,r0,r15
 360 source; ld      r6,0(r3)                /* preload the first 32 bytes of the next block */
 361 source; ld      r9,8(r3)
 362
 363         adde    r0,r0,r16
 364 source; ld      r10,16(r3)
 365 source; ld      r11,24(r3)
 366         bdnz    2b
 367
 368
 369         adde    r0,r0,r6                /* exit limb: last block, nothing further is preloaded */
 370 source; ld      r12,32(r3)
 371 source; ld      r14,40(r3)
 372
 373         adde    r0,r0,r9
 374 source; ld      r15,48(r3)
 375 source; ld      r16,56(r3)
 376         addi    r3,r3,64
 377
 378         adde    r0,r0,r10
 379 dest;   std     r6,0(r4)
 380 dest;   std     r9,8(r4)
 381
 382         adde    r0,r0,r11
 383 dest;   std     r10,16(r4)
 384 dest;   std     r11,24(r4)
 385
 386         adde    r0,r0,r12
 387 dest;   std     r12,32(r4)
 388 dest;   std     r14,40(r4)
 389
 390         adde    r0,r0,r14
 391 dest;   std     r15,48(r4)
 392 dest;   std     r16,56(r4)
 393         addi    r4,r4,64
 394
 395         adde    r0,r0,r15
 396         adde    r0,r0,r16
 397
 398         ld      r14,STK_REG(R14)(r1)    /* restore saved registers and drop the frame */
 399         ld      r15,STK_REG(R15)(r1)
 400         ld      r16,STK_REG(R16)(r1)
 401         addi    r1,r1,STACKFRAMESIZE
 402
 403         andi.   r5,r5,63                /* bytes left after the 64-byte blocks */
 404
 405 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
 406         srdi.   r6,r5,3
 407         beq     .Lcopy_tail_word
 408
 409         mtctr   r6
 410 3:
 411 srcnr;  ld      r6,0(r3)
 412         addi    r3,r3,8
 413         adde    r0,r0,r6
 414 dstnr;  std     r6,0(r4)
 415         addi    r4,r4,8
 416         bdnz    3b
 417
 418         andi.   r5,r5,7                 /* bytes left after the doublewords */
 419
 420 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
 421         srdi.   r6,r5,2
 422         beq     .Lcopy_tail_halfword
 423
 424 srcnr;  lwz     r6,0(r3)
 425         addi    r3,r3,4
 426         adde    r0,r0,r6
 427 dstnr;  stw     r6,0(r4)
 428         addi    r4,r4,4
 429         subi    r5,r5,4
 430
 431 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
 432         srdi.   r6,r5,1
 433         beq     .Lcopy_tail_byte
 434
 435 srcnr;  lhz     r6,0(r3)
 436         addi    r3,r3,2
 437         adde    r0,r0,r6
 438 dstnr;  sth     r6,0(r4)
 439         addi    r4,r4,2
 440         subi    r5,r5,2
 441
 442 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
 443         andi.   r6,r5,1
 444         beq     .Lcopy_finish
 445
 446 srcnr;  lbz     r6,0(r3)
             /*
              * NOTE(review): the shift puts the byte where the first byte of a
              * big-endian halfword load would land.  Confirm before building
              * this code for a little-endian target.
              */
 447         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 448         adde    r0,r0,r9
 449 dstnr;  stb     r6,0(r4)                /* the unshifted byte is what gets copied */
 450
 451 .Lcopy_finish:
 452         addze   r0,r0                   /* add in final carry */
 453         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 454         add     r3,r4,r0                /* upper word = hi + lo + carry out of the lower word */
 455         srdi    r3,r3,32                /* return the 32-bit folded sum */
 456         blr
 457
 458 .Lsrc_error:
             /*
              * Fixup target recorded by the source macro, i.e. for loads issued
              * while the temporary frame holding r14-r16 exists.  Restores those
              * registers, drops the frame, then falls through to the frameless
              * variant.  Presumably reached via the kernel fault handler looking
              * up __ex_table -- that mechanism lives outside this file.
              */
 459         ld      r14,STK_REG(R14)(r1)
 460         ld      r15,STK_REG(R15)(r1)
 461         ld      r16,STK_REG(R16)(r1)
 462         addi    r1,r1,STACKFRAMESIZE
 463 .Lsrc_error_nr:
             /*
              * Fixup target recorded by the srcnr macro.  r7 is the src_err
              * pointer.  r3 is not written here, so the value returned is not a
              * valid checksum.
              */
 464         cmpdi   0,r7,0
 465         beqlr                           /* src_err is NULL: just return */
 466         li      r6,-EFAULT
 467         stw     r6,0(r7)                /* store -EFAULT to *src_err as a 32-bit word */
 468         blr
 469
 470 .Ldest_error:
             /*
              * Fixup target recorded by the dest macro, i.e. for stores issued
              * while the temporary frame holding r14-r16 exists.  Restores those
              * registers, drops the frame, then falls through to the frameless
              * variant.  Presumably reached via the kernel fault handler looking
              * up __ex_table -- that mechanism lives outside this file.
              */
 471         ld      r14,STK_REG(R14)(r1)
 472         ld      r15,STK_REG(R15)(r1)
 473         ld      r16,STK_REG(R16)(r1)
 474         addi    r1,r1,STACKFRAMESIZE
 475 .Ldest_error_nr:
             /*
              * Fixup target recorded by the dstnr macro.  r8 is the dst_err
              * pointer.  r3 is not written here, so the value returned is not a
              * valid checksum.
              */
 476         cmpdi   0,r8,0
 477         beqlr                           /* dst_err is NULL: just return */
 478         li      r6,-EFAULT
 479         stw     r6,0(r8)                /* store -EFAULT to *dst_err as a 32-bit word */
 480         blr