arch/powerpc/lib/checksum_64.S

   1 /*
   2  * This file contains assembly-language implementations
   3  * of IP-style 1's complement checksum routines.
   4  *
   5  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6  *
   7  *  This program is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version
  10  *  2 of the License, or (at your option) any later version.
  11  *
  12  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
  13  */
  14
  15 #include <linux/sys.h>
  16 #include <asm/processor.h>
  17 #include <asm/errno.h>
  18 #include <asm/ppc_asm.h>
  19 #include <asm/export.h>
  20
  21 /*
  22  * Computes the checksum of a memory block at buff, length len,
  23  * and adds in "sum" (32-bit).
  24  *
  25  * __csum_partial(r3=buff, r4=len, r5=sum)
  26  */
  27 _GLOBAL(__csum_partial)
  28         addic   r0,r5,0                 /* clear carry */
  29
  30         srdi.   r6,r4,3                 /* less than 8 bytes? */
  31         beq     .Lcsum_tail_word
  32
  33         /*
  34          * If only halfword aligned, align to a double word. Since odd
  35          * aligned addresses should be rare and they would require more
  36          * work to calculate the correct checksum, we ignore that case
  37          * and take the potential slowdown of unaligned loads.
  38          */
  39         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 & 0x3) >> 1 */
  40         beq     .Lcsum_aligned
  41
  42         li      r7,4
  43         sub     r6,r7,r6
  44         mtctr   r6
  45
  46 1:
  47         lhz     r6,0(r3)                /* align to doubleword */
  48         subi    r4,r4,2
  49         addi    r3,r3,2
  50         adde    r0,r0,r6
  51         bdnz    1b
  52
  53 .Lcsum_aligned:
  54         /*
  55          * We unroll the loop such that each iteration is 64 bytes with an
  56          * entry and exit limb of 64 bytes, meaning a minimum size of
  57          * 128 bytes.
  58          */
  59         srdi.   r6,r4,7
  60         beq     .Lcsum_tail_doublewords         /* len < 128 */
  61
  62         srdi    r6,r4,6
  63         subi    r6,r6,1
  64         mtctr   r6
  65
  66         stdu    r1,-STACKFRAMESIZE(r1)
  67         std     r14,STK_REG(R14)(r1)
  68         std     r15,STK_REG(R15)(r1)
  69         std     r16,STK_REG(R16)(r1)
  70
  71         ld      r6,0(r3)
  72         ld      r9,8(r3)
  73
  74         ld      r10,16(r3)
  75         ld      r11,24(r3)
  76
  77         /*
  78          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
  79          * because of the XER dependency. This means the fastest this loop can
  80          * go is 16 cycles per iteration. The scheduling of the loop below has
  81          * been shown to hit this on both POWER6 and POWER7.
  82          */
  83         .align 5
  84 2:
  85         adde    r0,r0,r6
  86         ld      r12,32(r3)
  87         ld      r14,40(r3)
  88
  89         adde    r0,r0,r9
  90         ld      r15,48(r3)
  91         ld      r16,56(r3)
  92         addi    r3,r3,64
  93
  94         adde    r0,r0,r10
  95
  96         adde    r0,r0,r11
  97
  98         adde    r0,r0,r12
  99
 100         adde    r0,r0,r14
 101
 102         adde    r0,r0,r15
 103         ld      r6,0(r3)
 104         ld      r9,8(r3)
 105
 106         adde    r0,r0,r16
 107         ld      r10,16(r3)
 108         ld      r11,24(r3)
 109         bdnz    2b
 110
 111
 112         adde    r0,r0,r6
 113         ld      r12,32(r3)
 114         ld      r14,40(r3)
 115
 116         adde    r0,r0,r9
 117         ld      r15,48(r3)
 118         ld      r16,56(r3)
 119         addi    r3,r3,64
 120
 121         adde    r0,r0,r10
 122         adde    r0,r0,r11
 123         adde    r0,r0,r12
 124         adde    r0,r0,r14
 125         adde    r0,r0,r15
 126         adde    r0,r0,r16
 127
 128         ld      r14,STK_REG(R14)(r1)
 129         ld      r15,STK_REG(R15)(r1)
 130         ld      r16,STK_REG(R16)(r1)
 131         addi    r1,r1,STACKFRAMESIZE
 132
 133         andi.   r4,r4,63
 134
 135 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
 136         srdi.   r6,r4,3
 137         beq     .Lcsum_tail_word
 138
 139         mtctr   r6
 140 3:
 141         ld      r6,0(r3)
 142         addi    r3,r3,8
 143         adde    r0,r0,r6
 144         bdnz    3b
 145
 146         andi.   r4,r4,7
 147
 148 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
 149         srdi.   r6,r4,2
 150         beq     .Lcsum_tail_halfword
 151
 152         lwz     r6,0(r3)
 153         addi    r3,r3,4
 154         adde    r0,r0,r6
 155         subi    r4,r4,4
 156
 157 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
 158         srdi.   r6,r4,1
 159         beq     .Lcsum_tail_byte
 160
 161         lhz     r6,0(r3)
 162         addi    r3,r3,2
 163         adde    r0,r0,r6
 164         subi    r4,r4,2
 165
 166 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
 167         andi.   r6,r4,1
 168         beq     .Lcsum_finish
 169
 170         lbz     r6,0(r3)
 171         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 172         adde    r0,r0,r9
 173
 174 .Lcsum_finish:
 175         addze   r0,r0                   /* add in final carry */
 176         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 177         add     r3,r4,r0
 178         srdi    r3,r3,32
 179         blr
 180 EXPORT_SYMBOL(__csum_partial)
 181
 182
 183         .macro srcnr
 184 100:
 185         .section __ex_table,"a"
 186         .align 3
 187         .llong 100b,.Lsrc_error_nr
 188         .previous
 189         .endm
 190
 191         .macro source
 192 150:
 193         .section __ex_table,"a"
 194         .align 3
 195         .llong 150b,.Lsrc_error
 196         .previous
 197         .endm
 198
 199         .macro dstnr
 200 200:
 201         .section __ex_table,"a"
 202         .align 3
 203         .llong 200b,.Ldest_error_nr
 204         .previous
 205         .endm
 206
 207         .macro dest
 208 250:
 209         .section __ex_table,"a"
 210         .align 3
 211         .llong 250b,.Ldest_error
 212         .previous
 213         .endm
 214
 215 /*
 216  * Computes the checksum of a memory block at src, length len,
 217  * and adds in "sum" (32-bit), while copying the block to dst.
 218  * If an access exception occurs on src or dst, it stores -EFAULT
 219  * to *src_err or *dst_err respectively. The caller must take any action
 220  * required in this case (zeroing memory, recalculating partial checksum etc).
 221  *
 222  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 223  */
 224 _GLOBAL(csum_partial_copy_generic)
 225         addic   r0,r6,0                 /* clear carry */
 226
 227         srdi.   r6,r5,3                 /* less than 8 bytes? */
 228         beq     .Lcopy_tail_word
 229
 230         /*
 231          * If only halfword aligned, align to a double word. Since odd
 232          * aligned addresses should be rare and they would require more
 233          * work to calculate the correct checksum, we ignore that case
 234          * and take the potential slowdown of unaligned loads.
 235          *
 236          * If the source and destination are relatively unaligned we only
 237          * align the source. This keeps things simple.
 238          */
 239         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 & 0x3) >> 1 */
 240         beq     .Lcopy_aligned
 241
 242         li      r9,4
 243         sub     r6,r9,r6
 244         mtctr   r6
 245
 246 1:
 247 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
 248         subi    r5,r5,2
 249         addi    r3,r3,2
 250         adde    r0,r0,r6
 251 dstnr;  sth     r6,0(r4)
 252         addi    r4,r4,2
 253         bdnz    1b
 254
 255 .Lcopy_aligned:
 256         /*
 257          * We unroll the loop such that each iteration is 64 bytes with an
 258          * entry and exit limb of 64 bytes, meaning a minimum size of
 259          * 128 bytes.
 260          */
 261         srdi.   r6,r5,7
 262         beq     .Lcopy_tail_doublewords         /* len < 128 */
 263
 264         srdi    r6,r5,6
 265         subi    r6,r6,1
 266         mtctr   r6
 267
 268         stdu    r1,-STACKFRAMESIZE(r1)
 269         std     r14,STK_REG(R14)(r1)
 270         std     r15,STK_REG(R15)(r1)
 271         std     r16,STK_REG(R16)(r1)
 272
 273 source; ld      r6,0(r3)
 274 source; ld      r9,8(r3)
 275
 276 source; ld      r10,16(r3)
 277 source; ld      r11,24(r3)
 278
 279         /*
 280          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 281          * because of the XER dependency. This means the fastest this loop can
 282          * go is 16 cycles per iteration. The scheduling of the loop below has
 283          * been shown to hit this on both POWER6 and POWER7.
 284          */
 285         .align 5
 286 2:
 287         adde    r0,r0,r6
 288 source; ld      r12,32(r3)
 289 source; ld      r14,40(r3)
 290
 291         adde    r0,r0,r9
 292 source; ld      r15,48(r3)
 293 source; ld      r16,56(r3)
 294         addi    r3,r3,64
 295
 296         adde    r0,r0,r10
 297 dest;   std     r6,0(r4)
 298 dest;   std     r9,8(r4)
 299
 300         adde    r0,r0,r11
 301 dest;   std     r10,16(r4)
 302 dest;   std     r11,24(r4)
 303
 304         adde    r0,r0,r12
 305 dest;   std     r12,32(r4)
 306 dest;   std     r14,40(r4)
 307
 308         adde    r0,r0,r14
 309 dest;   std     r15,48(r4)
 310 dest;   std     r16,56(r4)
 311         addi    r4,r4,64
 312
 313         adde    r0,r0,r15
 314 source; ld      r6,0(r3)
 315 source; ld      r9,8(r3)
 316
 317         adde    r0,r0,r16
 318 source; ld      r10,16(r3)
 319 source; ld      r11,24(r3)
 320         bdnz    2b
 321
 322
 323         adde    r0,r0,r6
 324 source; ld      r12,32(r3)
 325 source; ld      r14,40(r3)
 326
 327         adde    r0,r0,r9
 328 source; ld      r15,48(r3)
 329 source; ld      r16,56(r3)
 330         addi    r3,r3,64
 331
 332         adde    r0,r0,r10
 333 dest;   std     r6,0(r4)
 334 dest;   std     r9,8(r4)
 335
 336         adde    r0,r0,r11
 337 dest;   std     r10,16(r4)
 338 dest;   std     r11,24(r4)
 339
 340         adde    r0,r0,r12
 341 dest;   std     r12,32(r4)
 342 dest;   std     r14,40(r4)
 343
 344         adde    r0,r0,r14
 345 dest;   std     r15,48(r4)
 346 dest;   std     r16,56(r4)
 347         addi    r4,r4,64
 348
 349         adde    r0,r0,r15
 350         adde    r0,r0,r16
 351
 352         ld      r14,STK_REG(R14)(r1)
 353         ld      r15,STK_REG(R15)(r1)
 354         ld      r16,STK_REG(R16)(r1)
 355         addi    r1,r1,STACKFRAMESIZE
 356
 357         andi.   r5,r5,63
 358
 359 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
 360         srdi.   r6,r5,3
 361         beq     .Lcopy_tail_word
 362
 363         mtctr   r6
 364 3:
 365 srcnr;  ld      r6,0(r3)
 366         addi    r3,r3,8
 367         adde    r0,r0,r6
 368 dstnr;  std     r6,0(r4)
 369         addi    r4,r4,8
 370         bdnz    3b
 371
 372         andi.   r5,r5,7
 373
 374 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
 375         srdi.   r6,r5,2
 376         beq     .Lcopy_tail_halfword
 377
 378 srcnr;  lwz     r6,0(r3)
 379         addi    r3,r3,4
 380         adde    r0,r0,r6
 381 dstnr;  stw     r6,0(r4)
 382         addi    r4,r4,4
 383         subi    r5,r5,4
 384
 385 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
 386         srdi.   r6,r5,1
 387         beq     .Lcopy_tail_byte
 388
 389 srcnr;  lhz     r6,0(r3)
 390         addi    r3,r3,2
 391         adde    r0,r0,r6
 392 dstnr;  sth     r6,0(r4)
 393         addi    r4,r4,2
 394         subi    r5,r5,2
 395
 396 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
 397         andi.   r6,r5,1
 398         beq     .Lcopy_finish
 399
 400 srcnr;  lbz     r6,0(r3)
 401         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 402         adde    r0,r0,r9
 403 dstnr;  stb     r6,0(r4)
 404
 405 .Lcopy_finish:
 406         addze   r0,r0                   /* add in final carry */
 407         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 408         add     r3,r4,r0
 409         srdi    r3,r3,32
 410         blr
 411
 412 .Lsrc_error:
 413         ld      r14,STK_REG(R14)(r1)
 414         ld      r15,STK_REG(R15)(r1)
 415         ld      r16,STK_REG(R16)(r1)
 416         addi    r1,r1,STACKFRAMESIZE
 417 .Lsrc_error_nr:
 418         cmpdi   0,r7,0
 419         beqlr
 420         li      r6,-EFAULT
 421         stw     r6,0(r7)
 422         blr
 423
 424 .Ldest_error:
 425         ld      r14,STK_REG(R14)(r1)
 426         ld      r15,STK_REG(R15)(r1)
 427         ld      r16,STK_REG(R16)(r1)
 428         addi    r1,r1,STACKFRAMESIZE
 429 .Ldest_error_nr:
 430         cmpdi   0,r8,0
 431         beqlr
 432         li      r6,-EFAULT
 433         stw     r6,0(r8)
 434         blr
 435 EXPORT_SYMBOL(csum_partial_copy_generic)