arch/powerpc/lib/checksum_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * This file contains assembly-language implementations
   4  * of IP-style 1's complement checksum routines.
   5  *
   6  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   7  *
   8  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
   9  */
  10
  11 #include <linux/sys.h>
  12 #include <asm/processor.h>
  13 #include <asm/errno.h>
  14 #include <asm/ppc_asm.h>
  15 #include <asm/export.h>
  16
   17 /*
   18  * Computes the checksum of a memory block at buff, length len,
   19  * and adds in "sum" (32-bit).
   20  *
   21  * __csum_partial(r3=buff, r4=len, r5=sum)
       *
       * Returns in r3 the 64-bit running sum folded down to 32 bits. The
       * value is neither complemented nor folded to 16 bits here.
       *
       * Register use:
       *   r0            64-bit accumulator, chained through the carry bit
       *                 (XER[CA]) by adde
       *   r3            cursor into buff, then the return value
       *   r4            bytes still to be summed (clobbered)
       *   r6,r7,r9-r12  scratch and loaded data
       *   r14-r16       extra data registers; saved in a stack frame and
       *                 restored, but only on the len >= 128 path
       *   ctr           loop counter
       *
       * Everything executed between the adde instructions (addi/subi,
       * loads and stores, mtctr, shifts, andi., branches) leaves CA
       * untouched, so the carry chain stays intact until the final addze.
   22  */
   23 _GLOBAL(__csum_partial)
   24         addic   r0,r5,0                 /* r0 = sum; adding 0 clears carry */
   25
   26         srdi.   r6,r4,3                 /* less than 8 bytes? */
   27         beq     .Lcsum_tail_word        /* yes: only the word/halfword/byte tail */
   28
   29         /*
   30          * If only halfword aligned, align to a double word. Since odd
   31          * aligned addresses should be rare and they would require more
   32          * work to calculate the correct checksum, we ignore that case
   33          * and take the potential slowdown of unaligned loads.
   34          */
   35         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
   36         beq     .Lcsum_aligned          /* address bits 1-2 clear: no fix-up */
   37
   38         li      r7,4
   39         sub     r6,r7,r6                /* r6 = 4 - r6 halfwords to consume */
   40         mtctr   r6
   41
   42 1:
   43         lhz     r6,0(r3)                /* align to doubleword */
   44         subi    r4,r4,2                 /* two bytes fewer to go */
   45         addi    r3,r3,2
   46         adde    r0,r0,r6
   47         bdnz    1b
   48
   49 .Lcsum_aligned:
   50         /*
   51          * We unroll the loop such that each iteration is 64 bytes with an
   52          * entry and exit limb of 64 bytes, meaning a minimum size of
   53          * 128 bytes.
   54          */
   55         srdi.   r6,r4,7
   56         beq     .Lcsum_tail_doublewords         /* len < 128 */
   57
   58         srdi    r6,r4,6                 /* r6 = number of 64-byte blocks */
   59         subi    r6,r6,1                 /* the last block is done by the exit limb */
   60         mtctr   r6
   61
   62         stdu    r1,-STACKFRAMESIZE(r1)  /* new frame to hold r14-r16 */
   63         std     r14,STK_REG(R14)(r1)
   64         std     r15,STK_REG(R15)(r1)
   65         std     r16,STK_REG(R16)(r1)
   66
   67         ld      r6,0(r3)                /* entry limb: preload the first */
   68         ld      r9,8(r3)                /* four doublewords of block 0 */
   69
   70         ld      r10,16(r3)
   71         ld      r11,24(r3)
   72
   73         /*
   74          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
   75          * because of the XER dependency. This means the fastest this loop can
   76          * go is 16 cycles per iteration. The scheduling of the loop below has
   77          * been shown to hit this on both POWER6 and POWER7.
   78          */
   79         .align 5
   80 2:
   81         adde    r0,r0,r6
   82         ld      r12,32(r3)              /* load the second half of this block */
   83         ld      r14,40(r3)
   84
   85         adde    r0,r0,r9
   86         ld      r15,48(r3)
   87         ld      r16,56(r3)
   88         addi    r3,r3,64                /* r3 -> next 64-byte block */
   89
   90         adde    r0,r0,r10
   91
   92         adde    r0,r0,r11
   93
   94         adde    r0,r0,r12
   95
   96         adde    r0,r0,r14
   97
   98         adde    r0,r0,r15
   99         ld      r6,0(r3)                /* preload the first half of the next block */
  100         ld      r9,8(r3)
  101
  102         adde    r0,r0,r16
  103         ld      r10,16(r3)
  104         ld      r11,24(r3)
  105         bdnz    2b
  106
  107
              /* Exit limb: sum the final 64-byte block; nothing more is preloaded. */
  108         adde    r0,r0,r6
  109         ld      r12,32(r3)
  110         ld      r14,40(r3)
  111
  112         adde    r0,r0,r9
  113         ld      r15,48(r3)
  114         ld      r16,56(r3)
  115         addi    r3,r3,64
  116
  117         adde    r0,r0,r10
  118         adde    r0,r0,r11
  119         adde    r0,r0,r12
  120         adde    r0,r0,r14
  121         adde    r0,r0,r15
  122         adde    r0,r0,r16
  123
  124         ld      r14,STK_REG(R14)(r1)    /* restore the saved registers */
  125         ld      r15,STK_REG(R15)(r1)
  126         ld      r16,STK_REG(R16)(r1)
  127         addi    r1,r1,STACKFRAMESIZE    /* and drop the frame */
  128
  129         andi.   r4,r4,63                /* bytes left after the 64-byte blocks */
  130
  131 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
  132         srdi.   r6,r4,3                 /* r6 = whole doublewords left */
  133         beq     .Lcsum_tail_word
  134
  135         mtctr   r6
  136 3:
  137         ld      r6,0(r3)
  138         addi    r3,r3,8
  139         adde    r0,r0,r6
  140         bdnz    3b
  141
  142         andi.   r4,r4,7                 /* bytes left after the doublewords */
  143
  144 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
  145         srdi.   r6,r4,2                 /* at least 4 bytes left? */
  146         beq     .Lcsum_tail_halfword
  147
  148         lwz     r6,0(r3)
  149         addi    r3,r3,4
  150         adde    r0,r0,r6
  151         subi    r4,r4,4
  152
  153 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
  154         srdi.   r6,r4,1                 /* at least 2 bytes left? */
  155         beq     .Lcsum_tail_byte
  156
  157         lhz     r6,0(r3)
  158         addi    r3,r3,2
  159         adde    r0,r0,r6
  160         subi    r4,r4,2
  161
  162 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
  163         andi.   r6,r4,1                 /* odd byte left over? */
  164         beq     .Lcsum_finish
  165
  166         lbz     r6,0(r3)
  167 #ifdef __BIG_ENDIAN__
  168         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
  169         adde    r0,r0,r9
  170 #else
  171         adde    r0,r0,r6                /* little endian: byte is already the low half */
  172 #endif
  173
  174 .Lcsum_finish:
  175         addze   r0,r0                   /* add in final carry */
  176         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
  177         add     r3,r4,r0                /* upper word = hi + lo + carry out of the lower word */
  178         srdi    r3,r3,32                /* return that upper word */
  179         blr
  180 EXPORT_SYMBOL(__csum_partial)
  181
  182
      /*
       * Fault-annotation helpers for csum_partial_copy_generic.
       *
       * Each macro drops a numeric local label at the current location and
       * passes that label plus a handler label to EX_TABLE. They are written
       * in front of a load or store on the same line, so the label is the
       * address of that instruction. EX_TABLE is defined outside this file;
       * presumably it records an exception-table entry so that a fault on the
       * labelled instruction resumes at the handler (confirm in
       * asm/ppc_asm.h).
       *
       *   source / dest  -> .Lerror     restores r14-r16 and pops the stack
       *                                 frame before returning 0; used only
       *                                 while that frame is live
       *   srcnr / dstnr  -> .Lerror_nr  returns 0 directly ("nr" presumably
       *                                 means "no restore"); used everywhere
       *                                 else
       */
  183         .macro srcnr
  184 100:
  185         EX_TABLE(100b,.Lerror_nr)
  186         .endm
  187
  188         .macro source
  189 150:
  190         EX_TABLE(150b,.Lerror)
  191         .endm
  192
  193         .macro dstnr
  194 200:
  195         EX_TABLE(200b,.Lerror_nr)
  196         .endm
  197
  198         .macro dest
  199 250:
  200         EX_TABLE(250b,.Lerror)
  201         .endm
 202
  203 /*
  204  * Computes the checksum of a memory block at src, length len,
  205  * and adds in 0xffffffff (32-bit), while copying the block to dst.
  206  * If an access exception occurs, it returns 0.
  207  *
  208  * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
       *
       * Returns in r3 the 64-bit running sum folded down to 32 bits, or 0
       * when one of the annotated loads/stores faults.
       *
       * Register use:
       *   r0         64-bit accumulator, seeded with -1 (all ones) and
       *              chained through the carry bit (XER[CA]) by adde
       *   r3         source cursor, then the return value
       *   r4         destination cursor
       *   r5         bytes still to be processed (clobbered)
       *   r6,r9-r12  scratch and data in flight
       *   r14-r16    extra data registers; saved in a stack frame and
       *              restored, but only on the len >= 128 path
       *   ctr        loop counter
       *
       * Accesses made while the r14-r16 frame is live are tagged
       * source/dest (handler .Lerror unwinds the frame); all others are
       * tagged srcnr/dstnr (handler .Lerror_nr).
       *
       * NOTE(review): the non-zero seed presumably keeps a successful result
       * distinguishable from the 0 fault return - confirm against callers.
  209  */
  210 _GLOBAL(csum_partial_copy_generic)
  211         li      r6,-1
  212         addic   r0,r6,0                 /* r0 = -1; adding 0 clears carry */
  213
  214         srdi.   r6,r5,3                 /* less than 8 bytes? */
  215         beq     .Lcopy_tail_word        /* yes: only the word/halfword/byte tail */
  216
  217         /*
  218          * If only halfword aligned, align to a double word. Since odd
  219          * aligned addresses should be rare and they would require more
  220          * work to calculate the correct checksum, we ignore that case
  221          * and take the potential slowdown of unaligned loads.
  222          *
  223          * If the source and destination are relatively unaligned we only
  224          * align the source. This keeps things simple.
  225          */
  226         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  227         beq     .Lcopy_aligned          /* source bits 1-2 clear: no fix-up */
  228
  229         li      r9,4
  230         sub     r6,r9,r6                /* r6 = 4 - r6 halfwords to consume */
  231         mtctr   r6
  232
  233 1:
  234 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
  235         subi    r5,r5,2                 /* two bytes fewer to go */
  236         addi    r3,r3,2
  237         adde    r0,r0,r6
  238 dstnr;  sth     r6,0(r4)                /* copy the halfword just summed */
  239         addi    r4,r4,2
  240         bdnz    1b
  241
  242 .Lcopy_aligned:
  243         /*
  244          * We unroll the loop such that each iteration is 64 bytes with an
  245          * entry and exit limb of 64 bytes, meaning a minimum size of
  246          * 128 bytes.
  247          */
  248         srdi.   r6,r5,7
  249         beq     .Lcopy_tail_doublewords         /* len < 128 */
  250
  251         srdi    r6,r5,6                 /* r6 = number of 64-byte blocks */
  252         subi    r6,r6,1                 /* the last block is done by the exit limb */
  253         mtctr   r6
  254
  255         stdu    r1,-STACKFRAMESIZE(r1)  /* new frame to hold r14-r16 */
  256         std     r14,STK_REG(R14)(r1)
  257         std     r15,STK_REG(R15)(r1)
  258         std     r16,STK_REG(R16)(r1)
  259
  260 source; ld      r6,0(r3)                /* entry limb: preload the first */
  261 source; ld      r9,8(r3)                /* four doublewords of block 0 */
  262
  263 source; ld      r10,16(r3)
  264 source; ld      r11,24(r3)
  265
  266         /*
  267          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
  268          * because of the XER dependency. This means the fastest this loop can
  269          * go is 16 cycles per iteration. The scheduling of the loop below has
  270          * been shown to hit this on both POWER6 and POWER7.
  271          */
  272         .align 5
  273 2:
  274         adde    r0,r0,r6
  275 source; ld      r12,32(r3)              /* load the second half of this block */
  276 source; ld      r14,40(r3)
  277
  278         adde    r0,r0,r9
  279 source; ld      r15,48(r3)
  280 source; ld      r16,56(r3)
  281         addi    r3,r3,64                /* r3 -> next source block */
  282
  283         adde    r0,r0,r10
  284 dest;   std     r6,0(r4)                /* write out the block held in registers */
  285 dest;   std     r9,8(r4)
  286
  287         adde    r0,r0,r11
  288 dest;   std     r10,16(r4)
  289 dest;   std     r11,24(r4)
  290
  291         adde    r0,r0,r12
  292 dest;   std     r12,32(r4)
  293 dest;   std     r14,40(r4)
  294
  295         adde    r0,r0,r14
  296 dest;   std     r15,48(r4)
  297 dest;   std     r16,56(r4)
  298         addi    r4,r4,64                /* r4 -> next destination block */
  299
  300         adde    r0,r0,r15
  301 source; ld      r6,0(r3)                /* preload the first half of the next block */
  302 source; ld      r9,8(r3)
  303
  304         adde    r0,r0,r16
  305 source; ld      r10,16(r3)
  306 source; ld      r11,24(r3)
  307         bdnz    2b
  308
  309
              /* Exit limb: sum and store the final 64-byte block; nothing more is preloaded. */
  310         adde    r0,r0,r6
  311 source; ld      r12,32(r3)
  312 source; ld      r14,40(r3)
  313
  314         adde    r0,r0,r9
  315 source; ld      r15,48(r3)
  316 source; ld      r16,56(r3)
  317         addi    r3,r3,64
  318
  319         adde    r0,r0,r10
  320 dest;   std     r6,0(r4)
  321 dest;   std     r9,8(r4)
  322
  323         adde    r0,r0,r11
  324 dest;   std     r10,16(r4)
  325 dest;   std     r11,24(r4)
  326
  327         adde    r0,r0,r12
  328 dest;   std     r12,32(r4)
  329 dest;   std     r14,40(r4)
  330
  331         adde    r0,r0,r14
  332 dest;   std     r15,48(r4)
  333 dest;   std     r16,56(r4)
  334         addi    r4,r4,64
  335
  336         adde    r0,r0,r15
  337         adde    r0,r0,r16
  338
  339         ld      r14,STK_REG(R14)(r1)    /* restore the saved registers */
  340         ld      r15,STK_REG(R15)(r1)
  341         ld      r16,STK_REG(R16)(r1)
  342         addi    r1,r1,STACKFRAMESIZE    /* frame gone: back to the *nr annotations */
  343
  344         andi.   r5,r5,63                /* bytes left after the 64-byte blocks */
  345
  346 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
  347         srdi.   r6,r5,3                 /* r6 = whole doublewords left */
  348         beq     .Lcopy_tail_word
  349
  350         mtctr   r6
  351 3:
  352 srcnr;  ld      r6,0(r3)
  353         addi    r3,r3,8
  354         adde    r0,r0,r6
  355 dstnr;  std     r6,0(r4)
  356         addi    r4,r4,8
  357         bdnz    3b
  358
  359         andi.   r5,r5,7                 /* bytes left after the doublewords */
  360
  361 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
  362         srdi.   r6,r5,2                 /* at least 4 bytes left? */
  363         beq     .Lcopy_tail_halfword
  364
  365 srcnr;  lwz     r6,0(r3)
  366         addi    r3,r3,4
  367         adde    r0,r0,r6
  368 dstnr;  stw     r6,0(r4)
  369         addi    r4,r4,4
  370         subi    r5,r5,4
  371
  372 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
  373         srdi.   r6,r5,1                 /* at least 2 bytes left? */
  374         beq     .Lcopy_tail_byte
  375
  376 srcnr;  lhz     r6,0(r3)
  377         addi    r3,r3,2
  378         adde    r0,r0,r6
  379 dstnr;  sth     r6,0(r4)
  380         addi    r4,r4,2
  381         subi    r5,r5,2
  382
  383 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
  384         andi.   r6,r5,1                 /* odd byte left over? */
  385         beq     .Lcopy_finish
  386
  387 srcnr;  lbz     r6,0(r3)
  388 #ifdef __BIG_ENDIAN__
  389         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
  390         adde    r0,r0,r9
  391 #else
  392         adde    r0,r0,r6                /* little endian: byte is already the low half */
  393 #endif
  394 dstnr;  stb     r6,0(r4)                /* store the unshifted byte */
  395
  396 .Lcopy_finish:
  397         addze   r0,r0                   /* add in final carry */
  398         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
  399         add     r3,r4,r0                /* upper word = hi + lo + carry out of the lower word */
  400         srdi    r3,r3,32                /* return that upper word */
  401         blr
  402
              /*
               * Fault exits. .Lerror is the target for accesses tagged
               * source/dest: it undoes the r14-r16 frame and then falls
               * through to .Lerror_nr, which returns 0.
               */
  403 .Lerror:
  404         ld      r14,STK_REG(R14)(r1)
  405         ld      r15,STK_REG(R15)(r1)
  406         ld      r16,STK_REG(R16)(r1)
  407         addi    r1,r1,STACKFRAMESIZE
  408 .Lerror_nr:
  409         li      r3,0                    /* 0 = an access faulted */
  410         blr
  411
  412 EXPORT_SYMBOL(csum_partial_copy_generic)
 413
  414 /*
  415  * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
  416  *                         const struct in6_addr *daddr,
  417  *                         __u32 len, __u8 proto, __wsum sum)
       *
       * Arguments arrive in prototype order: r3 = saddr, r4 = daddr,
       * r5 = len, r6 = proto, r7 = sum.
       *
       * Sums the 16 bytes at saddr, the 16 bytes at daddr, and
       * (len + proto) and sum into a 64-bit accumulator with end-around
       * carry. The total is folded to 32 and then to 16 bits, and the
       * complement of that 16-bit value is returned in r3.
       *
       * Clobbers r0, r5 and r8-r11; no stack is used. The loads are
       * interleaved with the adds, and the plain add/rotldi instructions
       * placed inside the addc/adde chain do not modify the carry bit.
  418  */
  419
  420 _GLOBAL(csum_ipv6_magic)
  421         ld      r8, 0(r3)               /* saddr, first doubleword */
  422         ld      r9, 8(r3)               /* saddr, second doubleword */
  423         add     r5, r5, r6              /* r5 = len + proto */
  424         addc    r0, r8, r9              /* start the carry chain */
  425         ld      r10, 0(r4)              /* daddr, first doubleword */
  426         ld      r11, 8(r4)              /* daddr, second doubleword */
  427 #ifdef CONFIG_CPU_LITTLE_ENDIAN
              /*
               * NOTE(review): presumably moves len + proto into the byte
               * lanes the little-endian loads above give network-order
               * data - confirm.
               */
  428         rotldi  r5, r5, 8
  429 #endif
  430         adde    r0, r0, r10
  431         add     r5, r5, r7              /* fold in the caller's sum (not rotated) */
  432         adde    r0, r0, r11
  433         adde    r0, r0, r5
  434         addze   r0, r0                  /* add in final carry */
  435         rotldi  r3, r0, 32              /* fold two 32 bit halves together */
  436         add     r3, r0, r3
  437         srdi    r0, r3, 32              /* r0 = 32-bit folded sum */
  438         rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
  439         add     r3, r0, r3              /* bits 16-31 of the low word hold the 16-bit sum */
  440         not     r3, r3                  /* one's complement */
  441         rlwinm  r3, r3, 16, 16, 31      /* move that halfword down; clear the rest */
  442         blr
  443 EXPORT_SYMBOL(csum_ipv6_magic)