/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME.
 * Then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
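
/*
 * For reference, a rough scalar C model of what this routine computes: the
 * same CRC, one byte and one bit at a time, with no folding. The reflected
 * variant is shown, and the polynomial below is only an illustrative
 * stand-in (the reflected CRC32 polynomial); the real polynomial is fixed
 * by the constants the including file provides, and the function name and
 * types here are hypothetical. Any initial/final inversion is the caller's
 * concern, just as it is for this routine.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	uint32_t crc_scalar(uint32_t crc, const uint8_t *p, size_t len)
 *	{
 *		const uint32_t poly = 0xEDB88320;	// example only
 *
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)	// one bit at a time
 *				crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
 *		}
 *		return crc;
 *	}
 *
 * The non-reflected variant shifts left and tests the top bit instead.
 */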
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#endif

#define mask_32bit	v27
#define mask_64bit	v28

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	/* Enough room for saving 10 non-volatile VMX registers */

	vxor	zeroes,zeroes,zeroes

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

	LOAD_REG_ADDR(r3, .byteswap_constant)

	/* Checksum in blocks of MAX_SIZE */

	/* our main loop does 128 bytes at a time */

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8.
	 */
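
	/*
	 * Worked example of that arithmetic (numbers only, the table layout
	 * itself is unchanged): a full MAX_SIZE block is 32768 bytes, i.e.
	 * 32768 / 128 = 256 chunks per pass of the main loop, consuming
	 * 256 * 16 = 4096 bytes (32768 / 8) of the constants table - one
	 * 16 byte constant per 128 byte chunk.
	 */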
	/* We reduce our final 128 bytes in a separate step */

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */

	/* zero v0-v7 which will contain our checksums */

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */

	/* First warm up pass */
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)

	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)

	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)

	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)

	/* xor in initial value */

2:	bdz	.Lfirst_warm_up_done
	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	VPERM(v16,v16,v16,byteswap)

	VPMSUMD(v9,v17,const1)
	VPERM(v17,v17,v17,byteswap)

	VPMSUMD(v10,v18,const1)
	VPERM(v18,v18,v18,byteswap)

	VPMSUMD(v11,v19,const1)
	VPERM(v19,v19,v19,byteswap)

	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)

	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)

	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)

	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)

	bdz	.Lfirst_cool_down

	/*
	 * The main loop. We modulo schedule it such that it takes three
	 * iterations to complete - first iteration load, second iteration
	 * vpmsum, third iteration xor.
	 */
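
	/*
	 * A rough C model of that schedule, for one of the eight streams
	 * (everything here - the function, clmul(), the arrays - is
	 * illustrative only): each trip through the steady-state loop does
	 * the xor for data loaded two trips ago, the carryless multiply for
	 * data loaded one trip ago, and the load for the current trip, which
	 * is what hides the vpmsum latency.
	 *
	 *	#include <stdint.h>
	 *	#include <stddef.h>
	 *
	 *	// Software carryless multiply, truncated to 64 bits.
	 *	static uint64_t clmul(uint64_t a, uint64_t b)
	 *	{
	 *		uint64_t r = 0;
	 *
	 *		for (; b; b >>= 1, a <<= 1)
	 *			if (b & 1)
	 *				r ^= a;
	 *		return r;
	 *	}
	 *
	 *	// XOR of clmul(data[i], k[i]) for one stream, modulo scheduled.
	 *	uint64_t fold_stream(const uint64_t *data, const uint64_t *k,
	 *			     size_t n)
	 *	{
	 *		uint64_t acc = 0, prod, cur;
	 *		size_t i;
	 *
	 *		if (!n)
	 *			return 0;
	 *
	 *		// warm up: multiply trip 0, load trip 1
	 *		prod = clmul(data[0], k[0]);
	 *		cur = (n > 1) ? data[1] : 0;
	 *
	 *		// steady state: xor (i-2), multiply (i-1), load (i)
	 *		for (i = 2; i < n; i++) {
	 *			acc ^= prod;
	 *			prod = clmul(cur, k[i - 1]);
	 *			cur = data[i];
	 *		}
	 *
	 *		// cool down: drain the last two trips
	 *		acc ^= prod;
	 *		if (n > 1)
	 *			acc ^= clmul(cur, k[n - 1]);
	 *		return acc;
	 *	}
	 */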
	VPMSUMD(v8,v16,const2)
	VPERM(v16,v16,v16,byteswap)

	VPMSUMD(v9,v17,const2)
	VPERM(v17,v17,v17,byteswap)

	VPMSUMD(v10,v18,const2)
	VPERM(v18,v18,v18,byteswap)

	VPMSUMD(v11,v19,const2)
	VPERM(v19,v19,v19,byteswap)

	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)

	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)

	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)

	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)
.Lfirst_cool_down:
	/* First cool down pass */
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	/* Second cool down pass */

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)

	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)

	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)

	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)
	/* Work out how many bytes we have left */

	/* Calculate where in the constant table we need to start */

	/* How many 16 byte chunks are in the tail */

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros.
	 */
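
	/*
	 * (Why the trailing zeros: a 32-bit CRC is the remainder of
	 * M(x) * x^32 mod P(x), i.e. the message polynomial shifted up by
	 * 32 bits before the division, so the constants used here are
	 * chosen to land the data in those 32 bits, as noted at the top of
	 * the file.)
	 */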
	/* Now reduce the tail (0 - 112 bytes) */

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	/* Now xor all the parallel chunks together */
.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	vxor	v0,v0,v1	/* xor two 64 bit results together */

	/* shift left one bit */

	vand	v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (i.e. 64 bits) and shifting
	 * the result back down 2x bits, we round down to the nearest
	 * multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes.
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
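
	/*
	 * A C model of the reduction above (the names, and the clmul()
	 * helper from the earlier sketch, are illustrative; m and n stand
	 * for the values encoded in .barrett_constants, so this shows the
	 * arithmetic only, not the exact table encoding, and the final
	 * vsldoi is replaced by a plain truncation):
	 *
	 *	// High 64 bits of the 128-bit carryless product a * b.
	 *	static uint64_t clmul_hi(uint64_t a, uint64_t b)
	 *	{
	 *		uint64_t r = 0;
	 *		int i;
	 *
	 *		for (i = 1; i < 64; i++)
	 *			if ((b >> i) & 1)
	 *				r ^= a >> (64 - i);
	 *		return r;
	 *	}
	 *
	 *	// a: 64-bit value to reduce, n: CRC polynomial, m = floor(x^64 / n).
	 *	uint32_t barrett(uint64_t a, uint64_t m, uint64_t n)
	 *	{
	 *		uint64_t q = clmul_hi(a, m);	// ma, shifted down 64 bits
	 *
	 *		return (uint32_t)(a ^ clmul(q, n));	// a - qn in GF(2)
	 *	}
	 */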
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes.
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif
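
/*
 * And a C model of the reflected variant (same caveats as above; clmul()
 * is the helper from the earlier sketch, and m and n stand for the
 * reflected .barrett_constants):
 *
 *	uint32_t barrett_reflected(uint64_t a, uint64_t m, uint64_t n)
 *	{
 *		uint64_t q;
 *
 *		q = clmul(a & 0xffffffff, m) & 0xffffffff; // bottom 32 bits of ma
 *		return (uint32_t)((a ^ clmul(q, n)) >> 32); // result in bits 32-63
 *	}
 */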
.Lfirst_warm_up_done:
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	/* How many 16 byte chunks? */

	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */

	VPERM(v1,v1,v17,byteswap)

	VPERM(v2,v2,v16,byteswap)

	VPERM(v3,v3,v17,byteswap)

	VPERM(v4,v4,v16,byteswap)

	VPERM(v5,v5,v17,byteswap)

	VPERM(v6,v6,v16,byteswap)

	VPERM(v7,v7,v17,byteswap)

	VPERM(v8,v8,v16,byteswap)

	VPERM(v9,v9,v17,byteswap)

	VPERM(v10,v10,v16,byteswap)

	VPERM(v11,v11,v17,byteswap)

	VPERM(v12,v12,v16,byteswap)

	VPERM(v13,v13,v17,byteswap)

	VPERM(v14,v14,v16,byteswap)

	VPERM(v15,v15,v17,byteswap)
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	b	.Lbarrett_reduction

FUNC_END(CRC_FUNCTION_NAME)