/* SPDX-License-Identifier: GPL-2.0-or-later */
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 * Calculate the checksum of data that is 16-byte aligned and a multiple of
 * 16 bytes in length.
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 * https://en.wikipedia.org/wiki/Barrett_reduction
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
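 *
 * For reference only (not part of the original description): in scalar C,
 * the REFLECT flavour of the checksum computed here is the usual
 * bit-at-a-time loop below.  POLY_LE is a stand-in for whichever reflected
 * polynomial the including file selects (e.g. 0xedb88320 for CRC32); the
 * name is illustrative only.
 *
 *	unsigned int crc_bitwise(unsigned int crc, const unsigned char *p,
 *				 unsigned long len)
 *	{
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ (POLY_LE & -(crc & 1));
 *		}
 *		return crc;
 *	}
 *
 * The code below produces the same result, but folds 128 bytes per
 * iteration with vpmsum instead of one bit at a time.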
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>
#define MAX_SIZE 32768
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define mask_32bit v27
#define mask_64bit v28
#define VPERM(A, B, C, D) vperm A, B, C, D
#define VPERM(A, B, C, D)
/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	/* Enough room for saving 10 non-volatile VMX registers */
	vxor zeroes,zeroes,zeroes
	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8
	/* Get the initial value into v8 */
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
	addis r3,r2,.byteswap_constant@toc@ha
	addi r3,r3,.byteswap_constant@toc@l
	/* Checksum in blocks of MAX_SIZE */
	/* our main loop does 128 bytes at a time */
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
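	 *
	 * (Illustrative arithmetic, not from the original comment: with
	 * MAX_SIZE = 32768 and one 16-byte constant per 128 bytes of input,
	 * a full .constants table would hold 32768 / 128 = 256 entries,
	 * i.e. 4 kB.  A shorter buffer starts further into the table so
	 * that the last constants line up with the last bytes of data.)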
	/* We reduce our final 128 bytes in a separate step */
	addis r3,r2,.constants@toc@ha
	addi r3,r3,.constants@toc@l
	/* Find the start of our constants */
	/* zero v0-v7 which will contain our checksums */
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	/* First warm up pass */
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	/* xor in initial value */
2:	bdz .Lfirst_warm_up_done
	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	VPERM(v16,v16,v16,byteswap)
	VPMSUMD(v9,v17,const1)
	VPERM(v17,v17,v17,byteswap)
	VPMSUMD(v10,v18,const1)
	VPERM(v18,v18,v18,byteswap)
	VPMSUMD(v11,v19,const1)
	VPERM(v19,v19,v19,byteswap)
	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)
	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)
	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)
	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)
	bdz .Lfirst_cool_down
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
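	 *
	 * Illustration only (load(), fold() and nblocks are made-up names
	 * for this sketch): the same three-stage software pipeline written
	 * in scalar C, with fold() standing in for the vpmsum step and
	 * load() for the vector load:
	 *
	 *	static u64 pipelined(long nblocks)	// nblocks >= 2
	 *	{
	 *		u64 acc = 0, loaded, product;
	 *
	 *		loaded  = load(0);		// warm up: load only
	 *		product = fold(loaded);		// warm up: first multiply,
	 *		loaded  = load(1);		// and load the next block
	 *
	 *		for (long i = 2; i < nblocks; i++) {
	 *			acc    ^= product;	// xor in block i - 2
	 *			product = fold(loaded);	// multiply block i - 1
	 *			loaded  = load(i);	// load block i
	 *		}
	 *
	 *		acc ^= product;			// cool down: xor last product
	 *		acc ^= fold(loaded);		// cool down: finish final block
	 *		return acc;
	 *	}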
	VPMSUMD(v8,v16,const2)
	VPERM(v16,v16,v16,byteswap)
	VPMSUMD(v9,v17,const2)
	VPERM(v17,v17,v17,byteswap)
	VPMSUMD(v10,v18,const2)
	VPERM(v18,v18,v18,byteswap)
	VPMSUMD(v11,v19,const2)
	VPERM(v19,v19,v19,byteswap)
	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)
	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)
	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)
	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)
	/* First cool down pass */
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)
	/* Second cool down pass */
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
	/* xor with last 1024 bits */
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)
	/* Work out how many bytes we have left */
	/* Calculate where in the constant table we need to start */
	/* How many 16 byte chunks are in the tail */
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
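	 * (For reference: a 32-bit CRC of a message M is M(x) * x^32 mod P(x)
	 * over GF(2), which is why 32 zero bits are appended before this
	 * final reduction, matching the "adds 32 bits of 0s" step described
	 * at the top of the file.)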
	/* Now reduce the tail (0 - 112 bytes) */
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	VPERM(v16,v16,v16,byteswap)
	/* Now xor all the parallel chunks together */
	/* Barrett constants */
	addis r3,r2,.barrett_constants@toc@ha
	addi r3,r3,.barrett_constants@toc@l
	vxor v0,v0,v1	/* xor two 64 bit results together */
	/* shift left one bit */
	vand v0,v0,mask_64bit
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
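	/*
	 * A minimal C sketch of the reduction above (illustrative only; the
	 * helper and type names are made up).  clmul() is a carry-less
	 * multiply, i.e. what one doubleword half of vpmsumd computes.  n is
	 * the CRC polynomial including the x^32 term (const2) and
	 * m = floor(x^64 / n) (const1).
	 *
	 *	typedef unsigned long long u64;
	 *	typedef unsigned __int128 u128;
	 *
	 *	static u128 clmul(u64 a, u64 b)
	 *	{
	 *		u128 r = 0;
	 *
	 *		for (int i = 0; i < 64; i++)
	 *			if ((b >> i) & 1)
	 *				r ^= (u128)a << i;
	 *		return r;
	 *	}
	 *
	 *	static u64 barrett(u64 a, u64 n, u64 m)
	 *	{
	 *		u64 q = (u64)(clmul(a, m) >> 64);	// q = floor(ma / 2^64)
	 *
	 *		return a ^ (u64)clmul(q, n);		// a - qn = a mod n
	 *	}
	 */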
	 * Get the result into r3. We need to shift it left 8 bytes:
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
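	/*
	 * Illustration only: a C transliteration of the five instructions
	 * above, reusing the hypothetical clmul() helper from the sketch of
	 * the non-reflected path.  Here const1 and const2 stand for the
	 * bit-reflected Barrett constants and a is the 64-bit value in v0.
	 *
	 *	u64 t = a & 0xffffffff;		// bottom 32 bits of a
	 *	t = (u64)clmul(t, const1);	// ma
	 *	t &= 0xffffffff;		// bottom 32 bits of ma
	 *	t = (u64)clmul(t, const2);	// qn
	 *	a ^= t;				// a - qn, xor in GF(2)
	 *	// the 32-bit result now sits in the high half of a,
	 *	// which is what the shift below extracts
	 */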
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes.
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits */
.Lfirst_warm_up_done:
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)
	addis r3,r2,.short_constants@toc@ha
	addi r3,r3,.short_constants@toc@l
	/* Calculate where in the constant table we need to start */
	/* How many 16 byte chunks? */
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPERM(v1,v1,v17,byteswap)
	VPERM(v2,v2,v16,byteswap)
	VPERM(v3,v3,v17,byteswap)
	VPERM(v4,v4,v16,byteswap)
	VPERM(v5,v5,v17,byteswap)
	VPERM(v6,v6,v16,byteswap)
	VPERM(v7,v7,v17,byteswap)
	VPERM(v8,v8,v16,byteswap)
	VPERM(v9,v9,v17,byteswap)
	VPERM(v10,v10,v16,byteswap)
	VPERM(v11,v11,v17,byteswap)
	VPERM(v12,v12,v16,byteswap)
	VPERM(v13,v13,v17,byteswap)
	VPERM(v14,v14,v16,byteswap)
	VPERM(v15,v15,v17,byteswap)
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0
	b .Lbarrett_reduction
FUNC_END(CRC_FUNCTION_NAME)