/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME.
 * Then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
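
/*
 * For reference, a rough scalar C model of what this routine computes: the
 * same CRC, one byte and one bit at a time, with no folding. The reflected
 * variant is shown, and the polynomial below is only an illustrative
 * stand-in (the reflected CRC32 polynomial); the real polynomial is fixed
 * by the constants the including file provides, and the function name and
 * types here are hypothetical. Any initial/final inversion is the caller's
 * concern, just as it is for this routine.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	uint32_t crc_scalar(uint32_t crc, const uint8_t *p, size_t len)
 *	{
 *		const uint32_t poly = 0xEDB88320;	// example only
 *
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)	// one bit at a time
 *				crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
 *		}
 *		return crc;
 *	}
 *
 * The non-reflected variant shifts left and tests the top bit instead.
 */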
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#endif

#define mask_32bit	v27
#define mask_64bit	v28

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	/* Enough room for saving 10 non-volatile VMX registers */

	vxor	zeroes,zeroes,zeroes

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

	LOAD_REG_ADDR(r3, .byteswap_constant)

	/* Checksum in blocks of MAX_SIZE */

	/* our main loop does 128 bytes at a time */

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8.
	 */
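
	/*
	 * Worked example of that arithmetic (numbers only, the table layout
	 * itself is unchanged): a full MAX_SIZE block is 32768 bytes, i.e.
	 * 32768 / 128 = 256 chunks per pass of the main loop, consuming
	 * 256 * 16 = 4096 bytes (32768 / 8) of the constants table - one
	 * 16 byte constant per 128 byte chunk.
	 */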
	/* We reduce our final 128 bytes in a separate step */

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */

	/* zero v0-v7 which will contain our checksums */

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */

	/* First warm up pass */
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)

	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)

	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)

	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)

	/* xor in initial value */

2:	bdz	.Lfirst_warm_up_done
	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	VPERM(v16,v16,v16,byteswap)

	VPMSUMD(v9,v17,const1)
	VPERM(v17,v17,v17,byteswap)

	VPMSUMD(v10,v18,const1)
	VPERM(v18,v18,v18,byteswap)

	VPMSUMD(v11,v19,const1)
	VPERM(v19,v19,v19,byteswap)

	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)

	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)

	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)

	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)

	bdz	.Lfirst_cool_down

	/*
	 * The main loop. We modulo schedule it such that it takes three
	 * iterations to complete - first iteration load, second iteration
	 * vpmsum, third iteration xor.
	 */
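
	/*
	 * A rough C model of that schedule, for one of the eight streams
	 * (everything here - the function, clmul(), the arrays - is
	 * illustrative only): each trip through the steady-state loop does
	 * the xor for data loaded two trips ago, the carryless multiply for
	 * data loaded one trip ago, and the load for the current trip, which
	 * is what hides the vpmsum latency.
	 *
	 *	#include <stdint.h>
	 *	#include <stddef.h>
	 *
	 *	// Software carryless multiply, truncated to 64 bits.
	 *	static uint64_t clmul(uint64_t a, uint64_t b)
	 *	{
	 *		uint64_t r = 0;
	 *
	 *		for (; b; b >>= 1, a <<= 1)
	 *			if (b & 1)
	 *				r ^= a;
	 *		return r;
	 *	}
	 *
	 *	// XOR of clmul(data[i], k[i]) for one stream, modulo scheduled.
	 *	uint64_t fold_stream(const uint64_t *data, const uint64_t *k,
	 *			     size_t n)
	 *	{
	 *		uint64_t acc = 0, prod, cur;
	 *		size_t i;
	 *
	 *		if (!n)
	 *			return 0;
	 *
	 *		// warm up: multiply trip 0, load trip 1
	 *		prod = clmul(data[0], k[0]);
	 *		cur = (n > 1) ? data[1] : 0;
	 *
	 *		// steady state: xor (i-2), multiply (i-1), load (i)
	 *		for (i = 2; i < n; i++) {
	 *			acc ^= prod;
	 *			prod = clmul(cur, k[i - 1]);
	 *			cur = data[i];
	 *		}
	 *
	 *		// cool down: drain the last two trips
	 *		acc ^= prod;
	 *		if (n > 1)
	 *			acc ^= clmul(cur, k[n - 1]);
	 *		return acc;
	 *	}
	 */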
	VPMSUMD(v8,v16,const2)
	VPERM(v16,v16,v16,byteswap)

	VPMSUMD(v9,v17,const2)
	VPERM(v17,v17,v17,byteswap)

	VPMSUMD(v10,v18,const2)
	VPERM(v18,v18,v18,byteswap)

	VPMSUMD(v11,v19,const2)
	VPERM(v19,v19,v19,byteswap)

	VPMSUMD(v12,v20,const1)
	VPERM(v20,v20,v20,byteswap)

	VPMSUMD(v13,v21,const1)
	VPERM(v21,v21,v21,byteswap)

	VPMSUMD(v14,v22,const1)
	VPERM(v22,v22,v22,byteswap)

	VPMSUMD(v15,v23,const1)
	VPERM(v23,v23,v23,byteswap)
.Lfirst_cool_down:
	/* First cool down pass */
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	/* Second cool down pass */

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)

	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)

	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)

	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)
	/* Work out how many bytes we have left */

	/* Calculate where in the constant table we need to start */

	/* How many 16 byte chunks are in the tail */

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros.
	 */
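
	/*
	 * (Why the trailing zeros: a 32-bit CRC is the remainder of
	 * M(x) * x^32 mod P(x), i.e. the message polynomial shifted up by
	 * 32 bits before the division, so the constants used here are
	 * chosen to land the data in those 32 bits, as noted at the top of
	 * the file.)
	 */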
	/* Now reduce the tail (0 - 112 bytes) */

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	VPERM(v16,v16,v16,byteswap)

	/* Now xor all the parallel chunks together */
.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	vxor	v0,v0,v1	/* xor two 64 bit results together */

	/* shift left one bit */

	vand	v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (i.e. 64 bits) and shifting
	 * the result back down 2x bits, we round down to the nearest
	 * multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes.
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
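
	/*
	 * A C model of the reduction above (the names, and the clmul()
	 * helper from the earlier sketch, are illustrative; m and n stand
	 * for the values encoded in .barrett_constants, so this shows the
	 * arithmetic only, not the exact table encoding, and the final
	 * vsldoi is replaced by a plain truncation):
	 *
	 *	// High 64 bits of the 128-bit carryless product a * b.
	 *	static uint64_t clmul_hi(uint64_t a, uint64_t b)
	 *	{
	 *		uint64_t r = 0;
	 *		int i;
	 *
	 *		for (i = 1; i < 64; i++)
	 *			if ((b >> i) & 1)
	 *				r ^= a >> (64 - i);
	 *		return r;
	 *	}
	 *
	 *	// a: 64-bit value to reduce, n: CRC polynomial, m = floor(x^64 / n).
	 *	uint32_t barrett(uint64_t a, uint64_t m, uint64_t n)
	 *	{
	 *		uint64_t q = clmul_hi(a, m);	// ma, shifted down 64 bits
	 *
	 *		return (uint32_t)(a ^ clmul(q, n));	// a - qn in GF(2)
	 *	}
	 */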
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes.
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif
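
/*
 * And a C model of the reflected variant (same caveats as above; clmul()
 * is the helper from the earlier sketch, and m and n stand for the
 * reflected .barrett_constants):
 *
 *	uint32_t barrett_reflected(uint64_t a, uint64_t m, uint64_t n)
 *	{
 *		uint64_t q;
 *
 *		q = clmul(a & 0xffffffff, m) & 0xffffffff; // bottom 32 bits of ma
 *		return (uint32_t)((a ^ clmul(q, n)) >> 32); // result in bits 32-63
 *	}
 */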
.Lfirst_warm_up_done:
	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	/* How many 16 byte chunks? */

	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */

	VPERM(v1,v1,v17,byteswap)

	VPERM(v2,v2,v16,byteswap)

	VPERM(v3,v3,v17,byteswap)

	VPERM(v4,v4,v16,byteswap)

	VPERM(v5,v5,v17,byteswap)

	VPERM(v6,v6,v16,byteswap)

	VPERM(v7,v7,v17,byteswap)

	VPERM(v8,v8,v16,byteswap)

	VPERM(v9,v9,v17,byteswap)

	VPERM(v10,v10,v16,byteswap)

	VPERM(v11,v11,v17,byteswap)

	VPERM(v12,v12,v16,byteswap)

	VPERM(v13,v13,v17,byteswap)

	VPERM(v14,v14,v16,byteswap)

	VPERM(v15,v15,v17,byteswap)
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	b	.Lbarrett_reduction

FUNC_END(CRC_FUNCTION_NAME)