2 * Core of the accelerated CRC algorithm.
3 * In your file, define the constants and CRC_FUNCTION_NAME
4 * Then include this file.
6 * Calculate the checksum of data that is 16 byte aligned and a multiple of 16 bytes.
9 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
10 * chunks in order to mask the latency of the vpmsum instructions. If we
11 * have more than 32 kB of data to checksum we repeat this step multiple
12 * times, passing in the previous 1024 bits.
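 *
 * For reference, vpmsumd on a pair of vector doublewords computes the xor of
 * the two 64x64 carry-less products. A minimal C sketch of that primitive
 * (clmul64() is a hypothetical helper returning the full 128 bit carry-less
 * product, not a real kernel API):
 *
 *	typedef unsigned long long u64;
 *	typedef struct { u64 hi, lo; } u128;
 *	u128 clmul64(u64 a, u64 b);		// carry-less 64x64 -> 128 multiply
 *
 *	static u128 vpmsumd(u128 a, u128 b)
 *	{
 *		u128 p0 = clmul64(a.lo, b.lo);
 *		u128 p1 = clmul64(a.hi, b.hi);
 *		u128 r = { p0.hi ^ p1.hi, p0.lo ^ p1.lo };
 *		return r;
 *	}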
14 * The next step is to reduce the 1024 bits to 64 bits. This step adds
15 * 32 bits of 0s to the end - this matches what a CRC does. We just
16 * calculate constants that land the data in these 32 bits.
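 * (Appending 32 zero bits matches the definition of the CRC: for the
 * non-reflected case the CRC of message M is the remainder of M(x) * x^32
 * divided by the CRC polynomial.)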
18 * We then use fixed point Barrett reduction to compute a mod n over GF(2),
19 * where n is the CRC polynomial, using POWER8 instructions. We use x = 32.
21 * http://en.wikipedia.org/wiki/Barrett_reduction
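 *
 * In outline, with a precomputed constant m = x^64 div n (polynomial
 * division): q = (a * m) div x^64 and r = a xor (q * n), where every
 * multiply is carry-less; the low 32 bits of r are a mod n.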
23 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
25 * This program is free software; you can redistribute it and/or
26 * modify it under the terms of the GNU General Public License
27 * as published by the Free Software Foundation; either version
28 * 2 of the License, or (at your option) any later version.
31 #include <asm/ppc_asm.h>
32 #include <asm/ppc-opcode.h>
34 #define MAX_SIZE 32768
38 #if defined(__BIG_ENDIAN__) && defined(REFLECT)
40 #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
58 #define mask_32bit v27
59 #define mask_64bit v28
63 #define VPERM(A, B, C, D) vperm A, B, C, D
65 #define VPERM(A, B, C, D)
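
/*
 * Depending on the endian/REFLECT configuration above, VPERM() wraps vperm
 * (used here with the byteswap constant) or expands to nothing.
 */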
68 /* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
69 FUNC_START(CRC_FUNCTION_NAME)
87 /* Enough room for saving 10 non-volatile VMX registers */
104 vxor zeroes,zeroes,zeroes
107 vsldoi mask_32bit,zeroes,v0,4
108 vsldoi mask_64bit,zeroes,v0,8
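
/*
 * The two vsldoi ops above build masks that keep only the low 32 / low 64
 * bits of a vector (this assumes v0 holds all 1s at this point).
 */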
110 /* Get the initial value into v8 */
114 vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
116 vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
120 addis r3,r2,.byteswap_constant@toc@ha
121 addi r3,r3,.byteswap_constant@toc@l
132 /* Checksum in blocks of MAX_SIZE */
141 /* our main loop does 128 bytes at a time */
145 * Work out the offset into the constants table to start at. Each
146 * constant is 16 bytes and is used against 128 bytes of input
147 * data - that is, 128 / 16 = 8 chunks per constant.
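 * For a full 32 kB pass that is 32768 / 128 = 256 blocks, i.e. 256 constants.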
153 /* We reduce our final 128 bytes in a separate step */
157 addis r3,r2,.constants@toc@ha
158 addi r3,r3,.constants@toc@l
160 /* Find the start of our constants */
163 /* zero v0-v7 which will contain our checksums */
176 * If we are looping back to consume more data we use the values
177 * already in v16-v23.
182 /* First warm up pass */
185 VPERM(v16,v16,v16,byteswap)
186 VPERM(v17,v17,v17,byteswap)
189 VPERM(v18,v18,v18,byteswap)
190 VPERM(v19,v19,v19,byteswap)
193 VPERM(v20,v20,v20,byteswap)
194 VPERM(v21,v21,v21,byteswap)
197 VPERM(v22,v22,v22,byteswap)
198 VPERM(v23,v23,v23,byteswap)
201 /* xor in initial value */
204 2: bdz .Lfirst_warm_up_done
209 /* Second warm up pass */
210 VPMSUMD(v8,v16,const1)
212 VPERM(v16,v16,v16,byteswap)
215 VPMSUMD(v9,v17,const1)
217 VPERM(v17,v17,v17,byteswap)
220 VPMSUMD(v10,v18,const1)
222 VPERM(v18,v18,v18,byteswap)
225 VPMSUMD(v11,v19,const1)
227 VPERM(v19,v19,v19,byteswap)
230 VPMSUMD(v12,v20,const1)
232 VPERM(v20,v20,v20,byteswap)
235 VPMSUMD(v13,v21,const1)
237 VPERM(v21,v21,v21,byteswap)
240 VPMSUMD(v14,v22,const1)
242 VPERM(v22,v22,v22,byteswap)
245 VPMSUMD(v15,v23,const1)
247 VPERM(v23,v23,v23,byteswap)
251 bdz .Lfirst_cool_down
254 * Main loop. We modulo schedule it such that it takes three iterations
255 * to complete - first iteration load, second iteration vpmsum, third iteration xor.
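 *
 * A rough C-level sketch of that schedule, written per 16 byte chunk (the
 * asm below works on eight chunks at a time); load_chunk() and fold_mul()
 * are hypothetical stand-ins for the lvx and vpmsumd stages, not real
 * helpers:
 *
 *	d[0] = load_chunk(0);				// first warm up pass
 *	d[1] = load_chunk(1);
 *	m[0] = fold_mul(d[0]);				// second warm up pass
 *	for (i = 0; i < n; i++) {
 *		if (i + 2 < n) d[i + 2] = load_chunk(i + 2);	// load
 *		if (i + 1 < n) m[i + 1] = fold_mul(d[i + 1]);	// vpmsum
 *		acc[i % 8] ^= m[i];				// xor
 *	}
 *
 * (The warm up assumes n >= 2; short inputs take a separate path.)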
264 VPMSUMD(v8,v16,const2)
266 VPERM(v16,v16,v16,byteswap)
270 VPMSUMD(v9,v17,const2)
272 VPERM(v17,v17,v17,byteswap)
276 VPMSUMD(v10,v18,const2)
278 VPERM(v18,v18,v18,byteswap)
282 VPMSUMD(v11,v19,const2)
284 VPERM(v19,v19,v19,byteswap)
289 VPMSUMD(v12,v20,const1)
291 VPERM(v20,v20,v20,byteswap)
295 VPMSUMD(v13,v21,const1)
297 VPERM(v21,v21,v21,byteswap)
301 VPMSUMD(v14,v22,const1)
303 VPERM(v22,v22,v22,byteswap)
307 VPMSUMD(v15,v23,const1)
309 VPERM(v23,v23,v23,byteswap)
316 /* First cool down pass */
321 VPMSUMD(v8,v16,const1)
325 VPMSUMD(v9,v17,const1)
329 VPMSUMD(v10,v18,const1)
333 VPMSUMD(v11,v19,const1)
337 VPMSUMD(v12,v20,const1)
341 VPMSUMD(v13,v21,const1)
345 VPMSUMD(v14,v22,const1)
349 VPMSUMD(v15,v23,const1)
353 /* Second cool down pass */
365 * vpmsumd produces a 96 bit result in the least significant bits
366 * of the register. Since we are bit reflected we have to shift it
367 * left 32 bits so it occupies the least significant bits in the
368 * bit reflected domain.
370 vsldoi v0,v0,zeroes,4
371 vsldoi v1,v1,zeroes,4
372 vsldoi v2,v2,zeroes,4
373 vsldoi v3,v3,zeroes,4
374 vsldoi v4,v4,zeroes,4
375 vsldoi v5,v5,zeroes,4
376 vsldoi v6,v6,zeroes,4
377 vsldoi v7,v7,zeroes,4
380 /* xor with last 1024 bits */
383 VPERM(v8,v8,v8,byteswap)
384 VPERM(v9,v9,v9,byteswap)
387 VPERM(v10,v10,v10,byteswap)
388 VPERM(v11,v11,v11,byteswap)
391 VPERM(v12,v12,v12,byteswap)
392 VPERM(v13,v13,v13,byteswap)
395 VPERM(v14,v14,v14,byteswap)
396 VPERM(v15,v15,v15,byteswap)
414 /* Work out how many bytes we have left */
417 /* Calculate where in the constant table we need to start */
421 /* How many 16 byte chunks are in the tail */
426 * Reduce the previously calculated 1024 bits to 64 bits, shifting
427 * 32 bits to include the trailing 32 bits of zeros
448 /* Now reduce the tail (0 - 112 bytes) */
454 VPERM(v16,v16,v16,byteswap)
461 VPERM(v16,v16,v16,byteswap)
468 VPERM(v16,v16,v16,byteswap)
475 VPERM(v16,v16,v16,byteswap)
482 VPERM(v16,v16,v16,byteswap)
489 VPERM(v16,v16,v16,byteswap)
496 VPERM(v16,v16,v16,byteswap)
500 /* Now xor all the parallel chunks together */
512 /* Barrett constants */
513 addis r3,r2,.barrett_constants@toc@ha
514 addi r3,r3,.barrett_constants@toc@l
520 vxor v0,v0,v1 /* xor two 64 bit results together */
523 /* shift left one bit */
528 vand v0,v0,mask_64bit
531 * Now for the Barrett reduction algorithm. The idea is to calculate q,
532 * the multiple of our polynomial that we need to subtract. By
533 * doing the computation 2x bits higher (i.e. 64 bits) and shifting the
534 * result back down 2x bits, we round down to the nearest multiple.
536 VPMSUMD(v1,v0,const1) /* ma */
537 vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
538 VPMSUMD(v1,v1,const2) /* qn */
539 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
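
/*
 * A C sketch of the four instructions above, reusing the hypothetical
 * clmul64()/u128 helpers from the header comment. It assumes const1 holds
 * m = x^64 div n and const2 holds the CRC polynomial n:
 *
 *	static unsigned int barrett(u64 a, u64 m, u64 n)
 *	{
 *		u128 t = clmul64(a, m);		// ma
 *		u64 q = t.hi;			// q = floor(ma / 2^64)
 *		t = clmul64(q, n);		// qn
 *		return (unsigned int)(a ^ t.lo);	// a - qn; a mod n is in the low 32 bits
 *	}
 */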
542 * Get the result into r3. We need to shift it left 8 bytes:
546 vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
549 * The reflected version of Barrett reduction. Instead of bit
550 * reflecting our data (which is expensive to do), we bit reflect our
551 * constants and our algorithm, which means the intermediate data in
552 * our vector registers goes from 0-63 instead of 63-0. We can reflect
553 * the algorithm because we don't carry in mod 2 arithmetic.
555 vand v1,v0,mask_32bit /* bottom 32 bits of a */
556 VPMSUMD(v1,v1,const1) /* ma */
557 vand v1,v1,mask_32bit /* bottom 32bits of ma */
558 VPMSUMD(v1,v1,const2) /* qn */
559 vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
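
/*
 * The same reduction in C, mirroring the five instructions above (again
 * using the hypothetical clmul64(); here const1/const2 are assumed to hold
 * bit reflected versions of the Barrett constant and the polynomial):
 *
 *	static unsigned int barrett_reflected(u64 a, u64 m, u64 n)
 *	{
 *		u128 t = clmul64(a & 0xffffffffULL, m);	// ma, bottom 32 bits of a
 *		t = clmul64(t.lo & 0xffffffffULL, n);	// qn, bottom 32 bits of ma
 *		return (unsigned int)((a ^ t.lo) >> 32);	// result lands in the high 32 bits
 *	}
 */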
562 * Since we are bit reflected, the result (i.e. the low 32 bits) is in
563 * the high 32 bits. We just need to shift it left 4 bytes
567 vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits */
598 .Lfirst_warm_up_done:
602 VPMSUMD(v8,v16,const1)
603 VPMSUMD(v9,v17,const1)
604 VPMSUMD(v10,v18,const1)
605 VPMSUMD(v11,v19,const1)
606 VPMSUMD(v12,v20,const1)
607 VPMSUMD(v13,v21,const1)
608 VPMSUMD(v14,v22,const1)
609 VPMSUMD(v15,v23,const1)
617 addis r3,r2,.short_constants@toc@ha
618 addi r3,r3,.short_constants@toc@l
620 /* Calculate where in the constant table we need to start */
624 /* How many 16 byte chunks? */
633 VPERM(v0,v0,v16,byteswap)
634 vxor v0,v0,v8 /* xor in initial value */
640 VPERM(v1,v1,v17,byteswap)
646 VPERM(v2,v2,v16,byteswap)
652 VPERM(v3,v3,v17,byteswap)
658 VPERM(v4,v4,v16,byteswap)
664 VPERM(v5,v5,v17,byteswap)
670 VPERM(v6,v6,v16,byteswap)
676 VPERM(v7,v7,v17,byteswap)
685 VPERM(v8,v8,v16,byteswap)
691 VPERM(v9,v9,v17,byteswap)
697 VPERM(v10,v10,v16,byteswap)
703 VPERM(v11,v11,v17,byteswap)
709 VPERM(v12,v12,v16,byteswap)
715 VPERM(v13,v13,v17,byteswap)
721 VPERM(v14,v14,v16,byteswap)
727 VPERM(v15,v15,v17,byteswap)
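
/*
 * The .LvN labels below form a fall-through ladder: the tail code branches
 * into it at the label matching the number of 16 byte chunks handled, and
 * the per-chunk products are xored alternately into v19 and v20 (presumably
 * to keep two independent dependency chains) before the final combine.
 */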
730 .Lv15: vxor v19,v19,v15
731 .Lv14: vxor v20,v20,v14
732 .Lv13: vxor v19,v19,v13
733 .Lv12: vxor v20,v20,v12
734 .Lv11: vxor v19,v19,v11
735 .Lv10: vxor v20,v20,v10
736 .Lv9: vxor v19,v19,v9
737 .Lv8: vxor v20,v20,v8
738 .Lv7: vxor v19,v19,v7
739 .Lv6: vxor v20,v20,v6
740 .Lv5: vxor v19,v19,v5
741 .Lv4: vxor v20,v20,v4
742 .Lv3: vxor v19,v19,v3
743 .Lv2: vxor v20,v20,v2
744 .Lv1: vxor v19,v19,v1
745 .Lv0: vxor v20,v20,v0
749 b .Lbarrett_reduction
755 FUNC_END(CRC_FUNCTION_NAME)