/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *          Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
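/*
 * For reference, the folded result computed below is meant to match a plain
 * bit-at-a-time CRC32-LE over the same data with the same starting value.
 * A minimal C sketch of that reference (illustrative only, not part of the
 * kernel sources; the function name is made up):
 *
 *	static unsigned int crc32_le_ref(unsigned int crc,
 *					 const unsigned char *p, size_t len)
 *	{
 *		int i;
 *
 *		while (len--) {
 *			crc ^= *p++;
 *			for (i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
 *		}
 *		return crc;
 *	}
 *
 * The CRC32C variant is identical except for the polynomial:
 * 0x1EDC6F41 (BE) / 0x82F63B78 (LE).
 */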
#include <linux/linkage.h>
#include <asm/assembler.h>

	.section	".rodata", "a"
	.cpu		generic+crypto+crc
.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1  = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1  = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4
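	/*
	 * R2:R1 above are the "fold by 4" coefficients: x^(4*128-32) and
	 * x^(4*128+32) reduced mod P(x), stored as one 128-bit little-endian
	 * value with R1 in the low 64 bits and R2 in the high 64 bits. They
	 * are used by loop_64 below to fold each 128-bit lane into the data
	 * 64 bytes further on.
	 */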
	/*
	 * [(x128+32 mod P(x) << 32)]' << 1  = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]' << 1  = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0
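	/*
	 * R4:R3 (x^(128-32) and x^(128+32) mod P(x), same layout as above)
	 * are the "fold by 1" coefficients: they advance a single 128-bit
	 * value by 16 bytes and are used by less_64 and loop_16 below.
	 */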
	/*
	 * [(x64 mod P(x) << 32)]' << 1  = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF
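	/*
	 * R5 = x^64 mod P(x) is used for the final 32-bit fold; the
	 * 0x00000000ffffffff quadword that follows is a mask used to isolate
	 * the low 32 bits of a lane (it is loaded into v3 for the final
	 * folds).
	 */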
	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641
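	/*
	 * The Barrett pair: the low 64 bits hold P'(x), the bit-reversed CRC32
	 * polynomial including the x^32 term, and the high 64 bits hold
	 * u' = (x^64 / P(x))', also bit-reversed. Together they reduce the
	 * remaining 64-bit value to the final 32-bit CRC without a division.
	 */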
.Lcrc32c_constants:
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0
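	/*
	 * The same five constants as above, recomputed for the CRC32C
	 * (Castagnoli) polynomial 0x1EDC6F41 (BE) / 0x82F63B78 (LE), and laid
	 * out identically so both entry points can index them with the same
	 * offsets from CONST.
	 */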
	/*
	 * Calculate CRC32/CRC32C by 128-bit polynomial folding:
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 *
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
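	/*
	 * A hypothetical call from C glue code (illustrative only; the real
	 * callers are not shown in this file):
	 *
	 *	crc = crc32_pmull_le(buf, len, crc);	// len % 16 == 0, len > 63
	 *	crc = crc32c_pmull_le(buf, len, crc);	// same constraints
	 */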
ENTRY(crc32_pmull_le)
	adr_l		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr_l		x3, .Lcrc32c_constants
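	/*
	 * Shared body for both entry points: x3 holds the address of the
	 * selected constant table, which the body accesses through CONST.
	 * 64 bytes of input are loaded into v1-v4 and the seed CRC, held in
	 * the low lanes of vCONSTANT, is XORed into the first 16 bytes;
	 * vCONSTANT is then loaded with the fold-by-4 pair R2:R1.
	 */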
0:	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	eor		v1.16b, v1.16b, vCONSTANT.16b

	ldr		qCONSTANT, [CONST]
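	/*
	 * Main loop: each of the four 128-bit accumulators has its high half
	 * carry-less multiplied by R2 and its low half by R1, the two
	 * products are XORed together, and the result is XORed with the next
	 * 64 bytes of input. This folds the CRC state forward by one cache
	 * line per iteration.
	 */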
loop_64:	/* 64 bytes Full cache line folding */

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d

	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b
	/* yield the NEON unit to the scheduler if required, preserving state */
	if_will_cond_yield_neon
	stp		q1, q2, [sp, #.Lframe_local_offset]
	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
	do_cond_yield_neon
	ldp		q1, q2, [sp, #.Lframe_local_offset]
	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
	ldr		qCONSTANT, [CONST]
	endif_yield_neon
	b		loop_64
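	/*
	 * Fewer than 64 bytes remain: collapse the 512 bits of state in v1-v4
	 * into v1 alone by folding v1 into v2, then v3, then v4, using the
	 * fold-by-1 pair R4:R3 loaded from CONST + 16.
	 */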
less_64:	/* Folding cache line into 128 bits */
	ldr		qCONSTANT, [CONST, #16]

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b
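	/*
	 * Any remaining 16-byte blocks are folded into v1 one at a time with
	 * the same R4:R3 pair.
	 */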
loop_16:	/* Folding rest of buffer into 128 bits */

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b
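	/*
	 * Final reduction: a last 64-bit fold that appends 32 zero bits (the
	 * x^32 factor in CRC = M(x) * x^32 mod P(x)), a 32-bit fold using
	 * R5 = x^64 mod P(x), and a Barrett reduction down to 32 bits.
	 */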
	/*
	 * Perform the last 64-bit fold; this also appends 32 zero bits to the
	 * input stream.
	 */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b
	/* final 32-bit fold */
	ldr		dCONSTANT, [CONST, #32]
	ldr		d3, [CONST, #40]

	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
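	/*
	 * The low 32 bits of v1 are isolated with the mask in v3, carry-less
	 * multiplied by R5 and XORed with the rest of the remainder (v1
	 * shifted down by 32 bits in v2), leaving a 64-bit value.
	 */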
	/* Finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
	ldr		qCONSTANT, [CONST, #48]

	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
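	/*
	 * Bit-reflected Barrett reduction: T1 = (v1 mod x^32) * u', then
	 * T2 = (T1 mod x^32) * P', and v1 ^= T2. The low 32 bits cancel and
	 * the 32-bit CRC ends up in bits [63:32] of v1 (lane v1.s[1]).
	 */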
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
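/*
 * The remaining code uses the ARMv8 CRC32 extension instructions rather than
 * polynomial multiplication. The macro below consumes 16 bytes per iteration
 * with a pair of 64-bit CRC instructions and handles any 8/4/2/1-byte tail
 * with the narrower forms; the \c argument selects between the crc32 and
 * crc32c flavours of the instructions. CPU_BE byte-reverses loaded values
 * first, since the CRC instructions treat the register contents as
 * little-endian data.
 */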
	.macro		__crc32, c
	ldp		x3, x4, [x1], #16
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4

	crc32\c\()x	w0, w0, x3

	crc32\c\()w	w0, w0, w3

CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3

	crc32\c\()b	w0, w0, w3
	.endm
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)
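/*
 * Unlike crc32_pmull_le/crc32c_pmull_le above, which expect a length that is
 * a multiple of 16 bytes and larger than 63, these CRC-instruction based
 * routines handle buffers of any length, so callers can presumably fall back
 * to them for short or odd-sized inputs (an assumption about the glue code,
 * which is not part of this file).
 */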