Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
[cris-mirror.git] / arch / arm / crypto / crc32-ce-core.S
blob5cbd4a6fedad7cb3c99ed35295b77f554d967434
1 /*
2  * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
3  *
4  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
11 /* GPL HEADER START
12  *
13  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License version 2 only,
17  * as published by the Free Software Foundation.
18  *
19  * This program is distributed in the hope that it will be useful, but
20  * WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  * General Public License version 2 for more details (a copy is included
23  * in the LICENSE file that accompanied this code).
24  *
25  * You should have received a copy of the GNU General Public License
26  * version 2 along with this program; If not, see http://www.gnu.org/licenses
27  *
28  * Please  visit http://www.xyratex.com/contact if you need additional
29  * information or have any questions.
30  *
31  * GPL HEADER END
32  */
34 /*
35  * Copyright 2012 Xyratex Technology Limited
36  *
37  * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
38  * calculation.
39  * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
40  * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
41  * at:
42  * http://www.intel.com/products/processor/manuals/
43  * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
44  * Volume 2B: Instruction Set Reference, N-Z
45  *
46  * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
47  *            Alexander Boyko <Alexander_Boyko@xyratex.com>
48  */
50 #include <linux/linkage.h>
51 #include <asm/assembler.h>
53         .text
           /* ARM gas takes the .align operand as a power of two: 2^6 = 64 bytes. */
54         .align          6
           /*
            * Enable the instruction set extensions used in this file: the
            * crc32*/crc32c* instructions and the 64-bit polynomial multiply
            * (vmull.p64).
            */
55         .arch           armv8-a
56         .arch_extension crc
57         .fpu            crypto-neon-fp-armv8
59 .Lcrc32_constants:
60         /*
            * Table layout, as byte offsets from the label (the code reaches
            * the table through r3):
            *    0,  8 : loaded as one q register for the 64-byte folding loop
            *   16, 24 : used to fold four 128-bit lanes into one, for the
            *            16-byte loop, and (offset 24) for the last 64-bit fold
            *   32     : multiplier for the final 32-bit fold
            *   40     : 0x00000000FFFFFFFF mask selecting the low 32 bits
            *   48, 56 : polynomial and Barrett constant for the final reduction
            *
61          * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
62          * #define CONSTANT_R1  0x154442bd4LL
63          *
64          * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
65          * #define CONSTANT_R2  0x1c6e41596LL
66          */
67         .quad           0x0000000154442bd4
68         .quad           0x00000001c6e41596
70         /*
71          * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
72          * #define CONSTANT_R3  0x1751997d0LL
73          *
74          * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
75          * #define CONSTANT_R4  0x0ccaa009eLL
76          */
77         .quad           0x00000001751997d0
78         .quad           0x00000000ccaa009e
80         /*
81          * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
82          * #define CONSTANT_R5  0x163cd6124LL
            *
            * The second quad is the 32-bit mask loaded into d6 by the
            * final fold.
83          */
84         .quad           0x0000000163cd6124
85         .quad           0x00000000FFFFFFFF
87         /*
88          * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
89          *
90          * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
91          *                                                      = 0x1F7011641LL
92          * #define CONSTANT_RU  0x1F7011641LL
93          */
94         .quad           0x00000001DB710641
95         .quad           0x00000001F7011641
97 .Lcrc32c_constants:
           /*
            * Table selected by crc32c_pmull_le.  It has the same eight-quad
            * layout as .Lcrc32_constants, because the shared code uses the
            * same offsets for both tables.
            * NOTE(review): the values are presumably the R1..R5, mask,
            * polynomial and RU constants for the CRC32C polynomial; their
            * derivation is not shown in this file - confirm before changing.
            */
98         .quad           0x00000000740eef02
99         .quad           0x000000009e4addf8
100         .quad           0x00000000f20c0dfe
101         .quad           0x000000014cd00bd6
102         .quad           0x00000000dd45aab8
103         .quad           0x00000000FFFFFFFF
104         .quad           0x0000000105ec76f0
105         .quad           0x00000000dea713f1
107         dCONSTANTl      .req    d0      /* low 64 bits of qCONSTANT */
108         dCONSTANTh      .req    d1      /* high 64 bits of qCONSTANT */
109         qCONSTANT       .req    q0      /* current pair of folding constants */
111         BUF             .req    r0      /* input pointer, post-incremented by the loads */
112         LEN             .req    r1      /* number of bytes still to be folded */
113         CRC             .req    r2      /* initial CRC value supplied by the caller */
115         qzr             .req    q9      /* zeroed on entry to the pmull routine; zero source for vext */
117         /**
118          * Calculate crc32 (crc32_pmull_le) or crc32c (crc32c_pmull_le) by
             * folding the input with 64-bit polynomial multiplies (vmull.p64).
             * The two entry points differ only in the constant table that r3
             * is pointed at; everything after label 0 is shared.
             *
119          * BUF - buffer (r0); the :128 qualifier on every load asserts that
             *       it is 16-byte aligned
120          * LEN - sizeof buffer (r1); rounded down to a multiple of 16 bytes
             *       on entry.  LEN should be > 63: the first 64 bytes are
             *       loaded unconditionally and no check is made
121          * CRC - initial crc32 (r2)
122          * return r0 crc32
123          * uint crc32_pmull_le(unsigned char const *buffer,
124          *                     size_t len, uint crc32)
             *
             * Modifies r0, r1, r3, the condition flags and q0-q9.
             * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS
             * and are not preserved here; presumably the caller brackets
             * this with kernel_neon_begin()/kernel_neon_end() - confirm in
             * the C glue code.
125          */
126 ENTRY(crc32_pmull_le)
            /* r3 = CRC32 constant table, then jump to the shared body */
127         adr             r3, .Lcrc32_constants
128         b               0f
130 ENTRY(crc32c_pmull_le)
            /* r3 = CRC32C constant table; falls through into the shared body */
131         adr             r3, .Lcrc32c_constants
            /* Round LEN down to a multiple of 16; load the first 64 bytes into q1-q4 */
133 0:      bic             LEN, LEN, #15
134         vld1.8          {q1-q2}, [BUF, :128]!
135         vld1.8          {q3-q4}, [BUF, :128]!
136         vmov.i8         qzr, #0
            /*
             * Build a 64-bit value holding CRC in its low 32 bits and XOR it
             * into the first 8 input bytes (d2 is the low half of q1).
             */
137         vmov.i8         qCONSTANT, #0
138         vmov.32         dCONSTANTl[0], CRC
139         veor.8          d2, d2, dCONSTANTl
            /*
             * Account for the 64 bytes already loaded.  If fewer than 64
             * bytes remain, skip the 64-byte loop.
             */
140         sub             LEN, LEN, #0x40
141         cmp             LEN, #0x40
142         blt             less_64
            /* qCONSTANT = table entries at offsets 0 (low) and 8 (high) */
144         vld1.64         {qCONSTANT}, [r3]
146 loop_64:                /* 64 bytes Full cache line folding */
147         sub             LEN, LEN, #0x40
            /*
             * For each of q1..q4: multiply the high half by dCONSTANTh and
             * the low half by dCONSTANTl.  The high-half products go to
             * q5..q8 first because the low-half multiplies overwrite q1..q4
             * in place.
             */
149         vmull.p64       q5, d3, dCONSTANTh
150         vmull.p64       q6, d5, dCONSTANTh
151         vmull.p64       q7, d7, dCONSTANTh
152         vmull.p64       q8, d9, dCONSTANTh
154         vmull.p64       q1, d2, dCONSTANTl
155         vmull.p64       q2, d4, dCONSTANTl
156         vmull.p64       q3, d6, dCONSTANTl
157         vmull.p64       q4, d8, dCONSTANTl
            /*
             * Combine the two products of each lane, reusing q5..q8 to load
             * the next 64 input bytes as soon as each product is consumed.
             */
159         veor.8          q1, q1, q5
160         vld1.8          {q5}, [BUF, :128]!
161         veor.8          q2, q2, q6
162         vld1.8          {q6}, [BUF, :128]!
163         veor.8          q3, q3, q7
164         vld1.8          {q7}, [BUF, :128]!
165         veor.8          q4, q4, q8
166         vld1.8          {q8}, [BUF, :128]!
            /* XOR the new input into the folded lanes */
168         veor.8          q1, q1, q5
169         veor.8          q2, q2, q6
170         veor.8          q3, q3, q7
171         veor.8          q4, q4, q8
            /* Repeat while at least 64 more bytes remain */
173         cmp             LEN, #0x40
174         bge             loop_64
176 less_64:                /* Folding cache line into 128bit */
            /* Switch to the constants at table offsets 16 (low) and 24 (high) */
177         vldr            dCONSTANTl, [r3, #16]
178         vldr            dCONSTANTh, [r3, #24]
            /* Fold q1 into q2; the running result stays in q1 */
180         vmull.p64       q5, d3, dCONSTANTh
181         vmull.p64       q1, d2, dCONSTANTl
182         veor.8          q1, q1, q5
183         veor.8          q1, q1, q2
            /* Fold the running result into q3 */
185         vmull.p64       q5, d3, dCONSTANTh
186         vmull.p64       q1, d2, dCONSTANTl
187         veor.8          q1, q1, q5
188         veor.8          q1, q1, q3
            /* Fold the running result into q4 */
190         vmull.p64       q5, d3, dCONSTANTh
191         vmull.p64       q1, d2, dCONSTANTl
192         veor.8          q1, q1, q5
193         veor.8          q1, q1, q4
            /* Nothing left to read: go straight to the final reduction */
195         teq             LEN, #0
196         beq             fold_64
198 loop_16:                /* Folding rest buffer into 128bit */
            /*
             * The flags set by this subs are consumed by the bne at the
             * bottom of the loop.
             */
199         subs            LEN, LEN, #0x10
201         vld1.8          {q2}, [BUF, :128]!
202         vmull.p64       q5, d3, dCONSTANTh
203         vmull.p64       q1, d2, dCONSTANTl
204         veor.8          q1, q1, q5
205         veor.8          q1, q1, q2
207         bne             loop_16
209 fold_64:
210         /* perform the last 64 bit fold, also adds 32 zeroes
211          * to the input stream */
            /*
             * dCONSTANTh still holds the table entry at offset 24.  The vext
             * moves the high 8 bytes of q1 into its low half and fills the
             * high half with zeroes from qzr.
             */
212         vmull.p64       q2, d2, dCONSTANTh
213         vext.8          q1, q1, qzr, #8
214         veor.8          q1, q1, q2
216         /* final 32-bit fold */
            /*
             * dCONSTANTl = table entry at offset 32.  q3 becomes a 128-bit
             * mask: d6 = table entry at offset 40 (0xFFFFFFFF), d7 = 0.
             */
217         vldr            dCONSTANTl, [r3, #32]
218         vldr            d6, [r3, #40]
219         vmov.i8         d7, #0
            /*
             * q2 = q1 shifted down by 4 bytes (zero filled).  Only the low
             * 32 bits of d2 are kept before the multiply.
             */
221         vext.8          q2, q1, qzr, #4
222         vand.8          d2, d2, d6
223         vmull.p64       q1, d2, dCONSTANTl
224         veor.8          q1, q1, q2
226         /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
            /* dCONSTANTl = table entry at offset 48, dCONSTANTh = offset 56 */
227         vldr            dCONSTANTl, [r3, #48]
228         vldr            dCONSTANTh, [r3, #56]
            /*
             * Take the low 32 bits of q1 and move them to the high half of
             * q2 (d5), so that they can be multiplied by dCONSTANTh.
             */
230         vand.8          q2, q1, q3
231         vext.8          q2, qzr, q2, #8
232         vmull.p64       q2, d5, dCONSTANTh
            /* Keep the low 32 bits of that product and multiply by dCONSTANTl */
233         vand.8          q2, q2, q3
234         vmull.p64       q2, d4, dCONSTANTl
235         veor.8          q1, q1, q2
            /* Result is bits [63:32] of q1 (s5 is the upper word of d2) */
236         vmov            r0, s5
238         bx              lr
239 ENDPROC(crc32_pmull_le)
240 ENDPROC(crc32c_pmull_le)
242         .macro          __crc32, c
            /*
             * Body of a CRC routine built on the crc32\c\()b/h/w instructions.
             * \c is appended to the mnemonic stem: empty gives crc32b/h/w,
             * "c" gives crc32cb/h/w.
             *
             * In:  r0 = crc accumulator, r1 = input pointer, r2 = byte count
             * Out: r0 = updated crc (the macro itself returns with bx lr)
             * Modifies r1, r2, r3, ip and the condition flags.
             *
             * ip is kept at (bytes remaining - 8), so its sign says whether
             * another 8-byte step is possible.  Its low three bits always
             * equal those of the remaining byte count.
             */
            /* Fewer than 8 bytes in total: handle them in the tail */
243         subs            ip, r2, #8
244         bmi             .Ltail\c
            /* Pointer not word aligned: consume 1-3 head bytes first */
246         tst             r1, #3
247         bne             .Lunaligned\c
            /* Set Z if exactly 8 bytes remain, for the bxeq inside the loop */
249         teq             ip, #0
250 .Laligned8\c:
            /*
             * Process 8 bytes per iteration.  r2 is free to be overwritten
             * here because the remaining length is tracked in ip.
             * NOTE(review): ARM_BE8() comes from asm/assembler.h; presumably
             * it emits its argument only on big-endian (BE8) kernels so the
             * data is byte-swapped to little-endian order - confirm there.
             */
251         ldrd            r2, r3, [r1], #8
252 ARM_BE8(rev             r2, r2          )
253 ARM_BE8(rev             r3, r3          )
254         crc32\c\()w     r0, r0, r2
255         crc32\c\()w     r0, r0, r3
            /*
             * Z was set by the teq/subs executed before this iteration, and
             * means these were the last 8 bytes.
             */
256         bxeq            lr
257         subs            ip, ip, #8
258         bpl             .Laligned8\c
260 .Ltail\c:
            /* 0-7 bytes remain; bits 2, 1 and 0 of ip select a 4/2/1 byte step */
261         tst             ip, #4
262         beq             2f
263         ldr             r3, [r1], #4
264 ARM_BE8(rev             r3, r3          )
265         crc32\c\()w     r0, r0, r3
267 2:      tst             ip, #2
268         beq             1f
269         ldrh            r3, [r1], #2
270 ARM_BE8(rev16           r3, r3          )
271         crc32\c\()h     r0, r0, r3
273 1:      tst             ip, #1
274         bxeq            lr
275         ldrb            r3, [r1]
276         crc32\c\()b     r0, r0, r3
277         bx              lr
279 .Lunaligned\c:
            /*
             * Only reached with at least 8 bytes available, so consuming up
             * to 3 head bytes is safe.  Take one byte if the address is odd,
             * then one halfword if bit 1 is still set; r1 is then word
             * aligned.
             */
280         tst             r1, #1
281         beq             2f
282         ldrb            r3, [r1], #1
283         subs            r2, r2, #1
284         crc32\c\()b     r0, r0, r3
286         tst             r1, #2
287         beq             0f
288 2:      ldrh            r3, [r1], #2
289         subs            r2, r2, #2
290 ARM_BE8(rev16           r3, r3          )
291         crc32\c\()h     r0, r0, r3
            /*
             * Recompute ip for the shortened length.  The flags from this
             * subs serve both the bpl below and the bxeq in .Laligned8.
             */
293 0:      subs            ip, r2, #8
294         bpl             .Laligned8\c
295         b               .Ltail\c
296         .endm
298         .align          5
            /*
             * CRC over a byte buffer using the crc32b/crc32h/crc32w
             * instructions (__crc32 expanded with an empty suffix).
             * In:  r0 = crc accumulator, r1 = buffer, r2 = length in bytes
             * Out: r0 = updated crc
             * No alignment or minimum length is required: the macro handles
             * unaligned heads and short tails itself.
             */
299 ENTRY(crc32_armv8_le)
300         __crc32
301 ENDPROC(crc32_armv8_le)
303         .align          5
            /*
             * Same routine expanded with suffix "c", i.e. built from the
             * crc32cb/crc32ch/crc32cw instructions.  Register interface is
             * identical to crc32_armv8_le.
             */
304 ENTRY(crc32c_armv8_le)
305         __crc32         c
306 ENDPROC(crc32c_armv8_le)