/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; a reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.section	".rodata", "a"
	.align		6
	.cpu		generic+crypto+crc
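
	/*
	 * Note: the .cpu directive above only tells the assembler to accept
	 * the PMULL (Crypto Extensions) and CRC32 instructions used below;
	 * the C glue code is expected to check at runtime that the CPU
	 * actually implements these extensions before calling in here.
	 */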

.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4

	/*
	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0

	/*
	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF	/* 32-bit mask for the final folds */

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                                      = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641
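
	/*
	 * The table below provides the same five constants, in the same
	 * layout (R2:R1, R4:R3, R5, mask, RU:P'), recomputed for the CRC32C
	 * (Castagnoli) polynomial 0x1EDC6F41 used by crc32c_pmull_le.
	 */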
.Lcrc32c_constants:
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0

	vCONSTANT	.req	v0
	dCONSTANT	.req	d0
	qCONSTANT	.req	q0

	BUF		.req	x0
	LEN		.req	x1
	CRC		.req	x2

	vzr		.req	v9

	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return: the updated CRC32 value, in w0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
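
	/*
	 * The "multiple of 16 bytes" requirement is enforced below by the
	 * bic that rounds LEN down; any remaining tail bytes are assumed to
	 * be handled separately by the C caller.
	 */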
	.text
ENTRY(crc32_pmull_le)
	adr_l		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr_l		x3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15		/* round LEN down to a multiple of 16 */
	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	movi		vzr.16b, #0		/* all-zeroes vector */
	fmov		dCONSTANT, CRC		/* initial CRC in the low lane of v0 */
	eor		v1.16b, v1.16b, vCONSTANT.16b	/* xor initial CRC into first block */
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	b.lt		less_64

	ldr		qCONSTANT, [x3]		/* v0.d[0] = R1, v0.d[1] = R2 */
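
	/*
	 * Main loop: v1-v4 hold 64 bytes of partially folded state. Each
	 * iteration carry-less multiplies the high and low doublewords of
	 * every accumulator by R2 and R1 respectively (a 128-bit fold over
	 * a distance of 512 bits), then xors in the next 64 bytes of input.
	 */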
loop_64:		/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d

	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b

	cmp		LEN, #0x40
	b.ge		loop_64
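
	/*
	 * Fewer than 64 bytes left: collapse the four accumulators into one
	 * by repeatedly folding v1 forward with R4:R3 (a 128-bit fold over
	 * a distance of 128 bits) and xoring in v2, v3 and v4 in turn.
	 */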
less_64:		/* fold the cache line down to 128 bits */
	ldr		qCONSTANT, [x3, #16]	/* v0.d[0] = R3, v0.d[1] = R4 */

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b

	cbz		LEN, fold_64
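
	/*
	 * LEN is still a multiple of 16 here (thanks to the earlier bic),
	 * so the tail loop below can consume it in whole 16-byte blocks.
	 */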
loop_16:		/* fold the rest of the buffer into 128 bits */
	subs		LEN, LEN, #0x10

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	b.ne		loop_16

fold_64:
	/*
	 * Perform the last 64-bit fold; this also appends 32 zero bits to
	 * the input stream.
	 */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b
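
	/*
	 * The next two loads fetch R5 (the 64 -> 32 bit fold constant) into
	 * dCONSTANT and the 0x00000000FFFFFFFF mask into d3.
	 */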
	/* final 32-bit fold */
	ldr		dCONSTANT, [x3, #32]
	ldr		d3, [x3, #40]

	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
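
	/*
	 * Classic Barrett reduction from 64 to 32 bits, carried out in the
	 * bit-reflected domain: multiply by the precomputed
	 * u = floor(x^64 / P(x)) (the RU constant) to estimate the quotient,
	 * multiply the quotient by P(x), and xor to obtain the remainder;
	 * the resulting CRC ends up in lane 1 of v1.
	 */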
	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	ldr		qCONSTANT, [x3, #48]	/* v0.d[0] = P', v0.d[1] = RU */

	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
	mov		w0, v1.s[1]

	ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
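
	/*
	 * Plain ARMv8 CRC32 path: the macro below expands to a routine that
	 * consumes 16 bytes per iteration with two crc32x instructions, then
	 * handles the 8/4/2/1-byte tail by testing bits 3..0 of the length.
	 * Arguments follow the AAPCS64: w0 = initial CRC, x1 = buffer,
	 * x2 = length in bytes; the result is returned in w0.
	 */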
	.macro		__crc32, c
0:	subs		x2, x2, #16
	b.mi		8f
	ldp		x3, x4, [x1], #16
CPU_BE(	rev		x3, x3		)
CPU_BE(	rev		x4, x4		)
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4
	b.ne		0b
	ret

8:	tbz		x2, #3, 4f
	ldr		x3, [x1], #8
CPU_BE(	rev		x3, x3		)
	crc32\c\()x	w0, w0, x3
4:	tbz		x2, #2, 2f
	ldr		w3, [x1], #4
CPU_BE(	rev		w3, w3		)
	crc32\c\()w	w0, w0, w3
2:	tbz		x2, #1, 1f
	ldrh		w3, [x1], #2
CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3
1:	tbz		x2, #0, 0f
	ldrb		w3, [x1]
	crc32\c\()b	w0, w0, w3
0:	ret
	.endm
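
	/*
	 * The C-side prototypes for the two entry points below are assumed
	 * to be (names per the ENTRY labels, signatures inferred from the
	 * register usage in the macro above):
	 *
	 *   u32 crc32_armv8_le(u32 crc, unsigned char const *buf, size_t len);
	 *   u32 crc32c_armv8_le(u32 crc, unsigned char const *buf, size_t len);
	 */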
	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)