Linux 4.18.10
[linux/fpc-iii.git] / arch / arm64 / crypto / crc32-ce-core.S
blob8061bf0f9c66ab052e9bcd0b118872f49e7bf14c
1 /*
2  * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
3  *
4  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
11 /* GPL HEADER START
12  *
13  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License version 2 only,
17  * as published by the Free Software Foundation.
18  *
19  * This program is distributed in the hope that it will be useful, but
20  * WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  * General Public License version 2 for more details (a copy is included
23  * in the LICENSE file that accompanied this code).
24  *
25  * You should have received a copy of the GNU General Public License
26  * version 2 along with this program; If not, see http://www.gnu.org/licenses
27  *
28  * Please  visit http://www.xyratex.com/contact if you need additional
29  * information or have any questions.
30  *
31  * GPL HEADER END
32  */
34 /*
35  * Copyright 2012 Xyratex Technology Limited
36  *
37  * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
38  * calculation.
39  * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
40  * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
41  * at:
42  * http://www.intel.com/products/processor/manuals/
43  * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
44  * Volume 2B: Instruction Set Reference, N-Z
45  *
46  * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
47  *            Alexander Boyko <Alexander_Boyko@xyratex.com>
48  */
50 #include <linux/linkage.h>
51 #include <asm/assembler.h>
53         .section        ".rodata", "a"
54         .align          6
55         .cpu            generic+crypto+crc
57 .Lcrc32_constants:
58         /*
59          * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
60          * #define CONSTANT_R1  0x154442bd4LL
61          *
62          * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
63          * #define CONSTANT_R2  0x1c6e41596LL
64          */
65         .octa           0x00000001c6e415960000000154442bd4
67         /*
68          * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
69          * #define CONSTANT_R3  0x1751997d0LL
70          *
71          * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
72          * #define CONSTANT_R4  0x0ccaa009eLL
73          */
74         .octa           0x00000000ccaa009e00000001751997d0
76         /*
77          * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
78          * #define CONSTANT_R5  0x163cd6124LL
79          */
80         .quad           0x0000000163cd6124
81         .quad           0x00000000FFFFFFFF
83         /*
84          * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
85          *
86          * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
87          *                                                      = 0x1F7011641LL
88          * #define CONSTANT_RU  0x1F7011641LL
89          */
90         .octa           0x00000001F701164100000001DB710641
92 .Lcrc32c_constants:
93         .octa           0x000000009e4addf800000000740eef02
94         .octa           0x000000014cd00bd600000000f20c0dfe
95         .quad           0x00000000dd45aab8
96         .quad           0x00000000FFFFFFFF
97         .octa           0x00000000dea713f10000000105ec76f0
99         vCONSTANT       .req    v0
100         dCONSTANT       .req    d0
101         qCONSTANT       .req    q0
103         BUF             .req    x19
104         LEN             .req    x20
105         CRC             .req    x21
106         CONST           .req    x22
108         vzr             .req    v9
110         /**
111          * Calculate crc32
112          * BUF - buffer
113          * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
114          * CRC - initial crc32
115          * return %eax crc32
116          * uint crc32_pmull_le(unsigned char const *buffer,
117          *                     size_t len, uint crc32)
118          */
119         .text
120 ENTRY(crc32_pmull_le)
121         adr_l           x3, .Lcrc32_constants
122         b               0f
124 ENTRY(crc32c_pmull_le)
125         adr_l           x3, .Lcrc32c_constants
127 0:      frame_push      4, 64
129         mov             BUF, x0
130         mov             LEN, x1
131         mov             CRC, x2
132         mov             CONST, x3
134         bic             LEN, LEN, #15
135         ld1             {v1.16b-v4.16b}, [BUF], #0x40
136         movi            vzr.16b, #0
137         fmov            dCONSTANT, CRC
138         eor             v1.16b, v1.16b, vCONSTANT.16b
139         sub             LEN, LEN, #0x40
140         cmp             LEN, #0x40
141         b.lt            less_64
143         ldr             qCONSTANT, [CONST]
145 loop_64:                /* 64 bytes Full cache line folding */
146         sub             LEN, LEN, #0x40
148         pmull2          v5.1q, v1.2d, vCONSTANT.2d
149         pmull2          v6.1q, v2.2d, vCONSTANT.2d
150         pmull2          v7.1q, v3.2d, vCONSTANT.2d
151         pmull2          v8.1q, v4.2d, vCONSTANT.2d
153         pmull           v1.1q, v1.1d, vCONSTANT.1d
154         pmull           v2.1q, v2.1d, vCONSTANT.1d
155         pmull           v3.1q, v3.1d, vCONSTANT.1d
156         pmull           v4.1q, v4.1d, vCONSTANT.1d
158         eor             v1.16b, v1.16b, v5.16b
159         ld1             {v5.16b}, [BUF], #0x10
160         eor             v2.16b, v2.16b, v6.16b
161         ld1             {v6.16b}, [BUF], #0x10
162         eor             v3.16b, v3.16b, v7.16b
163         ld1             {v7.16b}, [BUF], #0x10
164         eor             v4.16b, v4.16b, v8.16b
165         ld1             {v8.16b}, [BUF], #0x10
167         eor             v1.16b, v1.16b, v5.16b
168         eor             v2.16b, v2.16b, v6.16b
169         eor             v3.16b, v3.16b, v7.16b
170         eor             v4.16b, v4.16b, v8.16b
172         cmp             LEN, #0x40
173         b.lt            less_64
175         if_will_cond_yield_neon
176         stp             q1, q2, [sp, #.Lframe_local_offset]
177         stp             q3, q4, [sp, #.Lframe_local_offset + 32]
178         do_cond_yield_neon
179         ldp             q1, q2, [sp, #.Lframe_local_offset]
180         ldp             q3, q4, [sp, #.Lframe_local_offset + 32]
181         ldr             qCONSTANT, [CONST]
182         movi            vzr.16b, #0
183         endif_yield_neon
184         b               loop_64
186 less_64:                /* Folding cache line into 128bit */
187         ldr             qCONSTANT, [CONST, #16]
189         pmull2          v5.1q, v1.2d, vCONSTANT.2d
190         pmull           v1.1q, v1.1d, vCONSTANT.1d
191         eor             v1.16b, v1.16b, v5.16b
192         eor             v1.16b, v1.16b, v2.16b
194         pmull2          v5.1q, v1.2d, vCONSTANT.2d
195         pmull           v1.1q, v1.1d, vCONSTANT.1d
196         eor             v1.16b, v1.16b, v5.16b
197         eor             v1.16b, v1.16b, v3.16b
199         pmull2          v5.1q, v1.2d, vCONSTANT.2d
200         pmull           v1.1q, v1.1d, vCONSTANT.1d
201         eor             v1.16b, v1.16b, v5.16b
202         eor             v1.16b, v1.16b, v4.16b
204         cbz             LEN, fold_64
206 loop_16:                /* Folding rest buffer into 128bit */
207         subs            LEN, LEN, #0x10
209         ld1             {v2.16b}, [BUF], #0x10
210         pmull2          v5.1q, v1.2d, vCONSTANT.2d
211         pmull           v1.1q, v1.1d, vCONSTANT.1d
212         eor             v1.16b, v1.16b, v5.16b
213         eor             v1.16b, v1.16b, v2.16b
215         b.ne            loop_16
217 fold_64:
218         /* perform the last 64 bit fold, also adds 32 zeroes
219          * to the input stream */
220         ext             v2.16b, v1.16b, v1.16b, #8
221         pmull2          v2.1q, v2.2d, vCONSTANT.2d
222         ext             v1.16b, v1.16b, vzr.16b, #8
223         eor             v1.16b, v1.16b, v2.16b
225         /* final 32-bit fold */
226         ldr             dCONSTANT, [CONST, #32]
227         ldr             d3, [CONST, #40]
229         ext             v2.16b, v1.16b, vzr.16b, #4
230         and             v1.16b, v1.16b, v3.16b
231         pmull           v1.1q, v1.1d, vCONSTANT.1d
232         eor             v1.16b, v1.16b, v2.16b
234         /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
235         ldr             qCONSTANT, [CONST, #48]
237         and             v2.16b, v1.16b, v3.16b
238         ext             v2.16b, vzr.16b, v2.16b, #8
239         pmull2          v2.1q, v2.2d, vCONSTANT.2d
240         and             v2.16b, v2.16b, v3.16b
241         pmull           v2.1q, v2.1d, vCONSTANT.1d
242         eor             v1.16b, v1.16b, v2.16b
243         mov             w0, v1.s[1]
245         frame_pop
246         ret
247 ENDPROC(crc32_pmull_le)
248 ENDPROC(crc32c_pmull_le)
250         .macro          __crc32, c
251 0:      subs            x2, x2, #16
252         b.mi            8f
253         ldp             x3, x4, [x1], #16
254 CPU_BE( rev             x3, x3          )
255 CPU_BE( rev             x4, x4          )
256         crc32\c\()x     w0, w0, x3
257         crc32\c\()x     w0, w0, x4
258         b.ne            0b
259         ret
261 8:      tbz             x2, #3, 4f
262         ldr             x3, [x1], #8
263 CPU_BE( rev             x3, x3          )
264         crc32\c\()x     w0, w0, x3
265 4:      tbz             x2, #2, 2f
266         ldr             w3, [x1], #4
267 CPU_BE( rev             w3, w3          )
268         crc32\c\()w     w0, w0, w3
269 2:      tbz             x2, #1, 1f
270         ldrh            w3, [x1], #2
271 CPU_BE( rev16           w3, w3          )
272         crc32\c\()h     w0, w0, w3
273 1:      tbz             x2, #0, 0f
274         ldrb            w3, [x1]
275         crc32\c\()b     w0, w0, w3
276 0:      ret
277         .endm
279         .align          5
280 ENTRY(crc32_armv8_le)
281         __crc32
282 ENDPROC(crc32_armv8_le)
284         .align          5
285 ENTRY(crc32c_armv8_le)
286         __crc32         c
287 ENDPROC(crc32c_armv8_le)