Linux 4.18.10 - arch/arm64/crypto/ghash-ce-core.S
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
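/*
 * Background (editor's note, not part of the original source): GHASH
 * processes a message X1..Xn as a polynomial evaluation over GF(2^128),
 *
 *     Yi = (Y(i-1) ^ Xi) * H,    Y0 = 0,
 *
 * where H is the hash key and '*' is carry-less multiplication reduced
 * modulo g(x) = x^128 + x^7 + x^2 + x + 1. The macros below implement the
 * per-block multiply-and-reduce step with either the single 64x64 PMULL
 * instruction (p64) or a fallback built from 8x8 PMULL products (p8).
 */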
#include <linux/linkage.h>
#include <asm/assembler.h>
        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XL              .req    v5
        XM              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        .text
        .arch           armv8-a+crypto
        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm
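/*
 * PMULL computes a carry-less (polynomial) product: XOR replaces addition,
 * so no carries propagate between bit positions. A minimal C reference
 * model of the 64x64->128 operation (editor's sketch, assuming a compiler
 * with __uint128_t support; not part of the original source):
 *
 *     static __uint128_t clmul64(uint64_t a, uint64_t b)
 *     {
 *             __uint128_t r = 0;
 *             int i;
 *
 *             for (i = 0; i < 64; i++)
 *                     if (b & (1ULL << i))
 *                             r ^= (__uint128_t)a << i;
 *             return r;
 *     }
 *
 * __pmull_p64 applies this to the low 64-bit lanes of its operands,
 * __pmull2_p64 to the high lanes.
 */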
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm
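/*
 * Editor's note: the p8 fallback appears to follow the byte-rotation
 * technique of Câmara, Gouvêa, López and Dahab ("Fast Software Polynomial
 * Multiplication on ARM Processors Using the NEON Engine"): a 64x64
 * carry-less product is assembled from 8x8->16 PMULL products of rotated
 * operands. A scalar C model of the same decomposition (a sketch of the
 * math, not the vectorized schedule above):
 *
 *     static uint16_t clmul8(uint8_t a, uint8_t b)
 *     {
 *             uint16_t r = 0;
 *             int i;
 *
 *             for (i = 0; i < 8; i++)
 *                     if (b & (1 << i))
 *                             r ^= (uint16_t)a << i;
 *             return r;
 *     }
 *
 *     static __uint128_t clmul64_from_bytes(uint64_t a, uint64_t b)
 *     {
 *             __uint128_t r = 0;
 *             int i, j;
 *
 *             for (i = 0; i < 8; i++)
 *                     for (j = 0; j < 8; j++)
 *                             r ^= (__uint128_t)clmul8(a >> 8 * i, b >> 8 * j)
 *                                  << 8 * (i + j);
 *             return r;
 *     }
 *
 * The assembly gathers partial products of equal byte shift (the D, L, M,
 * N, K terms in the comments), masks off bits that would spill past the
 * 64-bit operand width (k00_16/k32_48), and XORs the groups in at byte
 * offsets 0..4 via the ext instructions.
 */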
        .macro          __pmull_pre_p64
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm
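/*
 * Editor's note: MASK ends up holding 0xe1 << 57 in each 64-bit lane, the
 * bit-reflected encoding of the GCM reduction polynomial
 * g(x) = x^128 + x^7 + x^2 + x + 1 used by __pmull_reduce_p64 below.
 */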
        .macro          __pmull_pre_p8
        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm
        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm
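/*
 * For reference, the operation being computed is a full GF(2^128) multiply:
 * a 128x128 carry-less product folded back modulo
 * g(x) = x^128 + x^7 + x^2 + x + 1. An editor's sketch in plain polynomial
 * bit order (GCM itself uses a bit-reflected convention, which is what the
 * 0xe1 constant and the ext-based lane swaps account for):
 *
 *     static __uint128_t gf128_mul(__uint128_t a, __uint128_t b)
 *     {
 *             const __uint128_t poly = 0x87;  // x^7 + x^2 + x + 1
 *             __uint128_t r = 0;
 *             int i;
 *
 *             for (i = 0; i < 128; i++) {
 *                     if (b & 1)
 *                             r ^= a;
 *                     b >>= 1;
 *                     // multiply a by x, folding x^128 back down to poly
 *                     if (a >> 127)
 *                             a = (a << 1) ^ poly;
 *                     else
 *                             a <<= 1;
 *             }
 *             return r;
 *     }
 */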
        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm
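/*
 * Editor's note: without a 64-bit PMULL to multiply by the reflected
 * polynomial in one go, the folding is done with shifts. The left shifts
 * by 57, 62 and 63 and the combined right-shift sequence (netting
 * x>>1 ^ x>>2 ^ x>>7) XOR in the x^7 + x^2 + x + 1 terms of g(x),
 * matching the shift-based reduction described in Intel's carry-less
 * multiplication white paper.
 */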
        .macro          __pmull_ghash, pn
        frame_push      5

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4

0:      ld1             {SHASH.2d}, [x22]
        ld1             {XL.2d}, [x20]
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x23, 1f
        ld1             {T1.2d}, [x23]
        mov             x23, xzr
        b               2f

1:      ld1             {T1.2d}, [x21], #16
        sub             w19, w19, #1

2:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w19, 3f

        if_will_cond_yield_neon
        st1             {XL.2d}, [x20]
        do_cond_yield_neon
        b               0b
        endif_yield_neon

        b               1b

3:      st1             {XL.2d}, [x20]
        frame_pop
        ret
        .endm
        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        __pmull_ghash   p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        __pmull_ghash   p8
ENDPROC(pmull_ghash_update_p8)
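/*
 * Behavioural model of the update loop above (editor's sketch; the real
 * digest and key are passed as u64 halves, and the CPU_LE() byte-order
 * handling is elided). gf128_mul() is the reference function sketched
 * earlier:
 *
 *     static void ghash_update_model(int blocks, __uint128_t *dg,
 *                                    const __uint128_t *src, __uint128_t h,
 *                                    const __uint128_t *head)
 *     {
 *             __uint128_t x = *dg;
 *
 *             if (head)                       // optional head block
 *                     x = gf128_mul(x ^ *head, h);
 *             while (blocks--)
 *                     x = gf128_mul(x ^ *src++, h);
 *             *dg = x;
 *     }
 *
 * The macro also splits each multiplication Karatsuba-style: with
 * a = a1:a0 and b = b1:b0 it computes a1*b1, a0*b0 and (a1+a0)*(b1+b0),
 * recovering the middle term as the XOR of all three, so each 128-bit
 * multiply needs three PMULLs instead of four.
 */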
        KS              .req    v8
        CTR             .req    v9
        INP             .req    v10

        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
        ld1             {v17.4s-v18.4s}, [\rk], #32
1111:   ld1             {v19.4s-v20.4s}, [\rk], #32
2222:   ld1             {v21.4s-v24.4s}, [\rk], #64
        ld1             {v25.4s-v28.4s}, [\rk], #64
        ld1             {v29.4s-v31.4s}, [\rk]
        .endm
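/*
 * Editor's note: rounds is 10/12/14 for AES-128/192/256, which need
 * 11/13/15 round keys respectively. The last eleven keys always land in
 * v21-v31 so the enc_round sequences below can share the AES-128 tail;
 * AES-192 adds v19-v20 and AES-256 adds v17-v20 at the front.
 */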
        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_block, state, rounds
        cmp             \rounds, #12
        b.lo            2222f           /* 128 bits */
        b.eq            1111f           /* 192 bits */
        enc_round       \state, v17
        enc_round       \state, v18
1111:   enc_round       \state, v19
        enc_round       \state, v20
2222:   .irp            key, v21, v22, v23, v24, v25, v26, v27, v28, v29
        enc_round       \state, \key
        .endr
        aese            \state\().16b, v30.16b
        eor             \state\().16b, \state\().16b, v31.16b
        .endm
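/*
 * Editor's note: AESE performs AddRoundKey + SubBytes + ShiftRows and
 * AESMC performs MixColumns, so each enc_round is one full AES round. The
 * final AES round omits MixColumns, hence the bare aese with v30 followed
 * by an eor with the last round key in v31.
 */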
        .macro          pmull_gcm_do_crypt, enc
        ld1             {SHASH.2d}, [x4]
        ld1             {XL.2d}, [x1]
        ldr             x8, [x5, #8]                    // load lower counter

        load_round_keys w7, x6

        movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        .if             \enc == 1
        ldr             x10, [sp]
        ld1             {KS.16b}, [x10]
        .endif

0:      ld1             {CTR.8b}, [x5]                  // load upper counter
        ld1             {INP.16b}, [x3], #16
        rev             x9, x8
        add             x8, x8, #1
        sub             w0, w0, #1
        ins             CTR.d[1], x9                    // set lower counter

        .if             \enc == 1
        eor             INP.16b, INP.16b, KS.16b        // encrypt input
        st1             {INP.16b}, [x2], #16
        .endif

        rev64           T1.16b, INP.16b

        cmp             w7, #12
        b.ge            2f                              // AES-192/256?

1:      enc_round       CTR, v21

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8

        enc_round       CTR, v22

        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        enc_round       CTR, v23

        pmull2          XH.1q, SHASH.2d, XL.2d          // a1 * b1
        eor             T1.16b, T1.16b, XL.16b

        enc_round       CTR, v24

        pmull           XL.1q, SHASH.1d, XL.1d          // a0 * b0
        pmull           XM.1q, SHASH2.1d, T1.1d         // (a1 + a0)(b1 + b0)

        enc_round       CTR, v25

        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, XL.16b, XH.16b
        eor             XM.16b, XM.16b, T1.16b

        enc_round       CTR, v26

        eor             XM.16b, XM.16b, T2.16b
        pmull           T2.1q, XL.1d, MASK.1d

        enc_round       CTR, v27

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        enc_round       CTR, v28

        eor             XL.16b, XM.16b, T2.16b

        enc_round       CTR, v29

        ext             T2.16b, XL.16b, XL.16b, #8

        aese            CTR.16b, v30.16b

        pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b

        eor             KS.16b, CTR.16b, v31.16b

        eor             XL.16b, XL.16b, T2.16b

        .if             \enc == 0
        eor             INP.16b, INP.16b, KS.16b
        st1             {INP.16b}, [x2], #16
        .endif

        cbnz            w0, 0b

CPU_LE( rev             x8, x8          )
        st1             {XL.2d}, [x1]
        str             x8, [x5, #8]                    // store lower counter

        .if             \enc == 1
        st1             {KS.16b}, [x10]
        .endif

        ret

2:      b.eq            3f                              // AES-192?
        enc_round       CTR, v17
        enc_round       CTR, v18
3:      enc_round       CTR, v19
        enc_round       CTR, v20
        b               1b
        .endm
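/*
 * Per-block flow of the macro above as a C sketch (editor's model;
 * aes_encrypt(), get_block() and put_block() are hypothetical helpers
 * standing in for the enc_round sequence and the ld1/st1 accesses, and
 * gf128_mul() is the reference function sketched earlier). On encryption
 * the loop consumes a keystream block produced in the previous iteration
 * (seeded from ks[]) while already generating the next one, which is what
 * lets the AES and PMULL instructions overlap in the scheduled assembly:
 *
 *     while (blocks--) {
 *             __uint128_t inp = get_block(src);
 *             __uint128_t next_ks = aes_encrypt(rk, rounds, ctr++);
 *
 *             if (enc) {
 *                     put_block(dst, inp ^ ks);   // previous keystream
 *                     dg = gf128_mul(dg ^ (inp ^ ks), h);
 *             } else {
 *                     put_block(dst, inp ^ next_ks);
 *                     dg = gf128_mul(dg ^ inp, h);
 *             }
 *             ks = next_ks;
 *             src += 16; dst += 16;
 *     }
 *
 * GHASH always absorbs the ciphertext: the freshly encrypted block on the
 * encrypt path, the raw input on the decrypt path.
 */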
        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds, u8 ks[])
         */
ENTRY(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds)
         */
ENTRY(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
ENDPROC(pmull_gcm_decrypt)
        /*
         * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
         */
ENTRY(pmull_gcm_encrypt_block)
        cbz             x2, 0f
        load_round_keys w3, x2
0:      ld1             {v0.16b}, [x1]
        enc_block       v0, w3
        st1             {v0.16b}, [x0]
        ret
ENDPROC(pmull_gcm_encrypt_block)