/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XM              .req    v5
        XL              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19
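
        // Note that the XL2..HH34 aliases above deliberately reuse v8-v19,
        // which also back k00_16/k32_48, t3-t9 and perm1-perm3: the former
        // belong to the p64 code path and the latter to the p8 fallback,
        // so the two sets are never live at the same time.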

        .text
        .arch           armv8-a+crypto

        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm
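
        // The __pmull{2}_p8 macros below emulate the 64x64->128 bit
        // carryless multiply using only the baseline 8x8->16 bit polynomial
        // multiply: eight partial pmull products of byte-rotated copies of
        // the operands (A1..A3 here, and the precomputed B1..B4 rotations
        // of the hash key) are masked, realigned and xor-combined by
        // __pmull_p8_tail.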

        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm
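
        // Load the precomputed powers of the hash key for the p64 code
        // path: [x3] holds H, and the 48 bytes at [x3 + 16] hold H^2, H^3
        // and H^4 for the 4-way aggregated code. SHASH2 and HH34 receive
        // the xor of the high and low half of each power, as needed for
        // the Karatsuba middle product, and MASK is set to the GHASH
        // reduction constant 0xe1 << 57 (encoding the field polynomial
        // x^128 + x^7 + x^2 + x + 1).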

        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm

        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
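        // (As used by the call sites below: the caller presents the 256-bit
        // Karatsuba result in XH:XM:XL with T1 staged for the middle-word
        // fold, and finishes the reduction afterwards by xor-ing T2 and XH
        // into XL.)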
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
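        // (The shl by 63/62/57 and the later ushr by 1/2/7 stand in for the
        // PMULL-by-MASK of the p64 variant; they correspond to the x, x^2
        // and x^7 terms of the field polynomial in the bit-reflected
        // representation that GHASH uses.)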
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm

        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4
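
        // 4-way aggregated GHASH: with B0..B3 denoting the next four
        // blocks, the loop below computes
        //   X' = (X + B0)*H^4 + B1*H^3 + B2*H^2 + B3*H
        // so only a single reduction is needed per four blocks.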
1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
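        /*
         * Arguments arrive per AAPCS64: w0 = blocks, x1 = dg, x2 = src,
         * x3 = k, x4 = head (NULL when there is no head block).
         */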
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
        __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
        __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)

        KS0             .req    v8
        KS1             .req    v9
        KS2             .req    v10
        KS3             .req    v11

        INP0            .req    v21
        INP1            .req    v22
        INP2            .req    v23
        INP3            .req    v24

        K0              .req    v25
        K1              .req    v26
        K2              .req    v27
        K3              .req    v28
        K4              .req    v12
        K5              .req    v13
        K6              .req    v4
        K7              .req    v5
        K8              .req    v14
        K9              .req    v15
        KK              .req    v29
        KL              .req    v30
        KM              .req    v31

        .macro          load_round_keys, rounds, rk, tmp
        add             \tmp, \rk, #64
        ld1             {K0.4s-K3.4s}, [\rk]
        ld1             {K4.4s-K5.4s}, [\tmp]
        add             \tmp, \rk, \rounds, lsl #4
        sub             \tmp, \tmp, #32
        ld1             {KK.4s-KM.4s}, [\tmp]
        .endm
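
        // K0-K5 always hold the first six round keys; K6-K9 are (re)loaded
        // on demand below. KK, KL and KM are loaded relative to the round
        // count, so they always hold the final three round keys regardless
        // of the AES key size (10/12/14 rounds).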

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_qround, s0, s1, s2, s3, key
        enc_round       \s0, \key
        enc_round       \s1, \key
        enc_round       \s2, \key
        enc_round       \s3, \key
        .endm

        .macro          enc_block, state, rounds, rk, tmp
        add             \tmp, \rk, #96
        ld1             {K6.4s-K7.4s}, [\tmp], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_round       \state, \key
        .endr

        tbnz            \rounds, #2, .Lnot128_\@
.Lout256_\@:
        enc_round       \state, K6
        enc_round       \state, K7

.Lout192_\@:
        enc_round       \state, KK
        aese            \state\().16b, KL.16b
        eor             \state\().16b, \state\().16b, KM.16b

        .subsection     1
.Lnot128_\@:
        ld1             {K8.4s-K9.4s}, [\tmp], #32
        enc_round       \state, K6
        enc_round       \state, K7
        ld1             {K6.4s-K7.4s}, [\tmp]
        enc_round       \state, K8
        enc_round       \state, K9
        tbz             \rounds, #1, .Lout192_\@
        b               .Lout256_\@
        .previous
        .endm

        .align          6
        .macro          pmull_gcm_do_crypt, enc
        frame_push      1

        load_round_keys x7, x6, x8

        ld1             {SHASH.2d}, [x3], #16
        ld1             {HH.2d-HH4.2d}, [x3]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        ld1             {XL.2d}, [x4]

        cbz             x0, 3f                          // tag only?

        ldr             w8, [x5, #12]                   // load lower counter
CPU_LE( rev             w8, w8          )

0:      mov             w9, #4                          // max blocks per round
        add             x10, x0, #0xf
        lsr             x10, x10, #4                    // remaining blocks

        subs            x0, x0, #64
        csel            w9, w10, w9, mi
        add             w8, w8, w9

        bmi             1f
        ld1             {INP0.16b-INP3.16b}, [x2], #64
        .subsection     1
        /*
         * Populate the four input registers right to left with up to 63 bytes
         * of data, using overlapping loads to avoid branches.
         *
         *                INP0     INP1     INP2     INP3
         *  1 byte     |        |        |        |x       |
         * 16 bytes    |        |        |        |xxxxxxxx|
         * 17 bytes    |        |        |xxxxxxxx|x       |
         * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
         * etc etc
         *
         * Note that this code may read up to 15 bytes before the start of
         * the input. It is up to the calling code to ensure this is safe if
         * this happens in the first iteration of the loop (i.e., when the
         * input size is < 16 bytes)
         */
1:      mov             x15, #16
        ands            x19, x0, #0xf
        csel            x19, x19, x15, ne
        adr_l           x17, .Lpermute_table + 16

        sub             x11, x15, x19
        add             x12, x17, x11
        sub             x17, x17, x11
        ld1             {T1.16b}, [x12]
        sub             x10, x1, x11
        sub             x11, x2, x11

        cmp             x0, #-16
        csel            x14, x15, xzr, gt
        cmp             x0, #-32
        csel            x15, x15, xzr, gt
        cmp             x0, #-48
        csel            x16, x19, xzr, gt
        csel            x1, x1, x10, gt
        csel            x2, x2, x11, gt

        ld1             {INP0.16b}, [x2], x14
        ld1             {INP1.16b}, [x2], x15
        ld1             {INP2.16b}, [x2], x16
        ld1             {INP3.16b}, [x2]
        tbl             INP3.16b, {INP3.16b}, T1.16b
        b               2f
        .previous

2:      .if             \enc == 0
        bl              pmull_gcm_ghash_4x
        .endif

        bl              pmull_gcm_enc_4x

        tbnz            x0, #63, 6f
        st1             {INP0.16b-INP3.16b}, [x1], #64
        .if             \enc == 1
        bl              pmull_gcm_ghash_4x
        .endif
        bne             0b

3:      ldr             x10, [sp, #.Lframe_local_offset]
        cbz             x10, 5f                         // output tag?

        ld1             {INP3.16b}, [x10]               // load lengths[]
        mov             w9, #1
        bl              pmull_gcm_ghash_4x

        mov             w11, #(0x1 << 24)               // BE '1U'
        ld1             {KS0.16b}, [x5]
        mov             KS0.s[3], w11

        enc_block       KS0, x7, x6, x12

        ext             XL.16b, XL.16b, XL.16b, #8
        rev64           XL.16b, XL.16b
        eor             XL.16b, XL.16b, KS0.16b

        .if             \enc == 1
        st1             {XL.16b}, [x10]                 // store tag
        .else
        ldp             x11, x12, [sp, #40]             // load tag pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]                // load supplied tag
        add             x17, x17, x12
        ld1             {KS1.16b}, [x17]                // load permute vector

        cmeq            XL.16b, XL.16b, KS0.16b         // compare tags
        mvn             XL.16b, XL.16b                  // -1 for fail, 0 for pass
        tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only
        sminv           b0, XL.16b                      // signed minimum across XL
        smov            w0, v0.b[0]                     // return b0
        .endif

4:      frame_pop
        ret

5:
CPU_LE( rev             w8, w8          )
        str             w8, [x5, #12]                   // store lower counter
        st1             {XL.2d}, [x4]
        b               4b

6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
        sub             x17, x17, x19, lsl #1

        cmp             w9, #1
        beq             7f
        .subsection     1
7:      ld1             {INP2.16b}, [x1]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b
        b               8f
        .previous

        st1             {INP0.16b}, [x1], x14
        st1             {INP1.16b}, [x1], x15
        st1             {INP2.16b}, [x1], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b
8:      st1             {INP3.16b}, [x1]

        .if             \enc == 1
        ld1             {T1.16b}, [x17]
        tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
        bl              pmull_gcm_ghash_4x
        .endif
        b               3b
        .endm

        /*
         * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
SYM_FUNC_END(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
SYM_FUNC_END(pmull_gcm_decrypt)
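
        //
        // Fold the blocks in INP0-INP3 into the GHASH accumulator XL,
        // using the same 4-way aggregation as the __pmull_ghash fast path.
        // w9 gives the number of valid blocks; with fewer than four, the
        // tail entered via the tbz below multiplies by correspondingly
        // lower powers of H.
        //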
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57

        rev64           T1.16b, INP0.16b
        rev64           T2.16b, INP1.16b
        rev64           TT3.16b, INP2.16b
        rev64           TT4.16b, INP3.16b

        ext             XL.16b, XL.16b, XL.16b, #8

        tbz             w9, #2, 0f                      // <4 blocks?
        .subsection     1
0:      movi            XH2.16b, #0
        movi            XM2.16b, #0
        movi            XL2.16b, #0

        tbz             w9, #0, 1f                      // 2 blocks?
        tbz             w9, #1, 2f                      // 1 block?

        eor             T2.16b, T2.16b, XL.16b
        ext             T1.16b, T2.16b, T2.16b, #8
        b               .Lgh3

1:      eor             TT3.16b, TT3.16b, XL.16b
        ext             T2.16b, TT3.16b, TT3.16b, #8
        b               .Lgh2

2:      eor             TT4.16b, TT4.16b, XL.16b
        ext             IN1.16b, TT4.16b, TT4.16b, #8
        b               .Lgh1
        .previous

        eor             T1.16b, T1.16b, XL.16b
        ext             IN1.16b, T1.16b, T1.16b, #8

        pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
        eor             T1.16b, T1.16b, IN1.16b
        pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
        pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)

        ext             T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor             T2.16b, T2.16b, T1.16b
        pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
        pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
        pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
        pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
        pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
        pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
        pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
        pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
        pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)

        eor             XH.16b, XH.16b, XH2.16b
        eor             XL.16b, XL.16b, XL2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
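
        //
        // Produce four blocks of AES-CTR keystream from the counter block
        // at [x5] and the per-block counter values w8-4 .. w8-1 (the caller
        // has already advanced w8), byte-swapped to big endian before
        // insertion, and xor the keystream into INP0-INP3. x6 points to the
        // AES round keys and x7 holds the number of rounds.
        //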
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ld1             {KS0.16b}, [x5]                 // load upper counter
        sub             w10, w8, #4
        sub             w11, w8, #3
        sub             w12, w8, #2
        sub             w13, w8, #1
        rev             w10, w10
        rev             w11, w11
        rev             w12, w12
        rev             w13, w13
        mov             KS1.16b, KS0.16b
        mov             KS2.16b, KS0.16b
        mov             KS3.16b, KS0.16b
        ins             KS0.s[3], w10                   // set lower counter
        ins             KS1.s[3], w11
        ins             KS2.s[3], w12
        ins             KS3.s[3], w13

        add             x10, x6, #96                    // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

        tbnz            x7, #2, .Lnot128
        .subsection     1
.Lnot128:
        ld1             {K8.4s-K9.4s}, [x10], #32
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        ld1             {K6.4s-K7.4s}, [x10]
        .irp            key, K8, K9
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        tbz             x7, #1, .Lout192
        b               .Lout256
        .previous

.Lout256:
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

.Lout192:
        enc_qround      KS0, KS1, KS2, KS3, KK

        aese            KS0.16b, KL.16b
        aese            KS1.16b, KL.16b
        aese            KS2.16b, KL.16b
        aese            KS3.16b, KL.16b

        eor             KS0.16b, KS0.16b, KM.16b
        eor             KS1.16b, KS1.16b, KM.16b
        eor             KS2.16b, KS2.16b, KM.16b
        eor             KS3.16b, KS3.16b, KM.16b

        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        eor             INP2.16b, INP2.16b, KS2.16b
        eor             INP3.16b, INP3.16b, KS3.16b

        ret
SYM_FUNC_END(pmull_gcm_enc_4x)

        .section        ".rodata", "a"
        .align          6
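        // A sliding window over runs of 0xff bytes and the identity bytes
        // 0x0-0xf: a 16-byte tbl/tbx index vector loaded at
        // .Lpermute_table + n selects or clears bytes so that partial
        // blocks can be shifted into place and their non-data bytes zeroed.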
.Lpermute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .previous