/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
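
        //
        // Two flavours of the 64x64 -> 128 bit carryless multiply are provided
        // below: __pmull_p64/__pmull2_p64 use the Crypto Extensions PMULL
        // instruction directly, while __pmull_p8/__pmull2_p8 emulate it using
        // only the baseline 8x8 -> 16 bit polynomial multiply, combining eight
        // partial products with shifts and masks (see __pmull_p8_tail below).
        //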
        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d

        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
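
        //
        // __pmull_p8_tail builds a 64x64 -> 128 bit carryless product out of
        // eight 8x8 -> 16 bit PMULLs: \ad and its byte-rotated copies (t3/t5/t7)
        // against \bd and its byte-rotated copies (\b1 .. \b4, precomputed in
        // __pmull_pre_p8), with the partial sums L/M/N/K shifted into place and
        // folded into \rq.
        //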
        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b

        .macro          __pmull_pre_p64
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        shl             MASK.2d, MASK.2d, #57

        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4

        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
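        //
        // Both this and the __pmull_reduce_p8 fallback below fold the 256-bit
        // product XH:XL back into 128 bits modulo the GHASH polynomial
        // x^128 + x^7 + x^2 + x + 1: here with two extra PMULLs against the
        // precomputed constant in MASK, there with the equivalent
        // shift-and-xor sequence.
        //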
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d

        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
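
        //
        // Generic GHASH update: absorb full 16-byte blocks from x2 into the
        // digest, multiplying by SHASH (and, on the p64 path, by the higher
        // powers HH..HH4 so four blocks can be folded in before a single
        // reduction).  \pn selects the p64 or p8 multiply defined above.
        //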
        .macro          __pmull_ghash, pn

        /* do the head block first, if supplied */

        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

2:      ld1             {T1.2d}, [x2], #16

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
        __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
        __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)
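
        // For reference, the operation these routines accelerate is the
        // standard GHASH update from the GCM specification.  A minimal,
        // unoptimized bitwise C sketch is shown below; it is illustrative
        // only (helper names are made up, u8/memcpy assumed from
        // <linux/types.h>/<linux/string.h>) and is not the kernel code path.
        /*
                // Z = X * Y in GF(2^128), bit/byte order as in the GCM spec
                static void gf128_mul(u8 z[16], const u8 x[16], const u8 y[16])
                {
                        u8 v[16], t[16] = { 0 };
                        int i, j;

                        memcpy(v, x, 16);
                        for (i = 0; i < 128; i++) {
                                int lsb = v[15] & 1;    // coefficient of x^127

                                if (y[i / 8] & (0x80 >> (i % 8)))  // bit i of y
                                        for (j = 0; j < 16; j++)
                                                t[j] ^= v[j];
                                for (j = 15; j > 0; j--)           // v = v * x
                                        v[j] = (v[j] >> 1) | (v[j - 1] << 7);
                                v[0] >>= 1;
                                if (lsb)
                                        v[0] ^= 0xe1;  // reduce mod x^128+x^7+x^2+x+1
                        }
                        memcpy(z, t, 16);
                }

                // dg = (dg ^ block) * H, repeated for each 16-byte block
                static void ghash_update_ref(u8 dg[16], const u8 h[16],
                                             const u8 *src, int blocks)
                {
                        int j;

                        while (blocks--) {
                                for (j = 0; j < 16; j++)
                                        dg[j] ^= *src++;
                                gf128_mul(dg, dg, h);
                        }
                }
         */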
        .macro          load_round_keys, rounds, rk, tmp
        ld1             {K0.4s-K3.4s}, [\rk]
        ld1             {K4.4s-K5.4s}, [\tmp]
        add             \tmp, \rk, \rounds, lsl #4
        ld1             {KK.4s-KM.4s}, [\tmp]

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b

        .macro          enc_qround, s0, s1, s2, s3, key

        .macro          enc_block, state, rounds, rk, tmp
        ld1             {K6.4s-K7.4s}, [\tmp], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_round       \state, \key

        tbnz            \rounds, #2, .Lnot128_\@

        aese            \state\().16b, KL.16b
        eor             \state\().16b, \state\().16b, KM.16b

        ld1             {K8.4s-K9.4s}, [\tmp], #32

        ld1             {K6.4s-K7.4s}, [\tmp]

        tbz             \rounds, #1, .Lout192_\@
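
        //
        // Shared body for pmull_gcm_encrypt() and pmull_gcm_decrypt(): \enc
        // selects whether GHASH runs over the ciphertext produced here
        // (encrypt) or over the incoming data (decrypt).  It handles full
        // 4-block strides, a partial trailing block via the permute table,
        // and final tag generation/verification.
        //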
        .macro          pmull_gcm_do_crypt, enc
        load_round_keys x7, x6, x8

        ld1             {SHASH.2d}, [x3], #16
        ld1             {HH.2d-HH4.2d}, [x3]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        cbz             x0, 3f                  // tag only?

        ldr             w8, [x5, #12]           // load lower counter

0:      mov             w9, #4                  // max blocks per round
        lsr             x10, x10, #4            // remaining blocks

        ld1             {INP0.16b-INP3.16b}, [x2], #64

        /*
         * Populate the four input registers right to left with up to 63 bytes
         * of data, using overlapping loads to avoid branches.
         *
         *                INP0     INP1     INP2     INP3
         *  16 bytes |        |        |        |xxxxxxxx|
         *  17 bytes |        |        |xxxxxxxx|x       |
         *  47 bytes |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
         *
         * Note that this code may read up to 15 bytes before the start of
         * the input. It is up to the calling code to ensure this is safe if
         * this happens in the first iteration of the loop (i.e., when the
         * input size is < 16 bytes)
         */

        csel            x19, x19, x15, ne
        adr_l           x17, .Lpermute_table + 16

        csel            x14, x15, xzr, gt
        csel            x15, x15, xzr, gt
        csel            x16, x19, xzr, gt

        ld1             {INP0.16b}, [x2], x14
        ld1             {INP1.16b}, [x2], x15
        ld1             {INP2.16b}, [x2], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b

        bl              pmull_gcm_ghash_4x

        st1             {INP0.16b-INP3.16b}, [x1], #64

        bl              pmull_gcm_ghash_4x

3:      ldr             x10, [sp, #.Lframe_local_offset]
        cbz             x10, 5f                 // output tag?

        ld1             {INP3.16b}, [x10]       // load lengths[]

        bl              pmull_gcm_ghash_4x

        mov             w11, #(0x1 << 24)       // BE '1U'

        enc_block       KS0, x7, x6, x12

        ext             XL.16b, XL.16b, XL.16b, #8

        eor             XL.16b, XL.16b, KS0.16b

        st1             {XL.16b}, [x10]         // store tag

        ldp             x11, x12, [sp, #40]     // load tag pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]        // load supplied tag

        ld1             {KS1.16b}, [x17]        // load permute vector

        cmeq            XL.16b, XL.16b, KS0.16b // compare tags
        mvn             XL.16b, XL.16b          // -1 for fail, 0 for pass

        tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only

        sminv           b0, XL.16b              // signed minimum across XL
        smov            w0, v0.b[0]             // return b0
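
        // The compare sequence above is a branchless, length-aware tag check:
        // 0 means the first 'authsize' bytes matched, a negative value means
        // mismatch.  A rough C equivalent (illustrative name only):
        /*
                static int tag_check_ref(const u8 calc[16], const u8 given[16],
                                         int authsize)
                {
                        u8 diff = 0;
                        int i;

                        for (i = 0; i < authsize; i++)
                                diff |= calc[i] ^ given[i];
                        return diff ? -1 : 0;
                }
         */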
        str             w8, [x5, #12]           // store lower counter

6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
        sub             x17, x17, x19, lsl #1

7:      ld1             {INP2.16b}, [x1]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b

        st1             {INP0.16b}, [x1], x14
        st1             {INP1.16b}, [x1], x15
        st1             {INP2.16b}, [x1], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b

8:      st1             {INP3.16b}, [x1]

        tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
        bl              pmull_gcm_ghash_4x

        /*
         * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
SYM_FUNC_END(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
SYM_FUNC_END(pmull_gcm_decrypt)
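
        //
        // Fold up to four blocks (INP0-INP3) into the GHASH accumulator XL in
        // one pass: the block carrying the current accumulator is multiplied
        // by HH4 (the highest precomputed power of H) and the most recent one
        // by SHASH, and the partial products are summed before a single
        // reduction.
        //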
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
        shl             MASK.2d, MASK.2d, #57

        rev64           T1.16b, INP0.16b
        rev64           T2.16b, INP1.16b
        rev64           TT3.16b, INP2.16b
        rev64           TT4.16b, INP3.16b

        ext             XL.16b, XL.16b, XL.16b, #8

        tbz             w9, #2, 0f              // <4 blocks?

        tbz             w9, #0, 1f              // 2 blocks?
        tbz             w9, #1, 2f              // 1 block?

        eor             T2.16b, T2.16b, XL.16b
        ext             T1.16b, T2.16b, T2.16b, #8

1:      eor             TT3.16b, TT3.16b, XL.16b
        ext             T2.16b, TT3.16b, TT3.16b, #8

2:      eor             TT4.16b, TT4.16b, XL.16b
        ext             IN1.16b, TT4.16b, TT4.16b, #8

        eor             T1.16b, T1.16b, XL.16b
        ext             IN1.16b, T1.16b, T1.16b, #8

        pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
        eor             T1.16b, T1.16b, IN1.16b
        pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
        pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)

        ext             T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor             T2.16b, T2.16b, T1.16b
        pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
        pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
        pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
        pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
        pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
        pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
        pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
        pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
        pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)

        eor             XH.16b, XH.16b, XH2.16b
        eor             XL.16b, XL.16b, XL2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

SYM_FUNC_END(pmull_gcm_ghash_4x)
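
        //
        // Generate four consecutive blocks of AES-CTR keystream in KS0-KS3
        // (the AES rounds of all four states are interleaved via enc_qround)
        // and XOR them into INP0-INP3.
        //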
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ld1             {KS0.16b}, [x5]                 // load upper counter

        ins             KS0.s[3], w10                   // set lower counter

        add             x10, x6, #96                    // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key

        tbnz            x7, #2, .Lnot128

        ld1             {K8.4s-K9.4s}, [x10], #32
        enc_qround      KS0, KS1, KS2, KS3, \key

        ld1             {K6.4s-K7.4s}, [x10]
        enc_qround      KS0, KS1, KS2, KS3, \key

        enc_qround      KS0, KS1, KS2, KS3, \key

        enc_qround      KS0, KS1, KS2, KS3, KK

        eor             KS0.16b, KS0.16b, KM.16b
        eor             KS1.16b, KS1.16b, KM.16b
        eor             KS2.16b, KS2.16b, KM.16b
        eor             KS3.16b, KS3.16b, KM.16b

        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        eor             INP2.16b, INP2.16b, KS2.16b
        eor             INP3.16b, INP3.16b, KS3.16b

SYM_FUNC_END(pmull_gcm_enc_4x)

        .section        ".rodata", "a"
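
        // Sliding window used with tbl/tbx to shift, mask and merge the bytes
        // of a partial final block: the 0xff entries are out-of-range indices,
        // so tbl produces zero bytes for them and tbx leaves the destination
        // bytes unchanged.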
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte           0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte           0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf