 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d
	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
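
	/*
	 * For reference: the p8 fallback above composes a 64x64 -> 128 bit
	 * carry-less multiply out of 8-bit polynomial multiplies, shifts and
	 * XORs. A minimal C model of the operation it emulates is sketched
	 * below (hypothetical helper, not part of the build; it assumes a
	 * compiler providing __uint128_t):
	 *
	 *	static void clmul64(uint64_t a, uint64_t b,
	 *			    uint64_t *hi, uint64_t *lo)
	 *	{
	 *		__uint128_t r = 0;
	 *		int i;
	 *
	 *		// XOR in a shifted copy of 'a' for every set bit of 'b'
	 *		for (i = 0; i < 64; i++)
	 *			if ((b >> i) & 1)
	 *				r ^= (__uint128_t)a << i;
	 *		*hi = (uint64_t)(r >> 64);
	 *		*lo = (uint64_t)r;
	 *	}
	 */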
	.macro		__pmull_pre_p64
	shl		MASK.2d, MASK.2d, #57

	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d

	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
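
	/*
	 * Informational note, not generated code: both reductions compute
	 * the remainder of the 256-bit product modulo the GHASH polynomial
	 * g(x) = x^128 + x^7 + x^2 + x + 1, i.e. they rely on the identity
	 *
	 *	x^128 = x^7 + x^2 + x + 1	(mod g(x))
	 *
	 * In the bit-reflected representation used by GHASH, the x, x^2 and
	 * x^7 terms show up as the shift counts above: left shifts by 63,
	 * 62 and 57 (= 64 - 1, 64 - 2, 64 - 7) and right shifts that
	 * accumulate to 1, 2 and 7. __pmull_reduce_p64 performs the same
	 * fold with a single carry-less multiply by the precomputed
	 * constant held in MASK.
	 */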
	.macro		__pmull_ghash, pn
0:	ld1		{SHASH.2d}, [x22]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	/* do the head block first, if supplied */

1:	ld1		{T1.2d}, [x21], #16

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	if_will_cond_yield_neon

3:	st1		{XL.2d}, [x20]
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
ENTRY(pmull_ghash_update_p64)
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
ENDPROC(pmull_ghash_update_p8)
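
	/*
	 * Informational sketch of what the update routine computes, as plain
	 * C (not part of the build). It follows the textbook GHASH definition
	 * from the GCM spec, operating on big-endian 128-bit quantities and
	 * on the raw hash key H rather than the (possibly preprocessed)
	 * representation held in struct ghash_key; the names are made up.
	 *
	 *	struct gf128 { uint64_t hi, lo; };	// most significant half first
	 *
	 *	static struct gf128 gf128_mul(struct gf128 x, struct gf128 y)
	 *	{
	 *		struct gf128 z = { 0, 0 }, v = y;
	 *		int i;
	 *
	 *		for (i = 0; i < 128; i++) {
	 *			// bit i of x, most significant bit first
	 *			uint64_t xi = i < 64 ? (x.hi >> (63 - i)) & 1
	 *					     : (x.lo >> (127 - i)) & 1;
	 *			uint64_t lsb = v.lo & 1;
	 *
	 *			if (xi) {
	 *				z.hi ^= v.hi;
	 *				z.lo ^= v.lo;
	 *			}
	 *			// v = v * x modulo x^128 + x^7 + x^2 + x + 1
	 *			v.lo = (v.lo >> 1) | (v.hi << 63);
	 *			v.hi >>= 1;
	 *			if (lsb)
	 *				v.hi ^= 0xe100000000000000ULL;
	 *		}
	 *		return z;
	 *	}
	 *
	 *	// dg <- (dg ^ block) * H for each 16-byte block of src
	 *	static void ghash_update_ref(int blocks, struct gf128 *dg,
	 *				     const struct gf128 *src,
	 *				     struct gf128 h)
	 *	{
	 *		while (blocks--) {
	 *			dg->hi ^= src->hi;
	 *			dg->lo ^= src->lo;
	 *			*dg = gf128_mul(*dg, h);
	 *			src++;
	 *		}
	 *	}
	 */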
	.macro		load_round_keys, rounds, rk
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
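
	/*
	 * Note: an AES key schedule has rounds + 1 round keys, i.e. 11/13/15
	 * for AES-128/192/256 (rounds = 10/12/14), so v17-v31 can hold the
	 * largest schedule; the 192- and 128-bit cases simply skip the first
	 * two or four registers via the branches above.
	 */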
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b

	.macro		enc_block, state, rounds
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.macro		pmull_gcm_do_crypt, enc
	ldr		x8, [x5, #8]			// load lower counter

	load_round_keys	w7, x6

	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

0:	ld1		{CTR.8b}, [x5]			// load upper counter
	ld1		{INP.16b}, [x3], #16
	ins		CTR.d[1], x9			// set lower counter

	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x2], #16

	rev64		T1.16b, INP.16b

	b.ge		2f				// AES-192/256?

1:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	eor		XL.16b, XM.16b, T2.16b

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x2], #16

	str		x8, [x5, #8]			// store lower counter

2:	b.eq		3f				// AES-192?
3:	enc_round	CTR, v19
 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
 *			  struct ghash_key const *k, u8 ctr[],
 *			  int rounds, u8 ks[])
ENTRY(pmull_gcm_encrypt)
ENDPROC(pmull_gcm_encrypt)

 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
 *			  struct ghash_key const *k, u8 ctr[],
ENTRY(pmull_gcm_decrypt)
ENDPROC(pmull_gcm_decrypt)
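
	/*
	 * Informational per-block model of the encrypt/decrypt entry points
	 * above, in C-like pseudocode (helper names are made up, and H
	 * stands for the raw GHASH key rather than the representation kept
	 * in struct ghash_key):
	 *
	 *	ks = aes_encrypt(round_keys, rounds, counter);
	 *	counter++;				// lower counter half
	 *	if (encrypting) {
	 *		out = in ^ ks;
	 *		dg  = gf128_mul(dg ^ out, H);	// GHASH the ciphertext
	 *	} else {
	 *		dg  = gf128_mul(dg ^ in, H);	// input is the ciphertext
	 *		out = in ^ ks;
	 *	}
	 */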
 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
ENTRY(pmull_gcm_encrypt_block)
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
ENDPROC(pmull_gcm_encrypt_block)
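
	/*
	 * All entry points in this file use NEON registers, so kernel
	 * callers are expected to bracket them with kernel_neon_begin()/
	 * kernel_neon_end(), along the lines of (illustrative only):
	 *
	 *	kernel_neon_begin();
	 *	pmull_gcm_encrypt_block(dst, src, rk, rounds);
	 *	kernel_neon_end();
	 */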