/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm
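        //
        // For reference, a bit-serial C model of what a single PMULL/PMULL2
        // computes per 64-bit lane: a carryless (polynomial) 64x64 -> 128 bit
        // multiplication.  Illustrative sketch only, not part of the build,
        // and clmul64() is our name for it, not a kernel API.
        //
        //      #include <stdint.h>
        //
        //      static void clmul64(uint64_t a, uint64_t b,
        //                          uint64_t *lo, uint64_t *hi)
        //      {
        //              uint64_t l = 0, h = 0;
        //              int i;
        //
        //              for (i = 0; i < 64; i++) {
        //                      if ((b >> i) & 1) {
        //                              l ^= a << i;
        //                              if (i)
        //                                      h ^= a >> (64 - i);
        //                      }
        //              }
        //              *lo = l;
        //              *hi = h;
        //      }
        //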
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm
        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm
        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm
        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm
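        //
        // The __pmull_p8 helpers above synthesise the same 64x64 -> 128 bit
        // carryless multiply from the 8x8 -> 16 bit PMULL that every ARMv8
        // CPU provides.  A C model of the underlying idea (hypothetical
        // helper names, little-endian host assumed, and not a line-by-line
        // match for the masking/realignment steps): every byte cross product
        // a[i]*b[j] lands at byte offset i + j of the result; the assembly
        // gathers those products into groups (D, E..K), each one vector
        // PMULL of a byte-rotated operand, and recombines them (L, M, N)
        // with the uzp/zip/ext shuffles.
        //
        //      #include <stdint.h>
        //      #include <string.h>
        //
        //      static uint16_t clmul8(uint8_t a, uint8_t b)
        //      {
        //              uint16_t r = 0;
        //              int i;
        //
        //              for (i = 0; i < 8; i++)
        //                      if ((b >> i) & 1)
        //                              r ^= (uint16_t)a << i;
        //              return r;
        //      }
        //
        //      static void clmul64_by_bytes(uint64_t a, uint64_t b,
        //                                   uint64_t *lo, uint64_t *hi)
        //      {
        //              uint8_t res[16] = { 0 };
        //              int i, j;
        //
        //              for (i = 0; i < 8; i++)
        //                      for (j = 0; j < 8; j++) {
        //                              uint16_t p = clmul8(a >> (8 * i),
        //                                                  b >> (8 * j));
        //
        //                              res[i + j] ^= (uint8_t)p;
        //                              res[i + j + 1] ^= (uint8_t)(p >> 8);
        //                      }
        //
        //              memcpy(lo, res, 8);
        //              memcpy(hi, res + 8, 8);
        //      }
        //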
        .macro          __pmull_pre_p64
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm
        .macro          __pmull_pre_p8
        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm
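        //
        // Note: perm1/perm2/perm3 (plus the fourth rotation left in T1) are
        // tbl index vectors that rotate each 64-bit half of SHASH by 1, 2, 3
        // and 4 bytes, playing the same role for the high-half multiply that
        // the ext-based rotations play in __pmull_p8.  sh1-sh4 and ss1-ss4
        // are thus the B1-B4 operands of __pmull_p8_tail, precomputed once
        // per call instead of once per processed block.
        //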
        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm
        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm
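        //
        // Both reductions fold the 256-bit Karatsuba result back into 128
        // bits modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1:
        // __pmull_reduce_p64 with two multiplies by a folded form of the
        // polynomial (the MASK constant set up by __pmull_pre_p64),
        // __pmull_reduce_p8 with the equivalent shift/XOR sequence.  For
        // reference, a bit-serial C model of the full GF(2^128) multiply-
        // and-reduce as specified for GCM (illustrative sketch, not kernel
        // code, and not a model of the lane/byte order handling used by the
        // SIMD code in this file):
        //
        //      #include <stdint.h>
        //      #include <string.h>
        //
        //      static void ghash_mul(uint8_t r[16], const uint8_t a[16],
        //                            const uint8_t b[16])
        //      {
        //              uint8_t v[16];
        //              int i, j, carry;
        //
        //              memcpy(v, b, 16);
        //              memset(r, 0, 16);
        //
        //              for (i = 0; i < 128; i++) {
        //                      if ((a[i / 8] >> (7 - i % 8)) & 1)
        //                              for (j = 0; j < 16; j++)
        //                                      r[j] ^= v[j];
        //
        //                      carry = v[15] & 1;
        //                      for (j = 15; j > 0; j--)
        //                              v[j] = (v[j] >> 1) | (v[j - 1] << 7);
        //                      v[0] >>= 1;
        //                      if (carry)
        //                              v[0] ^= 0xe1;   // x^128 = x^7+x^2+x+1
        //              }
        //      }
        //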
        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        b               1f

0:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

1:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

        st1             {XL.2d}, [x1]
        ret
        .endm
        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        __pmull_ghash   p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        __pmull_ghash   p8
ENDPROC(pmull_ghash_update_p8)
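        /*
         * The a1 * b1 / a0 * b0 / (a1 + a0)(b1 + b0) steps in the multiply
         * above are plain Karatsuba over GF(2)[x]: a 128x128 -> 256 bit
         * carryless multiply built from three 64x64 multiplies.  A C model
         * of that step alone (hypothetical names, not kernel code; clmul64()
         * is the bit-serial model sketched near __pmull_p64 above):
         *
         *      struct u128 { uint64_t lo, hi; };
         *
         *      static void karatsuba128(struct u128 a, struct u128 b,
         *                               struct u128 *lo, struct u128 *hi)
         *      {
         *              uint64_t xl_l, xl_h, xh_l, xh_h, xm_l, xm_h;
         *
         *              clmul64(a.lo, b.lo, &xl_l, &xl_h);      // XL = a0 * b0
         *              clmul64(a.hi, b.hi, &xh_l, &xh_h);      // XH = a1 * b1
         *              clmul64(a.lo ^ a.hi, b.lo ^ b.hi,       // XM
         *                      &xm_l, &xm_h);
         *
         *              xm_l ^= xl_l ^ xh_l;    // fold XL and XH into XM
         *              xm_h ^= xl_h ^ xh_h;
         *
         *              lo->lo = xl_l;          // 256-bit result: XH:XL with
         *              lo->hi = xl_h ^ xm_l;   // XM added in at a 64-bit
         *              hi->lo = xh_l ^ xm_h;   // offset, ready for reduction
         *              hi->hi = xh_h;
         *      }
         */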
        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
        ld1             {v17.4s-v18.4s}, [\rk], #32
1111:   ld1             {v19.4s-v20.4s}, [\rk], #32
2222:   ld1             {v21.4s-v24.4s}, [\rk], #64
        ld1             {v25.4s-v28.4s}, [\rk], #64
        ld1             {v29.4s-v31.4s}, [\rk]
        .endm
        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm
        .macro          enc_block, state, rounds
        cmp             \rounds, #12
        b.lo            2222f           /* 128 bits */
        b.eq            1111f           /* 192 bits */
        enc_round       \state, v17
        enc_round       \state, v18
1111:   enc_round       \state, v19
        enc_round       \state, v20
2222:   .irp            key, v21, v22, v23, v24, v25, v26, v27, v28, v29
        enc_round       \state, \key
        .endr
        aese            \state\().16b, v30.16b
        eor             \state\().16b, \state\().16b, v31.16b
        .endm
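        //
        // The key schedule is expected as (rounds + 1) consecutive 16-byte
        // round keys.  load_round_keys places them so that the final
        // AddRoundKey value always lands in v31, whatever the key size:
        // AES-256 fills v17-v31, AES-192 fills v19-v31, AES-128 fills
        // v21-v31.  enc_block mirrors that layout: the extra AES-256/192
        // rounds run first, then the nine common aese+aesmc rounds from the
        // .irp block (v21-v29), then the MixColumns-less final round (aese
        // with v30 followed by eor with v31).
        //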
        .macro          pmull_gcm_do_crypt, enc
        ld1             {SHASH.2d}, [x4]
        ld1             {XL.2d}, [x1]
        ldr             x8, [x5, #8]                    // load lower counter

        movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        .if             \enc == 1
        ld1             {KS.16b}, [x7]
        .endif
0:      ld1             {CTR.8b}, [x5]                  // load upper counter
        ld1             {INP.16b}, [x3], #16
        rev             x9, x8
        add             x8, x8, #1
        sub             w0, w0, #1
        ins             CTR.d[1], x9                    // set lower counter

        .if             \enc == 1
        eor             INP.16b, INP.16b, KS.16b        // encrypt input
        st1             {INP.16b}, [x2], #16
        .endif

        rev64           T1.16b, INP.16b
        cmp             w6, #12
        b.ge            2f                              // AES-192/256?
1:      enc_round       CTR, v21

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8

        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        pmull2          XH.1q, SHASH.2d, XL.2d          // a1 * b1
        eor             T1.16b, T1.16b, XL.16b

        pmull           XL.1q, SHASH.1d, XL.1d          // a0 * b0
        pmull           XM.1q, SHASH2.1d, T1.1d         // (a1 + a0)(b1 + b0)

        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, XL.16b, XH.16b
        eor             XM.16b, XM.16b, T1.16b

        eor             XM.16b, XM.16b, T2.16b
        pmull           T2.1q, XL.1d, MASK.1d

        eor             XL.16b, XM.16b, T2.16b

        ext             T2.16b, XL.16b, XL.16b, #8

        aese            CTR.16b, v30.16b

        pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b

        eor             KS.16b, CTR.16b, v31.16b

        eor             XL.16b, XL.16b, T2.16b
        .if             \enc == 0
        eor             INP.16b, INP.16b, KS.16b
        st1             {INP.16b}, [x2], #16
        .endif

        str             x8, [x5, #8]                    // store lower counter
2:      b.eq            3f                              // AES-192?
3:      enc_round       CTR, v19
        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds, u8 ks[])
         */
ENTRY(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
ENDPROC(pmull_gcm_encrypt)
        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds)
         */
ENTRY(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
ENDPROC(pmull_gcm_decrypt)
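        /*
         * pmull_gcm_encrypt and pmull_gcm_decrypt both expand
         * pmull_gcm_do_crypt: the AES rounds applied to the counter block
         * (enc_round CTR, v2x, then aese with v30 and eor with v31) are
         * interleaved with the PMULL-based GHASH multiply so that the two
         * dependency chains can execute in parallel.  As in any GCM
         * implementation, GHASH is computed over the ciphertext: the
         * encrypt path applies the keystream to INP before it is hashed,
         * the decrypt path hashes INP as loaded and applies the keystream
         * afterwards.
         */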
        /*
         * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
         */
ENTRY(pmull_gcm_encrypt_block)
        cbz             x2, 0f
        load_round_keys w3, x2
0:      ld1             {v0.16b}, [x1]
        enc_block       v0, w3
        st1             {v0.16b}, [x0]
        ret
ENDPROC(pmull_gcm_encrypt_block)
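        /*
         * Hypothetical caller sketch (not taken from the kernel sources):
         * encrypting a single block, e.g. to derive the GHASH key H as the
         * encryption of the all-zero block.  rk points at the expanded key
         * laid out as described for load_round_keys; the NEON unit must be
         * claimed around the call.
         *
         *      u8 h[16] = { 0 };
         *
         *      kernel_neon_begin();
         *      pmull_gcm_encrypt_block(h, h, rk, num_rounds);
         *      kernel_neon_end();
         */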