/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

	.fpu		crypto-neon-fp-armv8
	// \b1-\b4 are ignored here; they only exist so that this macro and
	// __pmull_p8 below can be invoked with the same argument list.
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
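	/*
	 * For reference, the operation implemented by this macro is a plain
	 * 64x64 -> 128 bit carry-less (polynomial) multiplication over GF(2).
	 * A minimal, unoptimized C sketch of the same operation (hypothetical
	 * helper, not part of this file) would be:
	 *
	 *	static void clmul_64x64(u64 a, u64 b, u64 res[2])
	 *	{
	 *		u64 lo = 0, hi = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 64; i++) {
	 *			if (b & (1ULL << i)) {
	 *				lo ^= a << i;
	 *				if (i)
	 *					hi ^= a >> (64 - i);
	 *			}
	 *		}
	 *		res[0] = lo;
	 *		res[1] = hi;
	 *	}
	 *
	 * The macro below builds the same 128-bit product out of 8x8 -> 16 bit
	 * vmull.p8 partial products, which is why the shifted copies A1..A3
	 * and B1..B4 of the operands are needed.
	 */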
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	vext.8		t4l, \bd, \bd, #1	@ B1
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	vext.8		t3l, \bd, \bd, #2	@ B2
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	vext.8		t4l, \bd, \bd, #3	@ B3
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	vext.8		t3l, \bd, \bd, #4	@ B4
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vext.8		t0q, t0q, t0q, #15
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	// PMULL (64x64 -> 128) based reduction for CPUs that can do
	// it in a single instruction.
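	// The 256-bit product (spread across XL, XM and XH) is folded back
	// to 128 bits modulo the GHASH polynomial
	//   g(x) = x^128 + x^7 + x^2 + x + 1
	// MASK holds the bit-reflected low-order coefficients of g(x),
	// positioned in the top bits of each 64-bit lane by the
	// 'vshl.u64 MASK, MASK, #57' in the surrounding code.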
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L

	vmull.p64	XL, T1_H, MASK
	// Alternative reduction for CPUs that lack support for the
	// 64x64 -> 128 PMULL instruction
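	// This variant performs the same folding using shifts and XORs only:
	// since g(x) is sparse, multiplying by its low-order part needs just
	// a handful of shifted copies of the operand, so no 64x64 polynomial
	// multiply instruction is required.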
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H
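	// ghash_update: hash (and optionally en/decrypt) the blocks at r2.
	// \pn selects the p64 or p8 multiply macros above, \enc names the
	// _4x en/decryption hook used in the four-block loop, \aggregate
	// enables that aggregated path, and \head controls whether a
	// supplied head block is hashed before the main loop.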
	.macro		ghash_update, pn, enc, aggregate=1, head=1

	/* do the head block first, if supplied */

	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	\enc\()_4x	XL2, XM2, T2, T3

	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vshl.u64	MASK, MASK, #57

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	vld1.8		{XL2-XM2}, [r2]!

2:	vld1.8		{T1}, [r2]!

	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57

3:	/* multiply XL by SHASH in GF(2^128) */
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
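	/*
	 * Per AAPCS, 'blocks' arrives in r0, 'dg' in r1, 'src' in r2, 'k' in
	 * r3 and 'head' on the stack, which matches how r0, r2 and r3 are
	 * used in the code below.
	 */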
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vshl.u64	MASK, MASK, #57
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff
ENDPROC(pmull_ghash_update_p8)
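	/*
	 * The macros below generate the AES-CTR key stream using the ARMv8
	 * AES instructions: 'round' applies a single AES round to each
	 * register in \regs, and 'aes_encrypt' walks the key schedule at
	 * \rkp (loaded pair by pair below) for the given number of \rounds.
	 */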
	.macro		round, rk:req, regs:vararg

	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!

	vld1.8		{ek0}, [\rkp, :128]!
	vld1.8		{ek1}, [\rkp, :128]!

	vld1.8		{ek0}, [\rkp, :128]!
	vld1.8		{ek1}, [\rkp, :128]!

	vld1.8		{ek0}, [\rkp, :128]!
	vld1.8		{ek1}, [\rkp, :128]!

	vld1.8		{ek0}, [\rkp, :128]

	vld1.8		{ctr0}, [r5]			// load 12 byte IV
	vext.8		ctr1, ctr1, ctr1, #4

	aes_encrypt	ip, r6, e0
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	vext.8		ctr1, ctr1, ctr1, #4

	aes_encrypt	ip, r6, e0, e1, e2, e3
ENDPROC(pmull_aes_encrypt_4x)

pmull_aes_encrypt_final:
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24			// BE #1 for the tag

	aes_encrypt	ip, r6, e0, e1
ENDPROC(pmull_aes_encrypt_final)
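	/*
	 * enc_4x/dec_4x tie the key stream into the GCM data flow: both call
	 * pmull_aes_encrypt_4x to produce four blocks of key stream and XOR
	 * it into the data. For encryption, the resulting ciphertext
	 * (\in0-\in3) is stored and also fed back into GHASH; for decryption,
	 * the incoming ciphertext is hashed and the recovered plaintext
	 * (e0-e3) is stored.
	 */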
	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
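	/*
	 * The first four arguments arrive in r0-r3; dst, iv, rounds and
	 * counter are passed on the stack and fetched into r4-r7 below (the
	 * #24/#32 offsets allow for state saved on entry).
	 */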
ENTRY(pmull_gcm_encrypt)
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
ENDPROC(pmull_gcm_encrypt)
	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
ENDPROC(pmull_gcm_decrypt)
	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
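	/*
	 * Handles the trailing partial block: 'bytes' gives its length,
	 * 'head' optionally supplies a block that must be hashed first, and
	 * the finished authentication tag is stored at the end of the
	 * routine ("store tag" below).
	 */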
ENTRY(pmull_gcm_enc_final)
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{SHASH}, [r3, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57

	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vext.8		XL, XL, XL, #8

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag
ENDPROC(pmull_gcm_enc_final)
	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
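	/*
	 * Returns 0 in r0 if the computed tag matches the first authsize
	 * bytes of otag, and a nonzero value otherwise; the comparison is
	 * carried out with vector compares and a pairwise minimum rather
	 * than an early-exit byte loop.
	 */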
ENTRY(pmull_gcm_dec_final)
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{SHASH}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57

	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vext.8		XL, XL, XL, #8

	ldrd		r2, r3, [sp, #40]	// otag and authsize

	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0
ENDPROC(pmull_gcm_dec_final)
	.section	".rodata", "a", %progbits
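	/*
	 * Byte permutation table used with vtbl to extract the valid bytes
	 * of a partial final block: indexes into the 0x00-0x0f window select
	 * data bytes, while the surrounding 0xff entries make vtbl produce
	 * zero bytes outside the valid region.
	 */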
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff