/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
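/*
 * A note on the data layout used throughout this file: the core
 * transforms operate on eight AES blocks at a time. After the bitslice
 * transform, register qN no longer holds a block but a bit plane: bit N
 * of every byte of all eight inputs. The S-box can then be evaluated as
 * a fixed sequence of logic instructions on q0-q7, with no data
 * dependent table lookups or branches, which is what makes this
 * approach resistant to cache timing attacks (see the paper referenced
 * above for the full construction).
 */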

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro		__tbl, out, tbl, in, tmp
	.error		"__tbl needs temp register if out == tbl"
	vtbl.8		\out\()l, {\tbl}, \in\()l	// vtbl permutes D regs, so do each half
	vtbl.8		\out\()h, {\tmp}, \in\()h	// high half via the temp copy (out == tbl)
	vtbl.8		\out\()h, {\tbl}, \in\()h	// high half
	.macro		__ldr, out, sym
	vldr		\out\()h, \sym + 8

	.macro		__adr, reg, lbl
THUMB(	orr		\reg, \reg, #1		)	// keep the Thumb bit set for bx
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7	// S-box input linear layer

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7	// S-box output linear layer

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5	// inverse S-box input linear layer

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2	// inverse S-box output linear layer
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
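/*
 * The AES S-box is the composition of an inversion in GF(2^8) with an
 * affine transform (and the reverse order for the inverse S-box), so
 * the bit-sliced sbox/inv_sbox below consist of a change of basis on
 * the way in, the shared inv_gf256 inversion circuit, and a change of
 * basis on the way out. The affine constant 0x63 is not applied here;
 * aesbs_convert_key below folds it into the round keys instead (note
 * the 'compose .L63' constant).
 */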
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv	// MixColumns (InvMixColumns when \inv is set)
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12
	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
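/*
 * swapmove_2x below is a two-at-a-time version of the classic SWAPMOVE
 * bit permutation step, which the bitslice macro applies with shift
 * amounts 1, 2 and 4 to transpose the 8x8 bit matrix spread across the
 * eight NEON registers. As a scalar sketch (illustration only, not code
 * from this file), one SWAPMOVE of a and b by n bits under a mask is:
 *
 *	t  = ((b >> n) ^ a) & mask;
 *	a ^= t;
 *	b ^= t << n;
 *
 * i.e. the masked bits of a are exchanged with the bits n positions
 * higher up in b.
 */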
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	vshl.u64	\t0, \t0, #\n
	vshl.u64	\t1, \t1, #\n
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3

M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
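/*
 * Rough usage sketch for the C glue code (aes_expandkey(), the
 * kernel_neon_begin()/kernel_neon_end() bracketing and the rk buffer
 * size are assumptions about that glue, not defined in this file):
 *
 *	struct crypto_aes_ctx gen;
 *	u8 rk[13 * 8 * AES_BLOCK_SIZE + 32];	// bit-sliced key schedule
 *	int rounds = 6 + key_len / 4;		// 10/12/14 for AES-128/192/256
 *
 *	aes_expandkey(&gen, in_key, key_len);	// standard AES key schedule
 *	kernel_neon_begin();
 *	aesbs_convert_key(rk, gen.key_enc, rounds);
 *	kernel_neon_end();
 */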
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks
	vst1.8		{q7}, [r0, :128]!	// save round 0 key
	vld1.32		{q15}, [r1]!		// load next round key
	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!
	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
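/*
 * Layout of the converted key schedule produced above, as consumed by
 * the encrypt8/decrypt8 routines below: a 16 byte round 0 key, then
 * eight bit-sliced 16 byte vectors (128 bytes) per inner round, then a
 * 16 byte final round key, i.e. 16 + (rounds - 1) * 128 + 16 bytes in
 * total (which is what the bskey pointer arithmetic in aesbs_decrypt8
 * relies on).
 */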
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

	vld1.8		{q9}, [bskey, :128]!	// round 0 key
	veor		q10, q0, q9		// xor with round0 key
	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
	sub		rounds, rounds, #1

SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
	subs		rounds, rounds, #1
	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
	vld1.8		{q12}, [bskey, :128]	// last round key
	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

	add		bskey, bskey, rounds, lsl #7	// bskey += rounds * 128
	sub		bskey, bskey, #112		// point at the last round key
	vld1.8		{q9}, [bskey, :128]		// round 0 key
	sub		bskey, bskey, #128
	veor		q10, q0, q9		// xor with round0 key
	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
	sub		rounds, rounds, #1

ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
	subs		rounds, rounds, #1
	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key
	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
ENDPROC(aesbs_decrypt8)
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
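/*
 * Hedged sketch of how the C glue is expected to call these (the ctx
 * layout and the kernel_neon_begin()/kernel_neon_end() bracketing are
 * assumptions about that glue, not defined in this file):
 *
 *	kernel_neon_begin();
 *	aesbs_ecb_encrypt(dst, src, ctx->rk, ctx->rounds, nr_blocks);
 *	kernel_neon_end();
 *
 * A single call handles all nr_blocks blocks, running the bit-sliced
 * core on up to eight of them per pass.
 */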
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
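/*
 * The 'computed goto' idiom used throughout this file: ip holds the
 * address of the label that ends a straight-line run of per-block
 * instructions, lr holds the remaining block count (mod 8), and
 * subtracting lr scaled by the size of one per-block step (lsl #2 for
 * a single 4 byte instruction, lsl #3 for two, and so on) before the
 * bx makes execution enter the run part-way through, so that only the
 * last 'blocks' steps are executed when fewer than eight blocks are
 * left.
 */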
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[])
 */
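/*
 * Only CBC decryption is provided here: decryption parallelizes across
 * eight blocks because each plaintext depends only on ciphertext,
 * P[i] = Dec(rk, C[i]) ^ C[i-1] (with C[-1] = iv), whereas CBC
 * encryption chains every block into the next and cannot be batched
 * this way.
 */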
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
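/*
 * The vmov.32 sequence below belongs to the counter-update helper used
 * by aesbs_ctr_encrypt: it transfers the counter, maintained as four
 * 32-bit words in r7-r10, into the lanes of a NEON register (the
 * increment itself is carried out on the ARM registers between blocks).
 */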
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 ctr[], u8 final[])
 */
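/*
 * In CTR mode encryption and decryption are the same operation:
 * out[i] = in[i] ^ Enc(rk, ctr + i). The final[] argument caters for a
 * trailing partial block: when it is non-NULL one extra counter block
 * is processed (note the addne below) and the caller consumes its
 * output for the leftover bytes. A hedged caller-side sketch, with the
 * ctx, 'partial' and 'tail_buf' names invented for illustration:
 *
 *	kernel_neon_begin();
 *	aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, nr_blocks, ctr,
 *			  partial ? tail_buf : NULL);
 *	kernel_neon_end();
 */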
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0
	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
ENDPROC(aesbs_ctr_encrypt)
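/*
 * next_tweak multiplies the 128-bit XTS tweak by x in GF(2^128),
 * reducing modulo x^128 + x^7 + x^2 + x + 1: vshr.s64 #63 turns the
 * top bit of each 64-bit half into an all-ones mask, vand selects the
 * matching reduction constant from \const (0x87 for the upper half,
 * 0x01 for the lower half, composed from the 0x87 tweak mask vector in
 * __xts_prepare8), vadd.u64 doubles both halves, and the vext/veor
 * pair feeds the carry out of each half into the other half.
 */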
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[], int reorder_last_tweak)
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[], int reorder_last_tweak)
 */
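/*
 * In XTS each block is XORed with a per-block tweak both before and
 * after the ECB core, with consecutive tweaks related by the GF(2^128)
 * doubling above. __xts_prepare8 below precomputes up to eight tweaks
 * into the stack buffer addressed by r4 and writes the follow-on tweak
 * back through the iv pointer. The reorder_last_tweak flag lets the
 * glue code swap the order of the final two tweaks when it performs
 * ciphertext stealing.
 */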
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	bxlt		ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]

	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	ldr		r8, [sp, #32]		// reorder final tweak?
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)