/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
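	/*
	 * Helper macros: __tbl permutes the 16 bytes of \in through the table
	 * in \tbl using two vtbl.8 lookups (one per d-register half), taking
	 * a scratch register when the output aliases the table; __ldr loads a
	 * 16-byte constant into a q-register half by half; __adr takes the
	 * address of a local label, setting the Thumb bit when building for
	 * Thumb2.
	 */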
	.macro		__tbl, out, tbl, in, tmp
	.ifc		\out, \tbl
	.ifb		\tmp
	.error		__tbl needs temp register if out == tbl
	.endif
	vmov		\tmp, \tbl
	.endif
	vtbl.8		\out\()l, {\tbl}, \in\()l
	.ifc		\out, \tbl
	vtbl.8		\out\()h, {\tmp}, \in\()h
	.else
	vtbl.8		\out\()h, {\tbl}, \in\()h
	.endif
	.endm
	.macro		__ldr, out, sym
	vldr		\out\()l, \sym
	vldr		\out\()h, \sym + 8
	.endm
	.macro		__adr, reg, lbl
	adr		\reg, \lbl
THUMB(	orr		\reg, \reg, #1		)
	.endm
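	/*
	 * in_bs_ch/out_bs_ch and inv_in_bs_ch/inv_out_bs_ch apply the linear
	 * change of basis used before and after the shared GF(2^8) inverter,
	 * so that the same inversion code serves both the forward and the
	 * inverse S-box.
	 */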
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
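	/*
	 * GF(2^8) inversion is performed in a tower of fields, as described
	 * in the Kaesper/Schwabe paper: mul_gf4 and mul_gf4_n_gf4 are GF(2^2)
	 * multiplications (the latter doing two at once), mul_gf16_2 builds
	 * two GF(2^4) multiplications on top of them, and inv_gf256 combines
	 * these into an inversion of all eight bit slices in parallel.
	 */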
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
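	/*
	 * The forward and inverse S-box differ only in the basis changes
	 * applied around the shared inverter. The sbox macro leaves its
	 * outputs in a permuted register order (see the out_bs_ch operands),
	 * which is why the callers below refer to the state as
	 * q0, q1, q4, q6, q3, q7, q2, q5 after a pass through sbox/inv_sbox.
	 */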
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm
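	/*
	 * ShiftRows acts on every bit slice in the same way, so it reduces to
	 * a single byte permutation (\mask, precomputed as SR/SRM0 and
	 * ISR/ISRM0 below) applied to each slice with __tbl. The forward
	 * variant also fetches the next bit-sliced round key from [bskey] and
	 * folds it into the state along the way.
	 */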
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				    mask, t0, t1, t2, t3
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t0, \t0, \x0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t1, \t1, \x1
	__tbl		\x0, \t0, \mask
	veor		\t2, \t2, \x2
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t3, \t3, \x3
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t0, \t0, \x4
	veor		\t1, \t1, \x5
	__tbl		\x4, \t0, \mask
	veor		\t2, \t2, \x6
	__tbl		\x5, \t1, \mask
	veor		\t3, \t3, \x7
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.endm
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
					mask, t0, t1, t2, t3
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.endm
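	/*
	 * In the bit-sliced representation each q register holds one bit of
	 * every byte of all eight AES states, so MixColumns and its inverse
	 * reduce to XORs of byte-rotated copies of the slices: the vext.8
	 * instructions below rotate a slice by a whole number of 32-bit
	 * words, and the rotated copies are folded together with veor.
	 */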
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12

	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
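	// Each bit-sliced round key occupies 8 q-registers (128 bytes). The
	// post-indexed loads above advance bskey by 96 bytes, and the sub
	// winds it back by 224, i.e. one full round key before where it
	// started, so consecutive decryption rounds consume the key schedule
	// in reverse order.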
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
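	/*
	 * swapmove_2x is the classic SWAPMOVE bit permutation applied to two
	 * register pairs at once:
	 *
	 *	t  = ((b >> n) ^ a) & mask
	 *	a ^= t
	 *	b ^= t << n
	 *
	 * i.e. the bits of a selected by mask are exchanged with the bits of
	 * b that sit n positions higher.
	 */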
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	veor		\t0, \t0, \a0
	veor		\t1, \t1, \a1
	vand		\t0, \t0, \mask
	vand		\t1, \t1, \mask
	veor		\a0, \a0, \t0
	vshl.u64	\t0, \t0, #\n
	veor		\a1, \a1, \t1
	vshl.u64	\t1, \t1, #\n
	veor		\b0, \b0, \t0
	veor		\b1, \b1, \t1
	.endm
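	/*
	 * bitslice transposes, per byte position, the 8x8 bit matrix formed
	 * by eight 128-bit blocks: three swapmove passes at distances 1, 2
	 * and 4, with masks 0x55, 0x33 and 0x0f, leave bit i of every byte
	 * collected in the i'th register. The transform is its own inverse,
	 * which is why the same macro also converts back to the normal
	 * representation after the last round.
	 */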
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	vmov.i8		\t0, #0x55
	vmov.i8		\t1, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	vmov.i8		\t0, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm
M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
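	// Convert a key schedule produced by the generic AES key expansion
	// into the layout used by the routines below: a 16-byte round 0 key,
	// one 128-byte (8 x 16 bytes, one register per bit position)
	// bit-sliced key per inner round, and a 16-byte final round key with
	// the S-box constant 0x63 XORed in to account for the affine constant
	// of SubBytes, which the bit-sliced S-box does not add itself.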
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks

	vst1.8		{q7}, [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
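	/*
	 * aesbs_encrypt8: encrypt eight AES blocks held in q0-q7 using the
	 * bit-sliced key schedule pointed to by bskey. The round 0 key is
	 * XORed in and the bytes are permuted through M0SR (by its name, the
	 * bit-slicing reorder M0 combined with a ShiftRows step) before the
	 * state is bit-sliced; the round loop then applies ShiftRows,
	 * SubBytes and MixColumns entirely on the sliced state.
	 */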
aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
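	// SR is the ShiftRows byte permutation used by the inner rounds;
	// going by the naming shared with the OpenSSL bsaes code, SRM0 folds
	// the M0 reordering into the permutation used for the final round,
	// just before the state is un-bit-sliced.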
	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
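	/*
	 * aesbs_decrypt8: decrypt eight AES blocks held in q0-q7. The key
	 * schedule is consumed back to front: bskey is advanced by
	 * rounds * 128 bytes and rewound by 112, which lands on the 16-byte
	 * final round key stored at the end by aesbs_convert_key; that key
	 * provides the initial whitening, and the bit-sliced round keys are
	 * then walked in reverse (see inv_mix_cols above).
	 */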
aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks
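	// The "computed goto" idiom used throughout: ip is pointed at the end
	// of a run of identical per-block instructions and pulled back by the
	// per-block code size times the low three bits of the block count, so
	// that when fewer than eight blocks remain only the tail of the run
	// is executed; the same offset is applied to the matching load and
	// store runs, so the register slots used on both sides line up.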
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
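	// CBC decryption: up to eight ciphertext blocks are decrypted per
	// iteration and each result is XORed with the preceding ciphertext
	// block (the IV for the first one); q8 carries the IV for the next
	// iteration, which is written back through r6 when done.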
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
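	// Reload the lanes of the counter register from the 32-bit words kept
	// in r7-r10; the increment itself is carried out on those core
	// registers elsewhere in the CTR routine below.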
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
	 */
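	// A non-zero 'final' argument requests one extra pass through the
	// cipher for a trailing partial block: r5 is bumped so the extra
	// counter block takes the normal path, while the 'teq r4, #0' tests
	// below skip the regular load and store for that final block.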
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0

	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
ENDPROC(aesbs_ctr_encrypt)
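	/*
	 * next_tweak: multiply the XTS tweak in \in by x in GF(2^128), modulo
	 * x^128 + x^7 + x^2 + x + 1. Both 64-bit halves are doubled with
	 * vadd.u64; the bit shifted out of each half is recovered by
	 * broadcasting its sign bit, masking with the tweak constant (1 in
	 * one half for the carry between halves, 0x87 in the other for the
	 * reduction), swapping the halves with vext.8 #8 and XORing the
	 * result back in.
	 */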
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
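	// __xts_prepare8 computes the tweaks for up to eight blocks: starting
	// from the IV at [r7], successive tweaks are derived with next_tweak
	// and stashed in the scratch area reserved on the stack by
	// __xts_crypt (addressed through r4), so they can be XORed into the
	// data before and after the AES pass; the tweak following the last
	// block is written back as the next IV.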
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	__ldr		q15, .Lxts_mul_x	// load tweak mask

	sub		ip, ip, r4, lsl #5
	bxlt		ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]

0:	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
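	// __xts_crypt carves an aligned 128-byte scratch area out of the
	// stack for the eight tweaks, calls __xts_prepare8, runs the
	// eight-block en/decryption, then reloads the tweaks from the scratch
	// area and XORs them into the results before storing them.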
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes
	mov		sp, ip

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)