/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro		__tbl, out, tbl, in, tmp
	.error		__tbl needs temp register if out == tbl
	vtbl.8		\out\()l, {\tbl}, \in\()l
	vtbl.8		\out\()h, {\tmp}, \in\()h
	vtbl.8		\out\()h, {\tbl}, \in\()h
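	/*
	 * Explanatory note (added, not part of the original source): __tbl
	 * performs a 16-byte table lookup by running vtbl.8 on both
	 * d-register halves of a q register.  When the output register
	 * aliases the table register, the first lookup would clobber the
	 * table before the second one runs, so a scratch register must be
	 * supplied to hold a copy of the table; hence the .error above.
	 */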
	.macro		__ldr, out, sym
	vldr		\out\()h, \sym + 8
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3

	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2

	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2

	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3

	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
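	/*
	 * Explanatory note (added, not part of the original source):
	 * following the Kaesper/Schwabe construction referenced in the
	 * header, inv_gf256 computes the bitsliced multiplicative inverse
	 * in GF(2^8) via the tower field GF(((2^2)^2)^2), built from the
	 * GF(2^4) helpers above.  The sbox/inv_sbox macros below then
	 * realise
	 *
	 *	S(x) = A(x^-1) ^ 0x63
	 *
	 * i.e. the inverse followed by the AES affine transform, with the
	 * in/out basis-change macros mapping to and from the bitsliced
	 * representation.
	 */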
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12

	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224

	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8

	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n

	vshl.u64	\t0, \t0, #\n

	vshl.u64	\t1, \t1, #\n
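	/*
	 * Explanatory sketch (added, not part of the original source): each
	 * swapmove step is the classic bit-matrix transpose primitive,
	 * roughly equivalent to the following C on every 64-bit lane:
	 *
	 *	t  = ((b >> n) ^ a) & mask;
	 *	a ^= t;
	 *	b ^= t << n;
	 *
	 * which swaps the bits of a selected by mask with the bits of b that
	 * sit n positions higher.  The bitslice macro below applies it with
	 * n = 1, 2 and 4 to move the eight AES state registers into (and
	 * out of) bitsliced form.
	 */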
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3

M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
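	/*
	 * Illustrative usage from C (an assumption, not part of this file):
	 * the caller passes a conventionally expanded key schedule, which
	 * this routine rewrites into the bitsliced layout consumed by
	 * aesbs_encrypt8/aesbs_decrypt8, e.g.
	 *
	 *	u32 rk[AES_MAX_KEYLENGTH_U32];	// expanded round keys
	 *	u8  bskey[13 * 128 + 32];	// bitsliced key (size assumed)
	 *	int rounds = 6 + key_len / 4;	// 10/12/14 for AES-128/192/256
	 *
	 *	aesbs_convert_key(bskey, rk, rounds);
	 */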
ENTRY(aesbs_convert_key)
	vld1.32		{q7},  [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8,  #0x01		// bit masks

	vst1.8		{q7},  [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7,  #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
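	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 * the routines consume the whole block count, working on up to
	 * eight blocks per pass through the bitsliced core, e.g.
	 *
	 *	aesbs_ecb_encrypt(dst, src, bskey, rounds, nblocks);
	 *	aesbs_ecb_decrypt(dst, src, bskey, rounds, nblocks);
	 *
	 * with bskey produced by aesbs_convert_key() above.
	 */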
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8
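	/*
	 * Explanatory note on the "computed goto" idiom used throughout this
	 * file (added, not part of the original source): ip is assumed to
	 * point just past a straight-line run of eight 4-byte load/store
	 * instructions and lr to hold the residual block count, so
	 * subtracting lr, lsl #2 (4 bytes per instruction) from ip and
	 * branching there when fewer than eight blocks remain skips the
	 * leading instructions and executes only the last 'blocks' of them.
	 * The variants with other shift amounts do the same where each
	 * per-block step is longer than a single instruction.
	 */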
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
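	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 *
	 *	aesbs_cbc_decrypt(dst, src, bskey, rounds, nblocks, iv);
	 *
	 * iv holds the 16-byte IV on entry and, per the "store next round's
	 * iv" below, is updated for the following call, so chained calls
	 * over a long message can reuse the same buffer.
	 */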
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv

ENDPROC(aesbs_cbc_decrypt)
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int bytes, u8 ctr[])
	 */
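	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 * unlike the ECB/CBC/XTS entry points, this one takes a byte count
	 * rather than a block count (the permute-table tail handling below
	 * copes with a final partial block), e.g.
	 *
	 *	aesbs_ctr_encrypt(dst, src, bskey, rounds, nbytes, ctr);
	 *
	 * where ctr[] is the 16-byte counter block, assumed to be advanced
	 * by the routine for subsequent calls.
	 */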
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5, r6}		// load args 4-5
	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #1
	add		ip, ip, lr, lsr #2
	movle		pc, ip			// computed goto if bytes < 112

	ands		r4, r5, #15		// preserves C flag
	teqcs		r5, r5			// set Z flag if not last iteration
	sub		ip, ip, lr, lsr #2
	movcc		pc, ip			// computed goto if bytes < 128

	movcc		pc, ip			// computed goto if bytes < 128

3:	adr		lr, .Lpermute_table + 16
	cmp		r5, #16			// Z flag remains cleared
	vtbl.8		d16, {q5}, d16
	vtbl.8		d17, {q5}, d17
	bcc		4f			// have to reload prev if r5 < 16
	vtbx.8		d10, {q2}, d18
	vtbx.8		d11, {q2}, d19
	mov		pc, ip			// branch back to VST sequence

	vshr.s8		q9, q9, #7		// create mask for VBIF
	vld1.8		{q8}, [r0]		// reload
ENDPROC(aesbs_ctr_encrypt)
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
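	/*
	 * Illustrative C model (added, not part of the original source):
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128) with
	 * the XTS polynomial x^128 + x^7 + x^2 + x + 1, treating the tweak
	 * as a little-endian 128-bit integer:
	 *
	 *	void next_tweak(u8 t[16])	// hypothetical C helper
	 *	{
	 *		int carry = t[15] & 0x80;
	 *		int i;
	 *
	 *		for (i = 15; i > 0; i--)
	 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
	 *		t[0] <<= 1;
	 *		if (carry)
	 *			t[0] ^= 0x87;
	 *	}
	 *
	 * which matches the 0x87 tweak mask composed in __xts_prepare8 below.
	 */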
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 */
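	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 *
	 *	aesbs_xts_encrypt(dst, src, bskey, rounds, nblocks, iv, 0);
	 *	aesbs_xts_decrypt(dst, src, bskey, rounds, nblocks, iv, reorder);
	 *
	 * iv carries the current tweak in and the next tweak out.  The
	 * reorder_last_tweak flag is hardcoded to zero on the encrypt side
	 * (see ENTRY(aesbs_xts_encrypt) below) and read from the stack on
	 * the decrypt side, presumably so that ciphertext-stealing callers
	 * can have the final two tweaks applied in swapped order.
	 */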
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	movlt		pc, ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13

	vst1.8		{q12}, [r4, :128]
	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)