/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro	__tbl, out, tbl, in, tmp
	.error	"__tbl needs temp register if out == tbl"
	vtbl.8	\out\()l, {\tbl}, \in\()l
	vtbl.8	\out\()h, {\tmp}, \in\()h
	vtbl.8	\out\()h, {\tbl}, \in\()h

	.macro	__ldr, out, sym
	vldr	\out\()h, \sym + 8
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2

	.macro	mul_gf4, x0, x1, y0, y1, t0, t1

	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3

	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			    t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask

	.macro	inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12
	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8

	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8

	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	vshl.u64	\t0, \t0, #\n
	vshl.u64	\t1, \t1, #\n

	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
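
	/*
	 * A minimal C model of the swapmove step used above (illustration
	 * only, not part of this file): it exchanges the bits of 'a' and 'b'
	 * selected by 'mask', at bit distance 'n'. Applied repeatedly with
	 * n = 1, 2 and 4 as in the bitslice macro, this transposes eight AES
	 * blocks into bit-sliced form, one bit position per register.
	 *
	 *	#include <stdint.h>
	 *
	 *	static inline void swapmove(uint64_t *a, uint64_t *b, int n,
	 *				    uint64_t mask)
	 *	{
	 *		uint64_t t = ((*b >> n) ^ *a) & mask;
	 *
	 *		*a ^= t;
	 *		*b ^= t << n;
	 *	}
	 */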
M0:	.quad	0x02060a0e03070b0f, 0x0004080c0105090d

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
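	/*
	 * Hedged C sketch (not the kernel glue code itself) of how a caller
	 * could produce the bit-sliced key schedule consumed by the routines
	 * below. aes_expandkey() and kernel_neon_begin()/kernel_neon_end()
	 * are generic kernel helpers; the buffer size follows the store
	 * pattern of this routine: a 16-byte round-0 key, 128 bytes per
	 * inner round, and a 16-byte final key (worst case 14 rounds).
	 *
	 *	struct crypto_aes_ctx rk;
	 *	int rounds = 6 + key_len / 4;		// 10, 12 or 14
	 *	u8 bskey[13 * 128 + 32];
	 *	int err = aes_expandkey(&rk, in_key, key_len);
	 *
	 *	if (!err) {
	 *		kernel_neon_begin();
	 *		aesbs_convert_key(bskey, rk.key_enc, rounds);
	 *		kernel_neon_end();
	 *	}
	 */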
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks

	vst1.8		{q7}, [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad	0x0a0e02060f03070b, 0x0004080c05090d01

aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1

SR:	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad	0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

ENDPROC(aesbs_encrypt8)
M0ISR:	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509

aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112	// rounds * 128 - 112: last round key
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1

ISR:	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad	0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
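	/*
	 * Hedged usage sketch (illustrative, not the kernel glue code): these
	 * routines loop over the input themselves, processing up to eight
	 * blocks per pass of the bit-sliced cipher, so a caller only hands
	 * over the whole run together with the key schedule produced by
	 * aesbs_convert_key(). The wrapper name below is hypothetical.
	 *
	 *	static void ecb_encrypt_all(u8 *dst, const u8 *src, int nbytes,
	 *				    const u8 *bskey, int rounds)
	 *	{
	 *		int blocks = nbytes / 16;	// AES_BLOCK_SIZE
	 *
	 *		kernel_neon_begin();
	 *		aesbs_ecb_encrypt(dst, src, bskey, rounds, blocks);
	 *		kernel_neon_end();
	 *	}
	 */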
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
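	/*
	 * Illustrative C model of CBC decryption (not part of this file, and
	 * sequential rather than eight blocks at a time): decryption can be
	 * parallelised because the value XORed into block i is simply the
	 * ciphertext of block i - 1, which is already known. The single-block
	 * helper aes_decrypt_one() is hypothetical.
	 *
	 *	#include <string.h>
	 *
	 *	static void cbc_decrypt_model(u8 *out, const u8 *in, int blocks,
	 *				      u8 iv[16])
	 *	{
	 *		u8 prev[16], ctext[16], tmp[16];
	 *		int i, j;
	 *
	 *		memcpy(prev, iv, 16);
	 *		for (i = 0; i < blocks; i++) {
	 *			memcpy(ctext, in + 16 * i, 16);	// out may alias in
	 *			aes_decrypt_one(tmp, ctext);	// hypothetical helper
	 *			for (j = 0; j < 16; j++)
	 *				out[16 * i + j] = tmp[j] ^ prev[j];
	 *			memcpy(prev, ctext, 16);
	 *		}
	 *		memcpy(iv, prev, 16);	// chain into the next call
	 *	}
	 */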
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
	.macro	next_ctr, q
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
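
	/*
	 * Illustrative C model (not part of this file) of what the counter
	 * handling amounts to, assuming the usual convention that the 16-byte
	 * CTR block is a 128-bit big-endian integer: increment the last byte
	 * and let the carry ripple towards the first one.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void ctr_increment_be128(uint8_t ctr[16])
	 *	{
	 *		int i;
	 *
	 *		for (i = 15; i >= 0; i--)
	 *			if (++ctr[i] != 0)
	 *				break;	// stop once a byte did not wrap
	 *	}
	 */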
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
	 */
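	/*
	 * Hedged usage sketch (illustrative, not the kernel glue code): for a
	 * whole number of blocks, 'final' is NULL and 'ctr' holds the counter
	 * block that this routine consumes, presumably advancing it for the
	 * caller's next invocation. Judging by the 'one extra block if
	 * final != 0' handling below, a non-NULL 'final' requests one
	 * additional block of output for a trailing partial block; how it is
	 * combined with the partial data is left to the glue code.
	 *
	 *	kernel_neon_begin();
	 *	aesbs_ctr_encrypt(dst, src, bskey, rounds, blocks, ctr, NULL);
	 *	kernel_neon_end();
	 */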
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0

	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

ENDPROC(aesbs_ctr_encrypt)
	.macro	next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
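
	/*
	 * C model of the next_tweak step above (illustration only, not part
	 * of this file): the 16-byte XTS tweak is treated as a little-endian
	 * 128-bit value and multiplied by x in GF(2^128), reducing modulo
	 * x^128 + x^7 + x^2 + x + 1; the 0x87 constant composed below is that
	 * reduction polynomial.
	 *
	 *	#include <stdint.h>
	 *	#include <string.h>
	 *
	 *	static void next_tweak_model(uint8_t out[16], const uint8_t in[16])
	 *	{
	 *		uint64_t lo, hi, carry;
	 *
	 *		memcpy(&lo, in, 8);		// assumes little-endian host
	 *		memcpy(&hi, in + 8, 8);
	 *
	 *		carry = hi >> 63;		// bit 127 set: reduce
	 *		hi = (hi << 1) | (lo >> 63);	// bit 63 carries into high half
	 *		lo = (lo << 1) ^ (carry * 0x87);
	 *
	 *		memcpy(out, &lo, 8);
	 *		memcpy(out + 8, &hi, 8);
	 *	}
	 */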
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 */
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	movlt		pc, ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13

	vst1.8		{q12}, [r4, :128]

	vst1.8		{q14}, [r7]		// store next iv

ENDPROC(__xts_prepare8)
	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args

	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)