/* SPDX-License-Identifier: GPL-2.0-only */
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
#include <linux/linkage.h>
#include <asm/assembler.h>
	.fpu		crypto-neon-fp-armv8
	.macro		enc_round, state, key
	aesmc.8		\state, \state
	.macro		dec_round, state, key
	aesimc.8	\state, \state
	.macro		enc_dround, key1, key2
	.macro		dec_dround, key1, key2
	.macro		enc_fround, key1, key2, key3
	.macro		dec_fround, key1, key2, key3
	.macro		enc_dround_4x, key1, key2
	.macro		dec_dround_4x, key1, key2
	.macro		enc_fround_4x, key1, key2, key3
	.macro		dec_fround_4x, key1, key2, key3
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	vld1.32		{q12-q13}, [ip]!
	vld1.32		{q10-q11}, [ip]!
	vld1.32		{q12-q13}, [ip]!
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
0:	\fround		q12, q13, q14
1:	\fround		q10, q11, q14
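	@ A reminder on the dispatch above: r3 carries the round count, which
	@ is 10, 12 or 14 for 128, 192 or 256 bit keys respectively, so the
	@ comparison with #12 decides how many of the remaining round keys
	@ still need to be loaded before the final-round macro is invoked.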
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
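	/*
	 * For reference, the single-block transform performed here is roughly
	 * equivalent to the following C sketch using the ACLE crypto
	 * intrinsics from <arm_neon.h> (illustrative only, not the code that
	 * is assembled; rk points to rounds + 1 round keys):
	 *
	 *	uint8x16_t aes_encrypt_sketch(uint8x16_t block,
	 *				      const uint8x16_t *rk, int rounds)
	 *	{
	 *		int i;
	 *
	 *		for (i = 0; i < rounds - 1; i++)
	 *			block = vaesmcq_u8(vaeseq_u8(block, rk[i]));
	 *		block = vaeseq_u8(block, rk[rounds - 1]);
	 *		return veorq_u8(block, rk[rounds]);
	 *	}
	 */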
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround, enc_fround
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
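	@ The key schedule holds (\rounds + 1) round keys of 16 bytes each, so
	@ \rk + (\rounds << 4) computed above is the address of the last one.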
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
ENTRY(ce_aes_ecb_encrypt)
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
ENDPROC(ce_aes_ecb_encrypt)
ENTRY(ce_aes_ecb_decrypt)
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
ENDPROC(ce_aes_ecb_decrypt)
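	/*
	 * These routines use the NEON register file, so the C glue code is
	 * expected to call them inside a kernel_neon_begin()/kernel_neon_end()
	 * section, along the lines of the sketch below (illustrative only;
	 * 'ctx' and num_rounds() are stand-ins for whatever the caller uses
	 * to hold the expanded key):
	 *
	 *	kernel_neon_begin();
	 *	ce_aes_ecb_encrypt(dst, src, ctx->key_enc, num_rounds(ctx), blocks);
	 *	kernel_neon_end();
	 */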
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
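	/*
	 * CBC chains every plaintext block through the previous ciphertext
	 * block (or the iv for the first one). Rough C sketch of what the
	 * encrypt path computes, one 16-byte block per iteration, using
	 * hypothetical helper names:
	 *
	 *	for (i = 0; i < blocks; i++) {
	 *		xor_block(tmp, in + 16 * i, iv);
	 *		aes_encrypt_block(out + 16 * i, tmp, rk, rounds);
	 *		memcpy(iv, out + 16 * i, 16);
	 *	}
	 */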
ENTRY(ce_aes_cbc_encrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
ENDPROC(ce_aes_cbc_encrypt)
ENTRY(ce_aes_cbc_decrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		q6, q14			@ preserve last round key
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vst1.8		{q15}, [r5]		@ return iv
ENDPROC(ce_aes_cbc_decrypt)
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
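	/*
	 * Ciphertext stealing (the CS3 variant used by the kernel's "cts"
	 * template) lets CBC handle a message whose length is not a multiple
	 * of 16: the final short block is zero padded and encrypted as one
	 * more CBC block, and the last two ciphertext blocks are then emitted
	 * in swapped order, with the penultimate one truncated to the length
	 * of the partial block. Rough sketch of the encrypt case, assuming
	 * 16 < bytes <= 32 and a hypothetical single block helper:
	 *
	 *	u8 e[16], c[16], pad[16] = { };
	 *
	 *	cbc_encrypt_block(e, in, iv, rk, rounds);	// E(P0 ^ iv)
	 *	memcpy(pad, in + 16, bytes - 16);		// zero padded P1
	 *	cbc_encrypt_block(c, pad, e, rk, rounds);	// E(pad ^ e)
	 *	memcpy(out, c, 16);				// full block first
	 *	memcpy(out + 16, e, bytes - 16);		// truncated block
	 */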
ENTRY(ce_aes_cbc_cts_encrypt)
	ldrd		r4, r5, [sp, #16]
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [r5]		@ get iv
	veor		q0, q0, q1		@ xor with iv
	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_cbc_cts_encrypt)
ENTRY(ce_aes_cbc_cts_decrypt)
	ldrd		r4, r5, [sp, #16]
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [r5]		@ get iv
	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13
	veor		q0, q0, q3		@ xor with iv
	vst1.8		{q1}, [r4]		@ overlapping stores
ENDPROC(ce_aes_cbc_cts_decrypt)
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
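	/*
	 * CTR mode turns the block cipher into a stream cipher: each 16 byte
	 * slice of keystream is the encryption of the current counter value,
	 * which is then incremented as a 128-bit big-endian integer. Rough
	 * per-block sketch with hypothetical helper names (the code below
	 * actually interleaves four blocks at a time):
	 *
	 *	for (i = 0; i < blocks; i++) {
	 *		aes_encrypt_block(ks, ctr, rk, rounds);
	 *		xor_block(out + 16 * i, in + 16 * i, ks);
	 *		be128_inc(ctr);
	 *	}
	 */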
ENTRY(ce_aes_ctr_encrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	vmov		r6, s31			@ keep swabbed ctr in r6
	cmn		r6, r4			@ 32 bit overflow?
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	vmov		s31, ip			@ set lane 3 of q1 via q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	vld1.8		{q4-q5}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	adds		r6, r6, #1		@ increment BE ctr
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vst1.8		{q7}, [r5]		@ return next CTR value
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
ENDPROC(ce_aes_ctr_encrypt)
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
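	/*
	 * XTS processes each block as E_K1(P ^ T) ^ T, where the tweak T is
	 * E_K2(iv) for the first block and is multiplied by x in GF(2^128)
	 * for every subsequent block. Rough sketch of the encrypt direction
	 * with hypothetical helper names:
	 *
	 *	aes_encrypt_block(t, iv, rk2, rounds);	// initial tweak
	 *	for (i = 0; i < blocks; i++) {
	 *		xor_block(buf, in + 16 * i, t);
	 *		aes_encrypt_block(buf, buf, rk1, rounds);
	 *		xor_block(out + 16 * i, buf, t);
	 *		next_tweak(t);			// multiply by x
	 *	}
	 */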
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
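	/*
	 * The macro above multiplies the tweak by x in GF(2^128), reduced by
	 * x^128 + x^7 + x^2 + x + 1: shift the 128-bit value left by one bit
	 * and xor 0x87 into the low byte if a bit was shifted out. A C sketch
	 * on two 64-bit halves (little-endian block layout, hypothetical name):
	 *
	 *	void next_tweak(u64 t[2])
	 *	{
	 *		u64 carry = (u64)((s64)t[1] >> 63);
	 *
	 *		t[1] = (t[1] << 1) | (t[0] >> 63);
	 *		t[0] = (t[0] << 1) ^ (carry & 0x87);
	 *	}
	 */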
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vshr.u64	d30, d31, #7
	ldrd		r4, r5, [sp, #16]	@ load args
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
ENTRY(ce_aes_xts_encrypt)
	bl		ce_aes_xts_init		@ run shared prologue
	teq		r6, #0			@ start of a block?
	next_tweak	q4, q4, q15, q10
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	next_tweak	q6, q5, q15, q10
	next_tweak	q7, q6, q15, q10
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	next_tweak	q4, q4, q15, q6
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		r4, r0, r4		@ output address of final block
	vld1.8		{q1}, [r1]		@ load final partial block
	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_xts_encrypt)
ENTRY(ce_aes_xts_decrypt)
	bl		ce_aes_xts_init		@ run shared prologue
	/* subtract 16 bytes if we are doing CTS */
	teq		r6, #0			@ start of a block?
	next_tweak	q4, q4, q15, q10
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	next_tweak	q6, q5, q15, q10
	next_tweak	q7, q6, q15, q10
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	next_tweak	q4, q4, q15, q6
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		r4, r0, r4		@ output address of final block
	next_tweak	q5, q4, q15, q6
	vld1.8		{q1}, [r1]		@ load final partial block
	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_xts_decrypt)
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
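	/*
	 * aese with an all-zero round key computes ShiftRows(SubBytes(state)),
	 * and broadcasting the input word to all four lanes makes the
	 * ShiftRows permutation invisible in lane 0, so lane 0 of the result
	 * is simply the input with the sbox applied to each byte. Intrinsics
	 * sketch (<arm_neon.h>, illustrative only):
	 *
	 *	u32 ce_aes_sub_sketch(u32 input)
	 *	{
	 *		uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(input));
	 *
	 *		v = vaeseq_u8(v, vdupq_n_u8(0));
	 *		return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
	 *	}
	 */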
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
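	/*
	 * This is the aesimc instruction applied to one round key, as used
	 * when converting an encryption key schedule into the decryption
	 * schedule for the equivalent inverse cipher. Intrinsics sketch
	 * (<arm_neon.h>, illustrative only):
	 *
	 *	void ce_aes_invert_sketch(u8 *dst, const u8 *src)
	 *	{
	 *		vst1q_u8(dst, vaesimcq_u8(vld1q_u8(src)));
	 *	}
	 */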
ENDPROC(ce_aes_invert)
	.section	".rodata", "a"
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
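	/*
	 * The 0xff entries yield zero bytes when used as vtbl indices (an out
	 * of range index produces 0, while vtbx leaves the destination byte
	 * unchanged), so loading a 16 byte window from this table at an
	 * offset derived from the residual length produces the byte-select
	 * masks used for the overlapping loads and stores above.
	 */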