/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8
	.align		3
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
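	@ aese.8 performs AddRoundKey+SubBytes+ShiftRows and aesmc.8 the
	@ MixColumns step (aesd.8/aesimc.8 are the inverses). Keeping each
	@ pair adjacent matters: many ARMv8 implementations fuse an
	@ AESE/AESMC (or AESD/AESIMC) pair into a single macro-op when they
	@ target the same register.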
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
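	@ The final AES round has no MixColumns, so the last aese.8/aesd.8 is
	@ not followed by aesmc.8/aesimc.8; the last round key (\key3) is
	@ simply xor'ed in with veor.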
	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
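	@ The _4x variants process q0-q3 in parallel so that four independent
	@ aese/aesmc dependency chains are in flight at once, hiding the
	@ latency of the crypto instructions on cores that can issue them
	@ back to back.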
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
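	@ Example (AES-128, r3 == 10): q8/q9 hold round keys 0-1 and q14 the
	@ last one; ip walks keys 2-9 in pairs, so the four \dround
	@ invocations cover rounds 1-8 and the \fround at 0: applies rounds
	@ 9-10 plus the final AddRoundKey. AES-192 exits at 1: after one more
	@ key pair; AES-256 falls through and consumes all 14 round keys.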
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q3,
	 * q10 - q13 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
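	@ Typical call sequence from the mode routines below (sketch):
	@	prepare_key	r2, r3		@ q8/q9/q14 <- first/last keys
	@	vld1.8		{q0}, [r1]!	@ load one input block
	@	bl		aes_encrypt	@ q0 <- AES-ENC(rk, q0)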
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm
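	@ The rk[] array holds (rounds + 1) round keys of 16 bytes each, so
	@ e.g. for AES-128 (rounds == 10) the last round key lives at
	@ rk + (10 << 4) == rk + 160, which is what the add above computes.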
	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
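	@ The ECB loops above batch four blocks at a time through the _4x
	@ core and fall back to a single-block loop for the remainder; the
	@ same structure recurs in the CBC-decrypt and CTR routines below.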
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ... and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
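	@ CBC encryption is inherently sequential (each ciphertext block is
	@ chained into the next input), so unlike the other modes there is no
	@ 4x interleaved fast path here.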
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0			@ stash the ct blocks: they
	vmov		q5, q1			@ are the chaining values for
	vmov		q6, q2			@ the next four blocks
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0			@ preserve current ct block
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ return next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
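	@ The single-block loop above avoids a separate xor with the previous
	@ ciphertext: q14 is replaced by (last round key ^ prev ct), so the
	@ final veor inside dec_fround produces the plaintext directly.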
	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 */
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ permute vectors for the
	vld1.8		{q6}, [lr]		@ final two blocks

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]		@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1		@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)
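	@ The two vectors loaded from .Lcts_permute_table turn vtbl/vtbx into
	@ byte-wise selects: an out-of-range index (0xff) yields zero for
	@ vtbl and leaves the vtbx destination untouched, so the final two
	@ blocks can be swapped and zero-padded without any conditional code.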
ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ permute vectors for the
	vld1.8		{q6}, [lr]		@ final two blocks

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]		@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3		@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)
	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q7
	rev		ip, r6
	add		r6, r6, #1
	vmov		q1, q7
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q7
	vmov		s11, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		q3, q7
	vmov		s15, ip
	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
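	@ The counter is kept big-endian in q7; the fast path only increments
	@ its low 32-bit word (hence the rev/vmov shuffling through r6, ip
	@ and s31). The cmn up front detects whether that word can overflow
	@ within this call, in which case the single-block loop is used and
	@ .Lctrcarry ripples the carry into the upper words via s30-s28.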
	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *		      int rounds, int bytes, u8 iv[], u32 const rk2[],
	 *		      int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *		      int rounds, int bytes, u8 iv[], u32 const rk2[],
	 *		      int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
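	@ Multiplies the 128-bit tweak in \in by x in GF(2^128), reducing by
	@ x^128 + x^7 + x^2 + x + 1: vadd doubles each 64-bit half, while
	@ \tmp (each half's sign bit smeared by vshr, masked by \const =
	@ { 1, 0x87 } as composed in ce_aes_xts_init below, and half-swapped
	@ by vext) xors the carry out of one half into the other: 1 into the
	@ high half, 0x87 (the reduction) into the low half.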
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
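	@ Shared prologue for the XTS routines below: it leaves the tweak
	@ mask in q15 and the (possibly freshly encrypted) tweak in q0. The
	@ tail call enters aes_encrypt past its own "add ip" so that the
	@ round keys of key 2, already set up here, are the ones used.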
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3			@ last ct block from 4x loop
	sub		r0, r0, #16		@ rewind output pointer
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)
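	@ Ciphertext stealing on the encrypt side: when the byte count is not
	@ a multiple of 16, the last full ciphertext block is held back (or
	@ recovered from q3 after the 4x loop), merged with the final partial
	@ plaintext block via the permute table, and sent back through
	@ .Lxtsencctsout with r4 == 0 for one more encryption.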
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)
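	@ On the decrypt side the final two blocks swap tweaks: the last full
	@ ciphertext block is decrypted with the *next* tweak (q5) first, and
	@ the reassembled block then goes back through .Lxtsdecctsout using
	@ the current tweak still held in q4. The up-front subtraction of 16
	@ bytes keeps that block out of the 4x loop.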
	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
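	@ With q0 zeroed, aese reduces to SubBytes(ShiftRows(q1)); since the
	@ input word is duplicated into all four lanes, every row of the AES
	@ state is constant and ShiftRows becomes a no-op, so lane 0 comes
	@ back as the plain sbox substitution of the input word. Both helpers
	@ serve the key expansion code on the C side.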
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
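	@ Sliding a 16-byte window across this table yields vtbl/vtbx index
	@ vectors that shift a block left or right by 1-15 bytes; the 0xff
	@ entries mark bytes to zero out (vtbl) or leave untouched (vtbx),
	@ which is what makes the overlapping CTS loads and stores above work.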