2 @ ====================================================================
3 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4 @ project. The module is, however, dual licensed under OpenSSL and
5 @ CRYPTOGAMS licenses depending on where you obtain it. For further
6 @ details see http://www.openssl.org/~appro/cryptogams/.
8 @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
9 @ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
@ granted.
11 @ ====================================================================
13 @ Bit-sliced AES for ARM NEON
17 @ This implementation is a direct adaptation of the bsaes-x86_64 module
18 @ for ARM NEON, except that this module is endian-neutral [in the sense
19 @ that it can be compiled for either endianness] courtesy of vld1.8's
20 @ neutrality. The initial version doesn't implement an interface to
21 @ OpenSSL, only low-level primitives and unsupported entry points, just
22 @ enough to collect performance results, which for the Cortex-A8 core are:
24 @ encrypt 19.5 cycles per byte processed with 128-bit key
25 @ decrypt 22.1 cycles per byte processed with 128-bit key
26 @ key conv. 440 cycles per 128-bit key/0.18 of 8x block
28 @ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
29 @ which is [much] worse than anticipated (for further details see
30 @ http://www.openssl.org/~appro/Snapdragon-S4.html).
32 @ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
33 @ manages in 20.0 cycles].
35 @ When comparing to x86_64 results keep in mind that NEON unit is
36 @ [mostly] single-issue and thus can't [fully] benefit from
37 @ instruction-level parallelism. And when comparing to aes-armv4
38 @ results keep in mind key schedule conversion overhead (see
39 @ bsaes-x86_64.pl for further details)...
45 @ Add CBC, CTR and XTS subroutines, adapt for kernel use.
47 @ <ard.biesheuvel@linaro.org>
51 # include "arm_arch.h"
53 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
54 # define VFP_ABI_POP vldmia sp!,{d8-d15}
55 # define VFP_ABI_FRAME 0x40
59 # define VFP_ABI_FRAME 0
60 # define BSAES_ASM_EXTENDED_KEY
61 # define XTS_CHAIN_TWEAK
62 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
71 .syntax unified @ ARMv7-capable assembler is expected to handle this
80 .type _bsaes_decrypt8,%function
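@
@ _bsaes_decrypt8 decrypts eight AES blocks held in q0-q7 in parallel.
@ On entry r4 points at the bit-sliced key schedule and r5 holds the
@ number of rounds; r4-r6 and q8-q15 are clobbered, and the results are
@ left in q0-q7 in a permuted register order (see how the callers pick
@ them up).  _bsaes_encrypt8 further down follows the same convention.
@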
83 adr r6,_bsaes_decrypt8
84 vldmia r4!, {q9} @ round 0 key
85 add r6,r6,#.LM0ISR-_bsaes_decrypt8
87 vldmia r6!, {q8} @ .LM0ISR
88 veor q10, q0, q9 @ xor with round0 key
100 vtbl.8 d7, {q13}, d17
102 vtbl.8 d8, {q14}, d16
103 vtbl.8 d9, {q14}, d17
105 vtbl.8 d10, {q15}, d16
106 vtbl.8 d11, {q15}, d17
108 vtbl.8 d12, {q10}, d16
109 vtbl.8 d13, {q10}, d17
110 vtbl.8 d14, {q11}, d16
111 vtbl.8 d15, {q11}, d17
112 vmov.i8 q8,#0x55 @ compose .LBS0
113 vmov.i8 q9,#0x33 @ compose .LBS1
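@
@ The mask/shift/xor sequences below are the classic "swapmove" bit
@ transposition (as in bsaes-x86_64.pl): with masks 0x55, 0x33, 0x0f
@ and shifts of 1, 2 and 4 bits, the eight 128-bit states in q0-q7 are
@ rearranged so that each register ends up holding one bit position of
@ every byte, i.e. the data becomes bit-sliced.  A rough C sketch of a
@ single swapmove step (illustrative only; the NEON code works on
@ 64-bit lanes):
@
@	t  = ((a >> shift) ^ b) & mask;
@	b ^= t;
@	a ^= t << shift;
@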
121 vshl.u64 q10, q10, #1
123 vshl.u64 q11, q11, #1
133 vshl.u64 q10, q10, #1
135 vshl.u64 q11, q11, #1
138 vmov.i8 q8,#0x0f @ compose .LBS2
146 vshl.u64 q10, q10, #2
148 vshl.u64 q11, q11, #2
158 vshl.u64 q10, q10, #2
160 vshl.u64 q11, q11, #2
170 vshl.u64 q10, q10, #4
172 vshl.u64 q11, q11, #4
182 vshl.u64 q10, q10, #4
184 vshl.u64 q11, q11, #4
202 vtbl.8 d4, {q10}, d24
203 vtbl.8 d5, {q10}, d25
205 vtbl.8 d6, {q11}, d24
206 vtbl.8 d7, {q11}, d25
213 vtbl.8 d10, {q9}, d24
214 vtbl.8 d11, {q9}, d25
216 vtbl.8 d12, {q10}, d24
217 vtbl.8 d13, {q10}, d25
218 vtbl.8 d14, {q11}, d24
219 vtbl.8 d15, {q11}, d25
272 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
274 @ new smaller inversion
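@
@ (Inv)SubBytes is evaluated on the bit-sliced state as a boolean
@ circuit: the shared GF(2^8) inversion is decomposed through the
@ GF(((2^2)^2)^2) tower field (hence the GF(16) inversion above),
@ following the formulas of bsaes-x86_64.pl.
@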
281 veor q14, q8, q14 @ q14=q15
370 @ multiplication by 0x05-0x00-0x04-0x00
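@
@ InvMixColumns is factored as a multiplication of each column by the
@ polynomial 05+04*x^2 followed by the regular MixColumns, since
@   (0e+09x+0dx^2+0bx^3) = (02+01x+01x^2+03x^3)*(05+04x^2) mod (x^4+1)
@ over GF(2^8).  The MixColumns part further below then needs only
@ 32- and 64-bit word rotations (vext) and XORs of the bit-sliced rows.
@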
371 vext.8 q8, q0, q0, #8
372 vext.8 q14, q3, q3, #8
373 vext.8 q15, q5, q5, #8
375 vext.8 q9, q1, q1, #8
377 vext.8 q10, q6, q6, #8
379 vext.8 q11, q4, q4, #8
381 vext.8 q12, q2, q2, #8
383 vext.8 q13, q7, q7, #8
402 vext.8 q8, q0, q0, #12 @ x0 <<< 32
403 vext.8 q9, q1, q1, #12
404 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
405 vext.8 q10, q6, q6, #12
407 vext.8 q11, q4, q4, #12
409 vext.8 q12, q2, q2, #12
411 vext.8 q13, q7, q7, #12
413 vext.8 q14, q3, q3, #12
415 vext.8 q15, q5, q5, #12
420 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
424 vext.8 q1, q1, q1, #8
429 vext.8 q8, q2, q2, #8
431 vext.8 q9, q7, q7, #8
433 vext.8 q2, q4, q4, #8
435 vext.8 q7, q5, q5, #8
437 vext.8 q4, q3, q3, #8
439 vext.8 q3, q6, q6, #8
448 vldmia r6, {q12} @ .LISR
449 ite eq @ Thumb2 thing, sanity check in ARM
452 vldmia r6, {q12} @ .LISRM0
456 vmov.i8 q8,#0x55 @ compose .LBS0
457 vmov.i8 q9,#0x33 @ compose .LBS1
465 vshl.u64 q10, q10, #1
467 vshl.u64 q11, q11, #1
477 vshl.u64 q10, q10, #1
479 vshl.u64 q11, q11, #1
482 vmov.i8 q8,#0x0f @ compose .LBS2
490 vshl.u64 q10, q10, #2
492 vshl.u64 q11, q11, #2
502 vshl.u64 q10, q10, #2
504 vshl.u64 q11, q11, #2
514 vshl.u64 q10, q10, #4
516 vshl.u64 q11, q11, #4
526 vshl.u64 q10, q10, #4
528 vshl.u64 q11, q11, #4
531 vldmia r4, {q8} @ last round key
541 .size _bsaes_decrypt8,.-_bsaes_decrypt8
543 .type _bsaes_const,%object
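@
@ The 16-byte constants below are vtbl.8 index vectors: the .LM0*
@ tables combine the initial byte reordering needed for bit-slicing
@ with (Inv)ShiftRows, .LSR/.LISR apply (Inv)ShiftRows between rounds,
@ and .LREVM0SR additionally byte-reverses the 32-bit counter words for
@ the CTR path (see bsaes_ctr32_encrypt_blocks).
@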
546 .LM0ISR: @ InvShiftRows constants
547 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
549 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
551 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
552 .LM0SR: @ ShiftRows constants
553 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
555 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
557 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
559 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
561 .quad 0x090d01050c000408, 0x03070b0f060a0e02
562 .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
564 .size _bsaes_const,.-_bsaes_const
566 .type _bsaes_encrypt8,%function
569 adr r6,_bsaes_encrypt8
570 vldmia r4!, {q9} @ round 0 key
571 sub r6,r6,#_bsaes_encrypt8-.LM0SR
573 vldmia r6!, {q8} @ .LM0SR
575 veor q10, q0, q9 @ xor with round0 key
577 vtbl.8 d0, {q10}, d16
578 vtbl.8 d1, {q10}, d17
580 vtbl.8 d2, {q11}, d16
581 vtbl.8 d3, {q11}, d17
583 vtbl.8 d4, {q12}, d16
584 vtbl.8 d5, {q12}, d17
586 vtbl.8 d6, {q13}, d16
587 vtbl.8 d7, {q13}, d17
589 vtbl.8 d8, {q14}, d16
590 vtbl.8 d9, {q14}, d17
592 vtbl.8 d10, {q15}, d16
593 vtbl.8 d11, {q15}, d17
595 vtbl.8 d12, {q10}, d16
596 vtbl.8 d13, {q10}, d17
597 vtbl.8 d14, {q11}, d16
598 vtbl.8 d15, {q11}, d17
599 _bsaes_encrypt8_bitslice:
600 vmov.i8 q8,#0x55 @ compose .LBS0
601 vmov.i8 q9,#0x33 @ compose .LBS1
609 vshl.u64 q10, q10, #1
611 vshl.u64 q11, q11, #1
621 vshl.u64 q10, q10, #1
623 vshl.u64 q11, q11, #1
626 vmov.i8 q8,#0x0f @ compose .LBS2
634 vshl.u64 q10, q10, #2
636 vshl.u64 q11, q11, #2
646 vshl.u64 q10, q10, #2
648 vshl.u64 q11, q11, #2
658 vshl.u64 q10, q10, #4
660 vshl.u64 q11, q11, #4
670 vshl.u64 q10, q10, #4
672 vshl.u64 q11, q11, #4
690 vtbl.8 d4, {q10}, d24
691 vtbl.8 d5, {q10}, d25
693 vtbl.8 d6, {q11}, d24
694 vtbl.8 d7, {q11}, d25
701 vtbl.8 d10, {q9}, d24
702 vtbl.8 d11, {q9}, d25
704 vtbl.8 d12, {q10}, d24
705 vtbl.8 d13, {q10}, d25
706 vtbl.8 d14, {q11}, d24
707 vtbl.8 d15, {q11}, d25
761 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
763 @ new smaller inversion
770 veor q14, q8, q14 @ q14=q15
857 vext.8 q8, q0, q0, #12 @ x0 <<< 32
858 vext.8 q9, q1, q1, #12
859 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
860 vext.8 q10, q4, q4, #12
862 vext.8 q11, q6, q6, #12
864 vext.8 q12, q3, q3, #12
866 vext.8 q13, q7, q7, #12
868 vext.8 q14, q2, q2, #12
870 vext.8 q15, q5, q5, #12
875 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
879 vext.8 q1, q1, q1, #8
884 vext.8 q8, q3, q3, #8
886 vext.8 q9, q7, q7, #8
888 vext.8 q3, q6, q6, #8
890 vext.8 q7, q5, q5, #8
892 vext.8 q6, q2, q2, #8
894 vext.8 q2, q4, q4, #8
903 vldmia r6, {q12} @ .LSR
904 ite eq @ Thumb2 thing, sanity check in ARM
907 vldmia r6, {q12} @ .LSRM0
911 vmov.i8 q8,#0x55 @ compose .LBS0
912 vmov.i8 q9,#0x33 @ compose .LBS1
920 vshl.u64 q10, q10, #1
922 vshl.u64 q11, q11, #1
932 vshl.u64 q10, q10, #1
934 vshl.u64 q11, q11, #1
937 vmov.i8 q8,#0x0f @ compose .LBS2
945 vshl.u64 q10, q10, #2
947 vshl.u64 q11, q11, #2
957 vshl.u64 q10, q10, #2
959 vshl.u64 q11, q11, #2
969 vshl.u64 q10, q10, #4
971 vshl.u64 q11, q11, #4
981 vshl.u64 q10, q10, #4
983 vshl.u64 q11, q11, #4
986 vldmia r4, {q8} @ last round key
996 .size _bsaes_encrypt8,.-_bsaes_encrypt8
997 .type _bsaes_key_convert,%function
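@
@ _bsaes_key_convert turns a standard AES key schedule (pointed to by
@ r4, r5 rounds) into the bit-sliced representation consumed by
@ _bsaes_encrypt8/_bsaes_decrypt8, writing it out at r12.  On return
@ q15 holds the untouched last round key and q7 the constant 0x63
@ (the S-box affine constant); the callers apply the final fix-ups
@ (see the "fix up ... key" veor instructions after each call).
@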
1000 adr r6,_bsaes_key_convert
1001 vld1.8 {q7}, [r4]! @ load round 0 key
1002 sub r6,r6,#_bsaes_key_convert-.LM0
1003 vld1.8 {q15}, [r4]! @ load round 1 key
1005 vmov.i8 q8, #0x01 @ bit masks
1011 vldmia r6, {q14} @ .LM0
1018 vstmia r12!, {q7} @ save round 0 key
1023 vtbl.8 d14,{q15},d28
1024 vtbl.8 d15,{q15},d29
1036 vld1.8 {q15}, [r4]! @ load next round key
1037 vmvn q0, q0 @ "pnot"
1045 vstmia r12!,{q0-q7} @ write bit-sliced round key
1048 vmov.i8 q7,#0x63 @ compose .L63
1049 @ don't save last round key
1051 .size _bsaes_key_convert,.-_bsaes_key_convert
1052 .extern AES_cbc_encrypt
1055 .global bsaes_cbc_encrypt
1056 .hidden bsaes_cbc_encrypt
1057 .type bsaes_cbc_encrypt,%function
1071 @ it is up to the caller to make sure we are called with enc == 0
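@
@ Register/stack usage: r0 = in, r1 = out, r2 = length in bytes (only
@ whole 16-byte blocks are processed), r3 = expanded AES key,
@ [sp] = IV pointer (the IV is updated on return).  A plausible C
@ prototype, assuming the usual OpenSSL-style argument order (names
@ illustrative; the trailing enc flag is never examined here):
@
@	void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out,
@			       size_t length, const void *key,
@			       uint8_t ivec[16], int enc);
@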
1074 stmdb sp!, {r4-r10, lr}
1076 ldr r8, [ip] @ IV is 1st arg on the stack
1077 mov r2, r2, lsr#4 @ len in 16 byte blocks
1078 sub sp, #0x10 @ scratch space to carry over the IV
1079 mov r9, sp @ save sp
1081 ldr r10, [r3, #240] @ get # of rounds
1082 #ifndef BSAES_ASM_EXTENDED_KEY
1083 @ allocate the key schedule on the stack
1084 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1085 add r12, #96 @ size of bit-sliced key schedule
1087 @ populate the key schedule
1088 mov r4, r3 @ pass key
1089 mov r5, r10 @ pass # of rounds
1090 mov sp, r12 @ sp now points at the key schedule
1091 bl _bsaes_key_convert
1093 vstmia r12, {q15} @ save last round key
1094 veor q7, q7, q6 @ fix up round 0 key
1101 @ populate the key schedule
1103 mov r4, r3 @ pass key
1104 mov r5, r10 @ pass # of rounds
1105 add r12, r3, #248 @ pass key schedule
1106 bl _bsaes_key_convert
1109 vstmia r12, {q15} @ save last round key
1110 veor q7, q7, q6 @ fix up round 0 key
1117 vld1.8 {q15}, [r8] @ load IV
1123 bmi .Lcbc_dec_loop_finish
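@
@ Main loop: each iteration loads eight ciphertext blocks into q0-q7,
@ decrypts them with _bsaes_decrypt8, then reloads the same eight
@ blocks (plus the saved IV) and XORs them in to complete CBC; the
@ last ciphertext block, kept in q15, becomes the IV for the next
@ iteration.
@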
1125 vld1.8 {q0-q1}, [r0]! @ load input
1126 vld1.8 {q2-q3}, [r0]!
1127 #ifndef BSAES_ASM_EXTENDED_KEY
1128 mov r4, sp @ pass the key
1132 vld1.8 {q4-q5}, [r0]!
1134 vld1.8 {q6-q7}, [r0]
1136 vstmia r9, {q15} @ put aside IV
1140 vldmia r9, {q14} @ reload IV
1141 vld1.8 {q8-q9}, [r0]! @ reload input
1142 veor q0, q0, q14 @ ^= IV
1143 vld1.8 {q10-q11}, [r0]!
1146 vld1.8 {q12-q13}, [r0]!
1149 vld1.8 {q14-q15}, [r0]!
1151 vst1.8 {q0-q1}, [r1]! @ write output
1163 .Lcbc_dec_loop_finish:
1167 vld1.8 {q0}, [r0]! @ load input
1171 #ifndef BSAES_ASM_EXTENDED_KEY
1172 mov r4, sp @ pass the key
1177 vstmia r9, {q15} @ put aside IV
1194 vldmia r9, {q14} @ reload IV
1195 vld1.8 {q8-q9}, [r0]! @ reload input
1196 veor q0, q0, q14 @ ^= IV
1197 vld1.8 {q10-q11}, [r0]!
1200 vld1.8 {q12-q13}, [r0]!
1205 vst1.8 {q0-q1}, [r1]! @ write output
1217 vldmia r9,{q14} @ reload IV
1218 vld1.8 {q8-q9}, [r0]! @ reload input
1219 veor q0, q0, q14 @ ^= IV
1220 vld1.8 {q10-q11}, [r0]!
1228 vst1.8 {q0-q1}, [r1]! @ write output
1238 vldmia r9, {q14} @ reload IV
1239 vld1.8 {q8-q9}, [r0]! @ reload input
1240 veor q0, q0, q14 @ ^= IV
1241 vld1.8 {q10-q11}, [r0]!
1246 vst1.8 {q0-q1}, [r1]! @ write output
1256 vldmia r9, {q14} @ reload IV
1257 vld1.8 {q8-q9}, [r0]! @ reload input
1258 veor q0, q0, q14 @ ^= IV
1264 vst1.8 {q0-q1}, [r1]! @ write output
1272 vldmia r9, {q14} @ reload IV
1273 vld1.8 {q8-q9}, [r0]! @ reload input
1274 veor q0, q0, q14 @ ^= IV
1278 vst1.8 {q0-q1}, [r1]! @ write output
1285 vldmia r9, {q14} @ reload IV
1286 vld1.8 {q8}, [r0]! @ reload input
1287 veor q0, q0, q14 @ ^= IV
1288 vld1.8 {q15}, [r0]! @ reload input
1290 vst1.8 {q0-q1}, [r1]! @ write output
1295 mov r10, r1 @ save original out pointer
1296 mov r1, r9 @ use the iv scratch space as out buffer
1298 vmov q4,q15 @ just in case ensure that IV
1299 vmov q5,q0 @ and input are preserved
1301 vld1.8 {q0}, [r9,:64] @ load result
1302 veor q0, q0, q4 @ ^= IV
1303 vmov q15, q5 @ q5 holds input
1304 vst1.8 {q0}, [r10] @ write output
1307 #ifndef BSAES_ASM_EXTENDED_KEY
1310 .Lcbc_dec_bzero: @ wipe key schedule [if any]
1317 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1318 vst1.8 {q15}, [r8] @ return IV
1320 ldmia sp!, {r4-r10, pc}
1321 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1323 .global bsaes_ctr32_encrypt_blocks
1324 .hidden bsaes_ctr32_encrypt_blocks
1325 .type bsaes_ctr32_encrypt_blocks,%function
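@
@ Register/stack usage: r0 = in, r1 = out, r2 = number of 16-byte
@ blocks (not bytes), r3 = expanded AES key, [sp] = pointer to the
@ 16-byte counter block.  Only the last (big-endian) 32-bit word of
@ the counter is incremented, hence "ctr32".  A plausible C prototype
@ (assumption; names illustrative):
@
@	void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
@					size_t blocks, const void *key,
@					const uint8_t ivec[16]);
@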
1327 bsaes_ctr32_encrypt_blocks:
1328 cmp r2, #8 @ use plain AES for
1329 blo .Lctr_enc_short @ small sizes
1332 stmdb sp!, {r4-r10, lr}
1334 ldr r8, [ip] @ ctr is 1st arg on the stack
1335 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1336 mov r9, sp @ save sp
1338 ldr r10, [r3, #240] @ get # of rounds
1339 #ifndef BSAES_ASM_EXTENDED_KEY
1340 @ allocate the key schedule on the stack
1341 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1342 add r12, #96 @ size of bit-sliced key schedule
1344 @ populate the key schedule
1345 mov r4, r3 @ pass key
1346 mov r5, r10 @ pass # of rounds
1347 mov sp, r12 @ sp now points at the key schedule
1348 bl _bsaes_key_convert
1349 veor q7,q7,q15 @ fix up last round key
1350 vstmia r12, {q7} @ save last round key
1352 vld1.8 {q0}, [r8] @ load counter
1353 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1354 vldmia sp, {q4} @ load round0 key
1360 @ populate the key schedule
1362 mov r4, r3 @ pass key
1363 mov r5, r10 @ pass # of rounds
1364 add r12, r3, #248 @ pass key schedule
1365 bl _bsaes_key_convert
1366 veor q7,q7,q15 @ fix up last round key
1367 vstmia r12, {q7} @ save last round key
1370 0: add r12, r3, #248
1371 vld1.8 {q0}, [r8] @ load counter
1372 adrl r8, .LREVM0SR @ borrow r8
1373 vldmia r12, {q4} @ load round0 key
1374 sub sp, #0x10 @ place for adjusted round0 key
1377 vmov.i32 q8,#1 @ compose 1<<96
1382 vadd.u32 q9,q8,q8 @ compose 2<<96
1383 vstmia sp, {q4} @ save adjusted round0 key
1388 vadd.u32 q10, q8, q9 @ compose 3<<96
1389 vadd.u32 q1, q0, q8 @ +1
1390 vadd.u32 q2, q0, q9 @ +2
1391 vadd.u32 q3, q0, q10 @ +3
1392 vadd.u32 q4, q1, q10
1393 vadd.u32 q5, q2, q10
1394 vadd.u32 q6, q3, q10
1395 vadd.u32 q7, q4, q10
1396 vadd.u32 q10, q5, q10 @ next counter
1398 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1399 @ to flip byte order in 32-bit counter
1401 vldmia sp, {q9} @ load round0 key
1402 #ifndef BSAES_ASM_EXTENDED_KEY
1403 add r4, sp, #0x10 @ pass next round key
1407 vldmia r8, {q8} @ .LREVM0SR
1408 mov r5, r10 @ pass rounds
1409 vstmia r9, {q10} @ save next counter
1410 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1412 bl _bsaes_encrypt8_alt
1415 blo .Lctr_enc_loop_done
1417 vld1.8 {q8-q9}, [r0]! @ load input
1418 vld1.8 {q10-q11}, [r0]!
1421 vld1.8 {q12-q13}, [r0]!
1424 vld1.8 {q14-q15}, [r0]!
1426 vst1.8 {q0-q1}, [r1]! @ write output
1432 vmov.i32 q8, #1 @ compose 1<<96
1436 vext.8 q8, q9, q8, #4
1438 vadd.u32 q9,q8,q8 @ compose 2<<96
1440 vldmia r9, {q0} @ load counter
1446 .Lctr_enc_loop_done:
1448 vld1.8 {q8}, [r0]! @ load input
1450 vst1.8 {q0}, [r1]! @ write output
1482 #ifndef BSAES_ASM_EXTENDED_KEY
1483 .Lctr_enc_bzero: @ wipe key schedule [if any]
1492 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1494 ldmia sp!, {r4-r10, pc} @ return
1498 ldr ip, [sp] @ ctr pointer is passed on stack
1499 stmdb sp!, {r4-r8, lr}
1501 mov r4, r0 @ copy arguments
1505 ldr r8, [ip, #12] @ load counter LSW
1506 vld1.8 {q1}, [ip] @ load whole counter value
1511 vst1.8 {q1}, [sp,:64] @ copy counter value
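@
@ Fewer than eight blocks: fall back to the scalar AES_encrypt, one
@ block at a time.  In the loop below the counter block sits at
@ sp+0x10, its encryption is produced at sp and XORed with one block
@ of input, and the big-endian 32-bit counter word at sp+0x1c is then
@ incremented.
@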
1514 .Lctr_enc_short_loop:
1515 add r0, sp, #0x10 @ input counter value
1516 mov r1, sp @ output on the stack
1521 vld1.8 {q0}, [r4]! @ load input
1522 vld1.8 {q1}, [sp,:64] @ load encrypted counter
1526 str r0, [sp, #0x1c] @ next counter value
1528 str r8, [sp, #0x1c] @ next counter value
1531 vst1.8 {q0}, [r5]! @ store output
1533 bne .Lctr_enc_short_loop
1539 ldmia sp!, {r4-r8, pc}
1540 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1541 .globl bsaes_xts_encrypt
1542 .hidden bsaes_xts_encrypt
1543 .type bsaes_xts_encrypt,%function
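@
@ Register usage: r0 = in, r1 = out, r2 = length in bytes, r3 = key1
@ (the data key schedule).  In the OpenSSL build the stack arguments
@ are key2 and the 16-byte IV, from which the initial tweak is
@ generated here with the scalar AES routine; in the kernel build
@ (XTS_CHAIN_TWEAK) the single stack argument points at an in/out
@ tweak buffer that is chained across calls instead.
@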
1547 stmdb sp!, {r4-r10, lr} @ 0x20
1549 mov r6, sp @ future r3
1556 sub r0, sp, #0x10 @ 0x10
1557 bic r0, #0xf @ align at 16 bytes
1560 #ifdef XTS_CHAIN_TWEAK
1561 ldr r0, [ip] @ pointer to input tweak
1563 @ generate initial tweak
1564 ldr r0, [ip, #4] @ iv[]
1566 ldr r2, [ip, #0] @ key2
1568 mov r0,sp @ pointer to initial tweak
1571 ldr r1, [r10, #240] @ get # of rounds
1573 #ifndef BSAES_ASM_EXTENDED_KEY
1574 @ allocate the key schedule on the stack
1575 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1576 @ add r12, #96 @ size of bit-sliced key schedule
1577 sub r12, #48 @ place for tweak[9]
1579 @ populate the key schedule
1580 mov r4, r10 @ pass key
1581 mov r5, r1 @ pass # of rounds
1583 add r12, #0x90 @ pass key schedule
1584 bl _bsaes_key_convert
1585 veor q7, q7, q15 @ fix up last round key
1586 vstmia r12, {q7} @ save last round key
1588 ldr r12, [r10, #244]
1592 str r12, [r10, #244]
1593 mov r4, r10 @ pass key
1594 mov r5, r1 @ pass # of rounds
1595 add r12, r10, #248 @ pass key schedule
1596 bl _bsaes_key_convert
1597 veor q7, q7, q15 @ fix up last round key
1601 0: sub sp, #0x90 @ place for tweak[9]
1604 vld1.8 {q8}, [r0] @ initial tweak
1613 vldmia r2, {q5} @ load XTS magic
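@
@ Compute the next eight tweaks: each step doubles the previous tweak
@ in GF(2^128) modulo x^128+x^7+x^2+x+1, i.e. the 128-bit value is
@ shifted left by one bit and the "XTS magic" constant 0x87 is XORed
@ into the low byte when the top bit falls off (vshr.s64 #63 builds
@ the carry mask).  An illustrative byte-wise C equivalent:
@
@	carry = tweak[15] >> 7;
@	for (i = 15; i > 0; i--)
@		tweak[i] = (tweak[i] << 1) | (tweak[i-1] >> 7);
@	tweak[0] = (tweak[0] << 1) ^ (carry ? 0x87 : 0x00);
@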
1614 vshr.s64 q6, q8, #63
1618 vst1.64 {q8}, [r0,:128]!
1620 vshr.s64 q7, q9, #63
1623 vadd.u64 q10, q9, q9
1624 vst1.64 {q9}, [r0,:128]!
1626 vshr.s64 q6, q10, #63
1630 vadd.u64 q11, q10, q10
1631 vst1.64 {q10}, [r0,:128]!
1633 vshr.s64 q7, q11, #63
1638 vadd.u64 q12, q11, q11
1639 vst1.64 {q11}, [r0,:128]!
1641 vshr.s64 q6, q12, #63
1646 vadd.u64 q13, q12, q12
1647 vst1.64 {q12}, [r0,:128]!
1649 vshr.s64 q7, q13, #63
1654 vadd.u64 q14, q13, q13
1655 vst1.64 {q13}, [r0,:128]!
1657 vshr.s64 q6, q14, #63
1662 vadd.u64 q15, q14, q14
1663 vst1.64 {q14}, [r0,:128]!
1665 vshr.s64 q7, q15, #63
1670 vadd.u64 q8, q15, q15
1671 vst1.64 {q15}, [r0,:128]!
1674 vst1.64 {q8}, [r0,:128] @ next round tweak
1676 vld1.8 {q6-q7}, [r7]!
1678 #ifndef BSAES_ASM_EXTENDED_KEY
1679 add r4, sp, #0x90 @ pass key schedule
1681 add r4, r10, #248 @ pass key schedule
1684 mov r5, r1 @ pass rounds
1690 vld1.64 {q8-q9}, [r0,:128]!
1691 vld1.64 {q10-q11}, [r0,:128]!
1693 vld1.64 {q12-q13}, [r0,:128]!
1696 vst1.8 {q0-q1}, [r8]!
1698 vld1.64 {q14-q15}, [r0,:128]!
1700 vst1.8 {q8-q9}, [r8]!
1703 vst1.8 {q10-q11}, [r8]!
1705 vst1.8 {q12-q13}, [r8]!
1707 vld1.64 {q8}, [r0,:128] @ next round tweak
1716 vldmia r2, {q5} @ load XTS magic
1717 vshr.s64 q7, q8, #63
1721 vst1.64 {q8}, [r0,:128]!
1723 vshr.s64 q6, q9, #63
1726 vadd.u64 q10, q9, q9
1727 vst1.64 {q9}, [r0,:128]!
1729 vshr.s64 q7, q10, #63
1735 vadd.u64 q11, q10, q10
1736 vst1.64 {q10}, [r0,:128]!
1738 vshr.s64 q6, q11, #63
1745 vadd.u64 q12, q11, q11
1746 vst1.64 {q11}, [r0,:128]!
1748 vshr.s64 q7, q12, #63
1755 vadd.u64 q13, q12, q12
1756 vst1.64 {q12}, [r0,:128]!
1758 vshr.s64 q6, q13, #63
1765 vadd.u64 q14, q13, q13
1766 vst1.64 {q13}, [r0,:128]!
1768 vshr.s64 q7, q14, #63
1775 vadd.u64 q15, q14, q14
1776 vst1.64 {q14}, [r0,:128]!
1778 vshr.s64 q6, q15, #63
1786 vst1.64 {q15}, [r0,:128] @ next round tweak
1790 #ifndef BSAES_ASM_EXTENDED_KEY
1791 add r4, sp, #0x90 @ pass key schedule
1793 add r4, r10, #248 @ pass key schedule
1796 mov r5, r1 @ pass rounds
1801 vld1.64 {q8-q9}, [r0,:128]!
1802 vld1.64 {q10-q11}, [r0,:128]!
1804 vld1.64 {q12-q13}, [r0,:128]!
1807 vst1.8 {q0-q1}, [r8]!
1809 vld1.64 {q14}, [r0,:128]!
1811 vst1.8 {q8-q9}, [r8]!
1814 vst1.8 {q10-q11}, [r8]!
1817 vld1.64 {q8}, [r0,:128] @ next round tweak
1821 vst1.64 {q14}, [r0,:128] @ next round tweak
1824 #ifndef BSAES_ASM_EXTENDED_KEY
1825 add r4, sp, #0x90 @ pass key schedule
1827 add r4, r10, #248 @ pass key schedule
1830 mov r5, r1 @ pass rounds
1835 vld1.64 {q8-q9}, [r0,:128]!
1836 vld1.64 {q10-q11}, [r0,:128]!
1838 vld1.64 {q12-q13}, [r0,:128]!
1841 vst1.8 {q0-q1}, [r8]!
1844 vst1.8 {q8-q9}, [r8]!
1846 vst1.8 {q10-q11}, [r8]!
1848 vld1.64 {q8}, [r0,:128] @ next round tweak
1851 @ put this in range for both ARM and Thumb mode adr instructions
1858 vst1.64 {q13}, [r0,:128] @ next round tweak
1861 #ifndef BSAES_ASM_EXTENDED_KEY
1862 add r4, sp, #0x90 @ pass key schedule
1864 add r4, r10, #248 @ pass key schedule
1867 mov r5, r1 @ pass rounds
1872 vld1.64 {q8-q9}, [r0,:128]!
1873 vld1.64 {q10-q11}, [r0,:128]!
1875 vld1.64 {q12}, [r0,:128]!
1878 vst1.8 {q0-q1}, [r8]!
1881 vst1.8 {q8-q9}, [r8]!
1884 vld1.64 {q8}, [r0,:128] @ next round tweak
1888 vst1.64 {q12}, [r0,:128] @ next round tweak
1891 #ifndef BSAES_ASM_EXTENDED_KEY
1892 add r4, sp, #0x90 @ pass key schedule
1894 add r4, r10, #248 @ pass key schedule
1897 mov r5, r1 @ pass rounds
1902 vld1.64 {q8-q9}, [r0,:128]!
1903 vld1.64 {q10-q11}, [r0,:128]!
1907 vst1.8 {q0-q1}, [r8]!
1909 vst1.8 {q8-q9}, [r8]!
1911 vld1.64 {q8}, [r0,:128] @ next round tweak
1915 vst1.64 {q11}, [r0,:128] @ next round tweak
1918 #ifndef BSAES_ASM_EXTENDED_KEY
1919 add r4, sp, #0x90 @ pass key schedule
1921 add r4, r10, #248 @ pass key schedule
1924 mov r5, r1 @ pass rounds
1929 vld1.64 {q8-q9}, [r0,:128]!
1930 vld1.64 {q10}, [r0,:128]!
1934 vst1.8 {q0-q1}, [r8]!
1937 vld1.64 {q8}, [r0,:128] @ next round tweak
1941 vst1.64 {q10}, [r0,:128] @ next round tweak
1944 #ifndef BSAES_ASM_EXTENDED_KEY
1945 add r4, sp, #0x90 @ pass key schedule
1947 add r4, r10, #248 @ pass key schedule
1950 mov r5, r1 @ pass rounds
1955 vld1.64 {q8-q9}, [r0,:128]!
1958 vst1.8 {q0-q1}, [r8]!
1960 vld1.64 {q8}, [r0,:128] @ next round tweak
1967 vst1.8 {q0}, [sp,:128]
1969 mov r4, r3 @ preserve fp
1973 vld1.8 {q0}, [sp,:128]
1978 vmov q8, q9 @ next round tweak
1981 #ifndef XTS_CHAIN_TWEAK
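@
@ Trailing partial block (#ifndef XTS_CHAIN_TWEAK): XTS ciphertext
@ stealing.  The remaining plaintext bytes replace the leading bytes
@ of the last complete ciphertext block, whose displaced bytes become
@ the final partial output block; the modified block is then encrypted
@ once more under the next tweak.  The kernel build (XTS_CHAIN_TWEAK)
@ compiles this out and returns the chained tweak to the caller instead.
@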
1988 ldrb r1, [r8, #-0x10]
1989 strb r0, [r8, #-0x10]
1999 vst1.8 {q0}, [sp,:128]
2001 mov r4, r3 @ preserve fp
2005 vld1.8 {q0}, [sp,:128]
2015 #ifdef XTS_CHAIN_TWEAK
2016 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2018 .Lxts_enc_bzero: @ wipe key schedule [if any]
2024 #ifdef XTS_CHAIN_TWEAK
2028 ldmia sp!, {r4-r10, pc} @ return
2030 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2032 .globl bsaes_xts_decrypt
2033 .hidden bsaes_xts_decrypt
2034 .type bsaes_xts_decrypt,%function
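@
@ Same calling convention as bsaes_xts_encrypt above.  When the length
@ is not a multiple of 16, the last complete block is held back (see
@ the "subtract another 16 bytes" adjustment below) so that ciphertext
@ stealing can be undone in the correct order.
@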
2038 stmdb sp!, {r4-r10, lr} @ 0x20
2040 mov r6, sp @ future r3
2047 sub r0, sp, #0x10 @ 0x10
2048 bic r0, #0xf @ align at 16 bytes
2051 #ifdef XTS_CHAIN_TWEAK
2052 ldr r0, [ip] @ pointer to input tweak
2054 @ generate initial tweak
2055 ldr r0, [ip, #4] @ iv[]
2057 ldr r2, [ip, #0] @ key2
2059 mov r0, sp @ pointer to initial tweak
2062 ldr r1, [r10, #240] @ get # of rounds
2064 #ifndef BSAES_ASM_EXTENDED_KEY
2065 @ allocate the key schedule on the stack
2066 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2067 @ add r12, #96 @ size of bit-sliced key schedule
2068 sub r12, #48 @ place for tweak[9]
2070 @ populate the key schedule
2071 mov r4, r10 @ pass key
2072 mov r5, r1 @ pass # of rounds
2074 add r12, #0x90 @ pass key schedule
2075 bl _bsaes_key_convert
2078 vstmia r12, {q15} @ save last round key
2079 veor q7, q7, q6 @ fix up round 0 key
2082 ldr r12, [r10, #244]
2086 str r12, [r10, #244]
2087 mov r4, r10 @ pass key
2088 mov r5, r1 @ pass # of rounds
2089 add r12, r10, #248 @ pass key schedule
2090 bl _bsaes_key_convert
2093 vstmia r12, {q15} @ save last round key
2094 veor q7, q7, q6 @ fix up round 0 key
2098 0: sub sp, #0x90 @ place for tweak[9]
2100 vld1.8 {q8}, [r0] @ initial tweak
2103 tst r9, #0xf @ if not multiple of 16
2104 it ne @ Thumb2 thing, sanity check in ARM
2105 subne r9, #0x10 @ subtract another 16 bytes
2113 vldmia r2, {q5} @ load XTS magic
2114 vshr.s64 q6, q8, #63
2118 vst1.64 {q8}, [r0,:128]!
2120 vshr.s64 q7, q9, #63
2123 vadd.u64 q10, q9, q9
2124 vst1.64 {q9}, [r0,:128]!
2126 vshr.s64 q6, q10, #63
2130 vadd.u64 q11, q10, q10
2131 vst1.64 {q10}, [r0,:128]!
2133 vshr.s64 q7, q11, #63
2138 vadd.u64 q12, q11, q11
2139 vst1.64 {q11}, [r0,:128]!
2141 vshr.s64 q6, q12, #63
2146 vadd.u64 q13, q12, q12
2147 vst1.64 {q12}, [r0,:128]!
2149 vshr.s64 q7, q13, #63
2154 vadd.u64 q14, q13, q13
2155 vst1.64 {q13}, [r0,:128]!
2157 vshr.s64 q6, q14, #63
2162 vadd.u64 q15, q14, q14
2163 vst1.64 {q14}, [r0,:128]!
2165 vshr.s64 q7, q15, #63
2170 vadd.u64 q8, q15, q15
2171 vst1.64 {q15}, [r0,:128]!
2174 vst1.64 {q8}, [r0,:128] @ next round tweak
2176 vld1.8 {q6-q7}, [r7]!
2178 #ifndef BSAES_ASM_EXTENDED_KEY
2179 add r4, sp, #0x90 @ pass key schedule
2181 add r4, r10, #248 @ pass key schedule
2184 mov r5, r1 @ pass rounds
2190 vld1.64 {q8-q9}, [r0,:128]!
2191 vld1.64 {q10-q11}, [r0,:128]!
2193 vld1.64 {q12-q13}, [r0,:128]!
2196 vst1.8 {q0-q1}, [r8]!
2198 vld1.64 {q14-q15}, [r0,:128]!
2200 vst1.8 {q8-q9}, [r8]!
2203 vst1.8 {q10-q11}, [r8]!
2205 vst1.8 {q12-q13}, [r8]!
2207 vld1.64 {q8}, [r0,:128] @ next round tweak
2216 vldmia r2, {q5} @ load XTS magic
2217 vshr.s64 q7, q8, #63
2221 vst1.64 {q8}, [r0,:128]!
2223 vshr.s64 q6, q9, #63
2226 vadd.u64 q10, q9, q9
2227 vst1.64 {q9}, [r0,:128]!
2229 vshr.s64 q7, q10, #63
2235 vadd.u64 q11, q10, q10
2236 vst1.64 {q10}, [r0,:128]!
2238 vshr.s64 q6, q11, #63
2245 vadd.u64 q12, q11, q11
2246 vst1.64 {q11}, [r0,:128]!
2248 vshr.s64 q7, q12, #63
2255 vadd.u64 q13, q12, q12
2256 vst1.64 {q12}, [r0,:128]!
2258 vshr.s64 q6, q13, #63
2265 vadd.u64 q14, q13, q13
2266 vst1.64 {q13}, [r0,:128]!
2268 vshr.s64 q7, q14, #63
2275 vadd.u64 q15, q14, q14
2276 vst1.64 {q14}, [r0,:128]!
2278 vshr.s64 q6, q15, #63
2286 vst1.64 {q15}, [r0,:128] @ next round tweak
2290 #ifndef BSAES_ASM_EXTENDED_KEY
2291 add r4, sp, #0x90 @ pass key schedule
2293 add r4, r10, #248 @ pass key schedule
2296 mov r5, r1 @ pass rounds
2301 vld1.64 {q8-q9}, [r0,:128]!
2302 vld1.64 {q10-q11}, [r0,:128]!
2304 vld1.64 {q12-q13}, [r0,:128]!
2307 vst1.8 {q0-q1}, [r8]!
2309 vld1.64 {q14}, [r0,:128]!
2311 vst1.8 {q8-q9}, [r8]!
2314 vst1.8 {q10-q11}, [r8]!
2317 vld1.64 {q8}, [r0,:128] @ next round tweak
2321 vst1.64 {q14}, [r0,:128] @ next round tweak
2324 #ifndef BSAES_ASM_EXTENDED_KEY
2325 add r4, sp, #0x90 @ pass key schedule
2327 add r4, r10, #248 @ pass key schedule
2330 mov r5, r1 @ pass rounds
2335 vld1.64 {q8-q9}, [r0,:128]!
2336 vld1.64 {q10-q11}, [r0,:128]!
2338 vld1.64 {q12-q13}, [r0,:128]!
2341 vst1.8 {q0-q1}, [r8]!
2344 vst1.8 {q8-q9}, [r8]!
2346 vst1.8 {q10-q11}, [r8]!
2348 vld1.64 {q8}, [r0,:128] @ next round tweak
2352 vst1.64 {q13}, [r0,:128] @ next round tweak
2355 #ifndef BSAES_ASM_EXTENDED_KEY
2356 add r4, sp, #0x90 @ pass key schedule
2358 add r4, r10, #248 @ pass key schedule
2361 mov r5, r1 @ pass rounds
2366 vld1.64 {q8-q9}, [r0,:128]!
2367 vld1.64 {q10-q11}, [r0,:128]!
2369 vld1.64 {q12}, [r0,:128]!
2372 vst1.8 {q0-q1}, [r8]!
2375 vst1.8 {q8-q9}, [r8]!
2378 vld1.64 {q8}, [r0,:128] @ next round tweak
2382 vst1.64 {q12}, [r0,:128] @ next round tweak
2385 #ifndef BSAES_ASM_EXTENDED_KEY
2386 add r4, sp, #0x90 @ pass key schedule
2388 add r4, r10, #248 @ pass key schedule
2391 mov r5, r1 @ pass rounds
2396 vld1.64 {q8-q9}, [r0,:128]!
2397 vld1.64 {q10-q11}, [r0,:128]!
2401 vst1.8 {q0-q1}, [r8]!
2403 vst1.8 {q8-q9}, [r8]!
2405 vld1.64 {q8}, [r0,:128] @ next round tweak
2409 vst1.64 {q11}, [r0,:128] @ next round tweak
2412 #ifndef BSAES_ASM_EXTENDED_KEY
2413 add r4, sp, #0x90 @ pass key schedule
2415 add r4, r10, #248 @ pass key schedule
2418 mov r5, r1 @ pass rounds
2423 vld1.64 {q8-q9}, [r0,:128]!
2424 vld1.64 {q10}, [r0,:128]!
2428 vst1.8 {q0-q1}, [r8]!
2431 vld1.64 {q8}, [r0,:128] @ next round tweak
2435 vst1.64 {q10}, [r0,:128] @ next round tweak
2438 #ifndef BSAES_ASM_EXTENDED_KEY
2439 add r4, sp, #0x90 @ pass key schedule
2441 add r4, r10, #248 @ pass key schedule
2444 mov r5, r1 @ pass rounds
2449 vld1.64 {q8-q9}, [r0,:128]!
2452 vst1.8 {q0-q1}, [r8]!
2454 vld1.64 {q8}, [r0,:128] @ next round tweak
2461 vst1.8 {q0}, [sp,:128]
2463 mov r4, r3 @ preserve fp
2464 mov r5, r2 @ preserve magic
2468 vld1.8 {q0}, [sp,:128]
2474 vmov q8, q9 @ next round tweak
2477 #ifndef XTS_CHAIN_TWEAK
2481 @ calculate one round of extra tweak for the stolen ciphertext
2483 vshr.s64 q6, q8, #63
2489 @ perform the final decryption with the last tweak value
2494 vst1.8 {q0}, [sp,:128]
2496 mov r4, r3 @ preserve fp
2500 vld1.8 {q0}, [sp,:128]
2508 strb r1, [r8, #0x10]
2518 vst1.8 {q0}, [sp,:128]
2523 vld1.8 {q0}, [sp,:128]
2533 #ifdef XTS_CHAIN_TWEAK
2534 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2536 .Lxts_dec_bzero: @ wipe key schedule [if any]
2542 #ifdef XTS_CHAIN_TWEAK
2546 ldmia sp!, {r4-r10, pc} @ return
2548 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt