2 @ ====================================================================
3 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4 @ project. The module is, however, dual licensed under OpenSSL and
5 @ CRYPTOGAMS licenses depending on where you obtain it. For further
6 @ details see http://www.openssl.org/~appro/cryptogams/.
8 @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
9 @ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is granted.
11 @ ====================================================================
13 @ Bit-sliced AES for ARM NEON
17 @ This implementation is a direct adaptation of the bsaes-x86_64 module for
18 @ ARM NEON, except that this module is endian-neutral [in the sense that
19 @ it can be compiled for either endianness] courtesy of vld1.8's
20 @ neutrality. The initial version doesn't implement an interface to OpenSSL,
21 @ only low-level primitives and unsupported entry points, just enough
22 @ to collect performance results, which for the Cortex-A8 core are:
24 @ encrypt 19.5 cycles per byte processed with 128-bit key
25 @ decrypt 22.1 cycles per byte processed with 128-bit key
26 @ key conv. 440 cycles per 128-bit key/0.18 of 8x block
28 @ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
29 @ which is [much] worse than anticipated (for further details see
30 @ http://www.openssl.org/~appro/Snapdragon-S4.html).
32 @ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
33 @ manages in 20.0 cycles].
35 @ When comparing to x86_64 results, keep in mind that the NEON unit is
36 @ [mostly] single-issue and thus can't [fully] benefit from
37 @ instruction-level parallelism. And when comparing to aes-armv4
38 @ results, keep in mind the key schedule conversion overhead (see
39 @ bsaes-x86_64.pl for further details)...
45 @ Add CBC, CTR and XTS subroutines, adapt for kernel use.
47 @ <ard.biesheuvel@linaro.org>
50 # include "arm_arch.h"
53 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
54 # define VFP_ABI_POP vldmia sp!,{d8-d15}
55 # define VFP_ABI_FRAME 0x40
59 # define VFP_ABI_FRAME 0
60 # define BSAES_ASM_EXTENDED_KEY
61 # define XTS_CHAIN_TWEAK
62 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
63 # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
70 #if __ARM_MAX_ARCH__>=7
75 .syntax unified @ ARMv7-capable assembler is expected to handle this
82 .type _bsaes_decrypt8,%function
85 adr r6,_bsaes_decrypt8
86 vldmia r4!, {q9} @ round 0 key
87 add r6,r6,#.LM0ISR-_bsaes_decrypt8
89 vldmia r6!, {q8} @ .LM0ISR
90 veor q10, q0, q9 @ xor with round0 key
101 vtbl.8 d6, {q13}, d16
102 vtbl.8 d7, {q13}, d17
104 vtbl.8 d8, {q14}, d16
105 vtbl.8 d9, {q14}, d17
107 vtbl.8 d10, {q15}, d16
108 vtbl.8 d11, {q15}, d17
110 vtbl.8 d12, {q10}, d16
111 vtbl.8 d13, {q10}, d17
112 vtbl.8 d14, {q11}, d16
113 vtbl.8 d15, {q11}, d17
114 vmov.i8 q8,#0x55 @ compose .LBS0
115 vmov.i8 q9,#0x33 @ compose .LBS1
123 vshl.u64 q10, q10, #1
125 vshl.u64 q11, q11, #1
135 vshl.u64 q10, q10, #1
137 vshl.u64 q11, q11, #1
140 vmov.i8 q8,#0x0f @ compose .LBS2
148 vshl.u64 q10, q10, #2
150 vshl.u64 q11, q11, #2
160 vshl.u64 q10, q10, #2
162 vshl.u64 q11, q11, #2
172 vshl.u64 q10, q10, #4
174 vshl.u64 q11, q11, #4
184 vshl.u64 q10, q10, #4
186 vshl.u64 q11, q11, #4
204 vtbl.8 d4, {q10}, d24
205 vtbl.8 d5, {q10}, d25
207 vtbl.8 d6, {q11}, d24
208 vtbl.8 d7, {q11}, d25
215 vtbl.8 d10, {q9}, d24
216 vtbl.8 d11, {q9}, d25
218 vtbl.8 d12, {q10}, d24
219 vtbl.8 d13, {q10}, d25
220 vtbl.8 d14, {q11}, d24
221 vtbl.8 d15, {q11}, d25
274 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
276 @ new smaller inversion
283 veor q14, q8, q14 @ q14=q15
372 @ multiplication by 0x05-0x00-0x04-0x00
373 vext.8 q8, q0, q0, #8
374 vext.8 q14, q3, q3, #8
375 vext.8 q15, q5, q5, #8
377 vext.8 q9, q1, q1, #8
379 vext.8 q10, q6, q6, #8
381 vext.8 q11, q4, q4, #8
383 vext.8 q12, q2, q2, #8
385 vext.8 q13, q7, q7, #8
404 vext.8 q8, q0, q0, #12 @ x0 <<< 32
405 vext.8 q9, q1, q1, #12
406 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
407 vext.8 q10, q6, q6, #12
409 vext.8 q11, q4, q4, #12
411 vext.8 q12, q2, q2, #12
413 vext.8 q13, q7, q7, #12
415 vext.8 q14, q3, q3, #12
417 vext.8 q15, q5, q5, #12
422 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
426 vext.8 q1, q1, q1, #8
431 vext.8 q8, q2, q2, #8
433 vext.8 q9, q7, q7, #8
435 vext.8 q2, q4, q4, #8
437 vext.8 q7, q5, q5, #8
439 vext.8 q4, q3, q3, #8
441 vext.8 q3, q6, q6, #8
450 vldmia r6, {q12} @ .LISR
451 ite eq @ Thumb2 thing, sanity check in ARM
454 vldmia r6, {q12} @ .LISRM0
458 vmov.i8 q8,#0x55 @ compose .LBS0
459 vmov.i8 q9,#0x33 @ compose .LBS1
467 vshl.u64 q10, q10, #1
469 vshl.u64 q11, q11, #1
479 vshl.u64 q10, q10, #1
481 vshl.u64 q11, q11, #1
484 vmov.i8 q8,#0x0f @ compose .LBS2
492 vshl.u64 q10, q10, #2
494 vshl.u64 q11, q11, #2
504 vshl.u64 q10, q10, #2
506 vshl.u64 q11, q11, #2
516 vshl.u64 q10, q10, #4
518 vshl.u64 q11, q11, #4
528 vshl.u64 q10, q10, #4
530 vshl.u64 q11, q11, #4
533 vldmia r4, {q8} @ last round key
543 .size _bsaes_decrypt8,.-_bsaes_decrypt8
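@
@ The mask/shift/veor passes above (masks 0x55, 0x33 and 0x0f, shift
@ distances 1, 2 and 4) are the classic "swapmove" bit-matrix transpose:
@ after the three passes each q register holds a single bit position
@ gathered from all eight input blocks, which is what lets the S-box be
@ evaluated as a boolean circuit. A rough C equivalent of one swapmove
@ step, operating on 64-bit lanes like the vshl.u64/vshr.u64 pairs do
@ (illustrative sketch only, not part of this module):
@
@   #include <stdint.h>
@
@   static inline void swapmove(uint64_t *a, uint64_t *b,
@                               uint64_t mask, int shift)
@   {
@       uint64_t t = ((*a >> shift) ^ *b) & mask;
@
@       *b ^= t;
@       *a ^= t << shift;
@   }
@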
545 .type _bsaes_const,%object
548 .LM0ISR: @ InvShiftRows constants
549 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
551 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
553 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
554 .LM0SR: @ ShiftRows constants
555 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
557 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
559 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
561 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
563 .quad 0x090d01050c000408, 0x03070b0f060a0e02
564 .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
566 .size _bsaes_const,.-_bsaes_const
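@
@ The .quad tables in _bsaes_const are vtbl.8 index vectors: each index
@ byte selects one byte of a 16-byte source register, which is how the
@ initial byte reordering, ShiftRows/InvShiftRows and the CTR byte
@ reversal are applied. An illustrative C model of a single 16-byte
@ vtbl.8 lookup (sketch only; out-of-range indices yield zero, as vtbl
@ does):
@
@   #include <stdint.h>
@
@   static void tbl16(uint8_t out[16], const uint8_t tbl[16],
@                     const uint8_t idx[16])
@   {
@       for (int i = 0; i < 16; i++)
@           out[i] = (idx[i] < 16) ? tbl[idx[i]] : 0;
@   }
@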
568 .type _bsaes_encrypt8,%function
571 adr r6,_bsaes_encrypt8
572 vldmia r4!, {q9} @ round 0 key
573 sub r6,r6,#_bsaes_encrypt8-.LM0SR
575 vldmia r6!, {q8} @ .LM0SR
577 veor q10, q0, q9 @ xor with round0 key
579 vtbl.8 d0, {q10}, d16
580 vtbl.8 d1, {q10}, d17
582 vtbl.8 d2, {q11}, d16
583 vtbl.8 d3, {q11}, d17
585 vtbl.8 d4, {q12}, d16
586 vtbl.8 d5, {q12}, d17
588 vtbl.8 d6, {q13}, d16
589 vtbl.8 d7, {q13}, d17
591 vtbl.8 d8, {q14}, d16
592 vtbl.8 d9, {q14}, d17
594 vtbl.8 d10, {q15}, d16
595 vtbl.8 d11, {q15}, d17
597 vtbl.8 d12, {q10}, d16
598 vtbl.8 d13, {q10}, d17
599 vtbl.8 d14, {q11}, d16
600 vtbl.8 d15, {q11}, d17
601 _bsaes_encrypt8_bitslice:
602 vmov.i8 q8,#0x55 @ compose .LBS0
603 vmov.i8 q9,#0x33 @ compose .LBS1
611 vshl.u64 q10, q10, #1
613 vshl.u64 q11, q11, #1
623 vshl.u64 q10, q10, #1
625 vshl.u64 q11, q11, #1
628 vmov.i8 q8,#0x0f @ compose .LBS2
636 vshl.u64 q10, q10, #2
638 vshl.u64 q11, q11, #2
648 vshl.u64 q10, q10, #2
650 vshl.u64 q11, q11, #2
660 vshl.u64 q10, q10, #4
662 vshl.u64 q11, q11, #4
672 vshl.u64 q10, q10, #4
674 vshl.u64 q11, q11, #4
692 vtbl.8 d4, {q10}, d24
693 vtbl.8 d5, {q10}, d25
695 vtbl.8 d6, {q11}, d24
696 vtbl.8 d7, {q11}, d25
703 vtbl.8 d10, {q9}, d24
704 vtbl.8 d11, {q9}, d25
706 vtbl.8 d12, {q10}, d24
707 vtbl.8 d13, {q10}, d25
708 vtbl.8 d14, {q11}, d24
709 vtbl.8 d15, {q11}, d25
763 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
765 @ new smaller inversion
772 veor q14, q8, q14 @ q14=q15
859 vext.8 q8, q0, q0, #12 @ x0 <<< 32
860 vext.8 q9, q1, q1, #12
861 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
862 vext.8 q10, q4, q4, #12
864 vext.8 q11, q6, q6, #12
866 vext.8 q12, q3, q3, #12
868 vext.8 q13, q7, q7, #12
870 vext.8 q14, q2, q2, #12
872 vext.8 q15, q5, q5, #12
877 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
881 vext.8 q1, q1, q1, #8
886 vext.8 q8, q3, q3, #8
888 vext.8 q9, q7, q7, #8
890 vext.8 q3, q6, q6, #8
892 vext.8 q7, q5, q5, #8
894 vext.8 q6, q2, q2, #8
896 vext.8 q2, q4, q4, #8
905 vldmia r6, {q12} @ .LSR
906 ite eq @ Thumb2 thing, sanity check in ARM
909 vldmia r6, {q12} @ .LSRM0
913 vmov.i8 q8,#0x55 @ compose .LBS0
914 vmov.i8 q9,#0x33 @ compose .LBS1
922 vshl.u64 q10, q10, #1
924 vshl.u64 q11, q11, #1
934 vshl.u64 q10, q10, #1
936 vshl.u64 q11, q11, #1
939 vmov.i8 q8,#0x0f @ compose .LBS2
947 vshl.u64 q10, q10, #2
949 vshl.u64 q11, q11, #2
959 vshl.u64 q10, q10, #2
961 vshl.u64 q11, q11, #2
971 vshl.u64 q10, q10, #4
973 vshl.u64 q11, q11, #4
983 vshl.u64 q10, q10, #4
985 vshl.u64 q11, q11, #4
988 vldmia r4, {q8} @ last round key
998 .size _bsaes_encrypt8,.-_bsaes_encrypt8
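@
@ The vext.8 #12 / #8 rotation network near the end of _bsaes_encrypt8
@ (the "x0 <<< 32" / "<<< 64" comments) implements MixColumns on the
@ bit-sliced state. For reference, the textbook per-column transform it
@ corresponds to, in plain non-bit-sliced C (sketch only, not part of
@ this module):
@
@   #include <stdint.h>
@
@   static inline uint8_t xtime(uint8_t x)   /* multiply by 2 in GF(2^8) */
@   {
@       return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b));
@   }
@
@   static void mix_column(uint8_t c[4])
@   {
@       uint8_t t = c[0] ^ c[1] ^ c[2] ^ c[3], c0 = c[0];
@
@       c[0] ^= t ^ xtime(c[0] ^ c[1]);
@       c[1] ^= t ^ xtime(c[1] ^ c[2]);
@       c[2] ^= t ^ xtime(c[2] ^ c[3]);
@       c[3] ^= t ^ xtime(c[3] ^ c0);
@   }
@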
999 .type _bsaes_key_convert,%function
1002 adr r6,_bsaes_key_convert
1003 vld1.8 {q7}, [r4]! @ load round 0 key
1004 sub r6,r6,#_bsaes_key_convert-.LM0
1005 vld1.8 {q15}, [r4]! @ load round 1 key
1007 vmov.i8 q8, #0x01 @ bit masks
1013 vldmia r6, {q14} @ .LM0
1020 vstmia r12!, {q7} @ save round 0 key
1025 vtbl.8 d14,{q15},d28
1026 vtbl.8 d15,{q15},d29
1038 vld1.8 {q15}, [r4]! @ load next round key
1039 vmvn q0, q0 @ "pnot"
1047 vstmia r12!,{q0-q7} @ write bit-sliced round key
1050 vmov.i8 q7,#0x63 @ compose .L63
1051 @ don't save last round key
1053 .size _bsaes_key_convert,.-_bsaes_key_convert
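@
@ _bsaes_key_convert expands every 16-byte round key into eight 16-byte
@ bit-plane masks so that adding a round key to the bit-sliced state is
@ just eight veor instructions. A simplified C sketch of that expansion
@ (assumption: it ignores the .LM0 byte permutation and the 0x63/"pnot"
@ S-box fix-ups the real routine folds in):
@
@   #include <stdint.h>
@
@   static void bs_expand_round_key(const uint8_t rk[16], uint8_t out[8][16])
@   {
@       for (int bit = 0; bit < 8; bit++)
@           for (int byte = 0; byte < 16; byte++)
@               out[bit][byte] = ((rk[byte] >> bit) & 1) ? 0xff : 0x00;
@   }
@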
1054 .extern AES_cbc_encrypt
1057 .global bsaes_cbc_encrypt
1058 .type bsaes_cbc_encrypt,%function
1072 @ it is up to the caller to make sure we are called with enc == 0
1075 stmdb sp!, {r4-r10, lr}
1077 ldr r8, [ip] @ IV is 1st arg on the stack
1078 mov r2, r2, lsr#4 @ len in 16 byte blocks
1079 sub sp, #0x10 @ scratch space to carry over the IV
1080 mov r9, sp @ save sp
1082 ldr r10, [r3, #240] @ get # of rounds
1083 #ifndef BSAES_ASM_EXTENDED_KEY
1084 @ allocate the key schedule on the stack
1085 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1086 add r12, #96 @ size of bit-sliced key schedule
1088 @ populate the key schedule
1089 mov r4, r3 @ pass key
1090 mov r5, r10 @ pass # of rounds
1091 mov sp, r12 @ sp now points to the key schedule
1092 bl _bsaes_key_convert
1094 vstmia r12, {q15} @ save last round key
1095 veor q7, q7, q6 @ fix up round 0 key
1102 @ populate the key schedule
1104 mov r4, r3 @ pass key
1105 mov r5, r10 @ pass # of rounds
1106 add r12, r3, #248 @ pass key schedule
1107 bl _bsaes_key_convert
1110 vstmia r12, {q15} @ save last round key
1111 veor q7, q7, q6 @ fix up round 0 key
1118 vld1.8 {q15}, [r8] @ load IV
1124 bmi .Lcbc_dec_loop_finish
1126 vld1.8 {q0-q1}, [r0]! @ load input
1127 vld1.8 {q2-q3}, [r0]!
1128 #ifndef BSAES_ASM_EXTENDED_KEY
1129 mov r4, sp @ pass the key
1133 vld1.8 {q4-q5}, [r0]!
1135 vld1.8 {q6-q7}, [r0]
1137 vstmia r9, {q15} @ put aside IV
1141 vldmia r9, {q14} @ reload IV
1142 vld1.8 {q8-q9}, [r0]! @ reload input
1143 veor q0, q0, q14 @ ^= IV
1144 vld1.8 {q10-q11}, [r0]!
1147 vld1.8 {q12-q13}, [r0]!
1150 vld1.8 {q14-q15}, [r0]!
1152 vst1.8 {q0-q1}, [r1]! @ write output
1164 .Lcbc_dec_loop_finish:
1168 vld1.8 {q0}, [r0]! @ load input
1172 #ifndef BSAES_ASM_EXTENDED_KEY
1173 mov r4, sp @ pass the key
1178 vstmia r9, {q15} @ put aside IV
1195 vldmia r9, {q14} @ reload IV
1196 vld1.8 {q8-q9}, [r0]! @ reload input
1197 veor q0, q0, q14 @ ^= IV
1198 vld1.8 {q10-q11}, [r0]!
1201 vld1.8 {q12-q13}, [r0]!
1206 vst1.8 {q0-q1}, [r1]! @ write output
1218 vldmia r9,{q14} @ reload IV
1219 vld1.8 {q8-q9}, [r0]! @ reload input
1220 veor q0, q0, q14 @ ^= IV
1221 vld1.8 {q10-q11}, [r0]!
1229 vst1.8 {q0-q1}, [r1]! @ write output
1239 vldmia r9, {q14} @ reload IV
1240 vld1.8 {q8-q9}, [r0]! @ reload input
1241 veor q0, q0, q14 @ ^= IV
1242 vld1.8 {q10-q11}, [r0]!
1247 vst1.8 {q0-q1}, [r1]! @ write output
1257 vldmia r9, {q14} @ reload IV
1258 vld1.8 {q8-q9}, [r0]! @ reload input
1259 veor q0, q0, q14 @ ^= IV
1265 vst1.8 {q0-q1}, [r1]! @ write output
1273 vldmia r9, {q14} @ reload IV
1274 vld1.8 {q8-q9}, [r0]! @ reload input
1275 veor q0, q0, q14 @ ^= IV
1279 vst1.8 {q0-q1}, [r1]! @ write output
1286 vldmia r9, {q14} @ reload IV
1287 vld1.8 {q8}, [r0]! @ reload input
1288 veor q0, q0, q14 @ ^= IV
1289 vld1.8 {q15}, [r0]! @ reload input
1291 vst1.8 {q0-q1}, [r1]! @ write output
1296 mov r10, r1 @ save original out pointer
1297 mov r1, r9 @ use the iv scratch space as out buffer
1299 vmov q4,q15 @ just in case ensure that IV
1300 vmov q5,q0 @ and input are preserved
1302 vld1.8 {q0}, [r9,:64] @ load result
1303 veor q0, q0, q4 @ ^= IV
1304 vmov q15, q5 @ q5 holds input
1305 vst1.8 {q0}, [r10] @ write output
1308 #ifndef BSAES_ASM_EXTENDED_KEY
1311 .Lcbc_dec_bzero: @ wipe key schedule [if any]
1318 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1319 vst1.8 {q15}, [r8] @ return IV
1321 ldmia sp!, {r4-r10, pc}
1322 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
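@
@ bsaes_cbc_encrypt only implements the decryption direction (the caller
@ must pass enc == 0) and feeds up to eight blocks at a time into
@ _bsaes_decrypt8. Functionally it is plain CBC decryption,
@ P[i] = D_K(C[i]) ^ C[i-1], with the IV chained back to the caller.
@ Illustrative one-block-at-a-time C outline; aes_decrypt_block() is a
@ hypothetical stand-in for the block cipher, not a symbol provided by
@ this module:
@
@   #include <stdint.h>
@   #include <string.h>
@
@   void aes_decrypt_block(const void *key, const uint8_t in[16],
@                          uint8_t out[16]);
@
@   static void cbc_decrypt(const uint8_t *in, uint8_t *out, size_t blocks,
@                           const void *key, uint8_t iv[16])
@   {
@       uint8_t prev[16], save[16];
@
@       memcpy(prev, iv, 16);
@       for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
@           memcpy(save, in, 16);            /* ciphertext is the next IV */
@           aes_decrypt_block(key, in, out);
@           for (int b = 0; b < 16; b++)
@               out[b] ^= prev[b];
@           memcpy(prev, save, 16);
@       }
@       memcpy(iv, prev, 16);                /* return the IV to the caller */
@   }
@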
1324 .global bsaes_ctr32_encrypt_blocks
1325 .type bsaes_ctr32_encrypt_blocks,%function
1327 bsaes_ctr32_encrypt_blocks:
1328 cmp r2, #8 @ use plain AES for
1329 blo .Lctr_enc_short @ small sizes
1332 stmdb sp!, {r4-r10, lr}
1334 ldr r8, [ip] @ ctr is 1st arg on the stack
1335 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1336 mov r9, sp @ save sp
1338 ldr r10, [r3, #240] @ get # of rounds
1339 #ifndef BSAES_ASM_EXTENDED_KEY
1340 @ allocate the key schedule on the stack
1341 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1342 add r12, #96 @ size of bit-sliced key schedule
1344 @ populate the key schedule
1345 mov r4, r3 @ pass key
1346 mov r5, r10 @ pass # of rounds
1347 mov sp, r12 @ sp now points to the key schedule
1348 bl _bsaes_key_convert
1349 veor q7,q7,q15 @ fix up last round key
1350 vstmia r12, {q7} @ save last round key
1352 vld1.8 {q0}, [r8] @ load counter
1353 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1354 vldmia sp, {q4} @ load round0 key
1360 @ populate the key schedule
1362 mov r4, r3 @ pass key
1363 mov r5, r10 @ pass # of rounds
1364 add r12, r3, #248 @ pass key schedule
1365 bl _bsaes_key_convert
1366 veor q7,q7,q15 @ fix up last round key
1367 vstmia r12, {q7} @ save last round key
1370 0: add r12, r3, #248
1371 vld1.8 {q0}, [r8] @ load counter
1372 adrl r8, .LREVM0SR @ borrow r8
1373 vldmia r12, {q4} @ load round0 key
1374 sub sp, #0x10 @ place for adjusted round0 key
1377 vmov.i32 q8,#1 @ compose 1<<96
1382 vadd.u32 q9,q8,q8 @ compose 2<<96
1383 vstmia sp, {q4} @ save adjusted round0 key
1388 vadd.u32 q10, q8, q9 @ compose 3<<96
1389 vadd.u32 q1, q0, q8 @ +1
1390 vadd.u32 q2, q0, q9 @ +2
1391 vadd.u32 q3, q0, q10 @ +3
1392 vadd.u32 q4, q1, q10
1393 vadd.u32 q5, q2, q10
1394 vadd.u32 q6, q3, q10
1395 vadd.u32 q7, q4, q10
1396 vadd.u32 q10, q5, q10 @ next counter
1398 @ Borrow the prologue from _bsaes_encrypt8 and use the opportunity
1399 @ to flip the byte order in the 32-bit counter
1401 vldmia sp, {q9} @ load round0 key
1402 #ifndef BSAES_ASM_EXTENDED_KEY
1403 add r4, sp, #0x10 @ pass next round key
1407 vldmia r8, {q8} @ .LREVM0SR
1408 mov r5, r10 @ pass rounds
1409 vstmia r9, {q10} @ save next counter
1410 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1412 bl _bsaes_encrypt8_alt
1415 blo .Lctr_enc_loop_done
1417 vld1.8 {q8-q9}, [r0]! @ load input
1418 vld1.8 {q10-q11}, [r0]!
1421 vld1.8 {q12-q13}, [r0]!
1424 vld1.8 {q14-q15}, [r0]!
1426 vst1.8 {q0-q1}, [r1]! @ write output
1432 vmov.i32 q8, #1 @ compose 1<<96
1436 vext.8 q8, q9, q8, #4
1438 vadd.u32 q9,q8,q8 @ compose 2<<96
1440 vldmia r9, {q0} @ load counter
1446 .Lctr_enc_loop_done:
1448 vld1.8 {q8}, [r0]! @ load input
1450 vst1.8 {q0}, [r1]! @ write output
1482 #ifndef BSAES_ASM_EXTENDED_KEY
1483 .Lctr_enc_bzero: @ wipe key schedule [if any]
1492 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1494 ldmia sp!, {r4-r10, pc} @ return
1498 ldr ip, [sp] @ ctr pointer is passed on stack
1499 stmdb sp!, {r4-r8, lr}
1501 mov r4, r0 @ copy arguments
1505 ldr r8, [ip, #12] @ load counter LSW
1506 vld1.8 {q1}, [ip] @ load whole counter value
1511 vst1.8 {q1}, [sp,:64] @ copy counter value
1514 .Lctr_enc_short_loop:
1515 add r0, sp, #0x10 @ input counter value
1516 mov r1, sp @ output on the stack
1521 vld1.8 {q0}, [r4]! @ load input
1522 vld1.8 {q1}, [sp,:64] @ load encrypted counter
1526 str r0, [sp, #0x1c] @ next counter value
1528 str r8, [sp, #0x1c] @ next counter value
1531 vst1.8 {q0}, [r5]! @ store output
1533 bne .Lctr_enc_short_loop
1539 ldmia sp!, {r4-r8, pc}
1540 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
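@
@ bsaes_ctr32_encrypt_blocks implements CTR mode with a 32-bit big-endian
@ counter in the last four bytes of the counter block; the "compose 1<<96"
@ vector adds above generate eight consecutive counter values per pass.
@ Illustrative one-block-at-a-time C outline; aes_encrypt_block() is a
@ hypothetical stand-in, not a symbol provided by this module:
@
@   #include <stdint.h>
@   #include <string.h>
@
@   void aes_encrypt_block(const void *key, const uint8_t in[16],
@                          uint8_t out[16]);
@
@   static void ctr32_encrypt(const uint8_t *in, uint8_t *out, size_t blocks,
@                             const void *key, uint8_t ctr[16])
@   {
@       for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
@           uint8_t ks[16];
@           uint32_t c;
@
@           aes_encrypt_block(key, ctr, ks);
@           for (int b = 0; b < 16; b++)
@               out[b] = in[b] ^ ks[b];
@           c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
@               ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];
@           c++;                             /* only the low 32 bits wrap */
@           ctr[12] = (uint8_t)(c >> 24);
@           ctr[13] = (uint8_t)(c >> 16);
@           ctr[14] = (uint8_t)(c >> 8);
@           ctr[15] = (uint8_t)c;
@       }
@   }
@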
1541 .globl bsaes_xts_encrypt
1542 .type bsaes_xts_encrypt,%function
1546 stmdb sp!, {r4-r10, lr} @ 0x20
1548 mov r6, sp @ future r3
1555 sub r0, sp, #0x10 @ 0x10
1556 bic r0, #0xf @ align at 16 bytes
1559 #ifdef XTS_CHAIN_TWEAK
1560 ldr r0, [ip] @ pointer to input tweak
1562 @ generate initial tweak
1563 ldr r0, [ip, #4] @ iv[]
1565 ldr r2, [ip, #0] @ key2
1567 mov r0,sp @ pointer to initial tweak
1570 ldr r1, [r10, #240] @ get # of rounds
1572 #ifndef BSAES_ASM_EXTENDED_KEY
1573 @ allocate the key schedule on the stack
1574 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1575 @ add r12, #96 @ size of bit-sliced key schedule
1576 sub r12, #48 @ place for tweak[9]
1578 @ populate the key schedule
1579 mov r4, r10 @ pass key
1580 mov r5, r1 @ pass # of rounds
1582 add r12, #0x90 @ pass key schedule
1583 bl _bsaes_key_convert
1584 veor q7, q7, q15 @ fix up last round key
1585 vstmia r12, {q7} @ save last round key
1587 ldr r12, [r10, #244]
1591 str r12, [r10, #244]
1592 mov r4, r10 @ pass key
1593 mov r5, r1 @ pass # of rounds
1594 add r12, r10, #248 @ pass key schedule
1595 bl _bsaes_key_convert
1596 veor q7, q7, q15 @ fix up last round key
1600 0: sub sp, #0x90 @ place for tweak[9]
1603 vld1.8 {q8}, [r0] @ initial tweak
1612 vldmia r2, {q5} @ load XTS magic
1613 vshr.s64 q6, q8, #63
1617 vst1.64 {q8}, [r0,:128]!
1619 vshr.s64 q7, q9, #63
1622 vadd.u64 q10, q9, q9
1623 vst1.64 {q9}, [r0,:128]!
1625 vshr.s64 q6, q10, #63
1629 vadd.u64 q11, q10, q10
1630 vst1.64 {q10}, [r0,:128]!
1632 vshr.s64 q7, q11, #63
1637 vadd.u64 q12, q11, q11
1638 vst1.64 {q11}, [r0,:128]!
1640 vshr.s64 q6, q12, #63
1645 vadd.u64 q13, q12, q12
1646 vst1.64 {q12}, [r0,:128]!
1648 vshr.s64 q7, q13, #63
1653 vadd.u64 q14, q13, q13
1654 vst1.64 {q13}, [r0,:128]!
1656 vshr.s64 q6, q14, #63
1661 vadd.u64 q15, q14, q14
1662 vst1.64 {q14}, [r0,:128]!
1664 vshr.s64 q7, q15, #63
1669 vadd.u64 q8, q15, q15
1670 vst1.64 {q15}, [r0,:128]!
1673 vst1.64 {q8}, [r0,:128] @ next round tweak
1675 vld1.8 {q6-q7}, [r7]!
1677 #ifndef BSAES_ASM_EXTENDED_KEY
1678 add r4, sp, #0x90 @ pass key schedule
1680 add r4, r10, #248 @ pass key schedule
1683 mov r5, r1 @ pass rounds
1689 vld1.64 {q8-q9}, [r0,:128]!
1690 vld1.64 {q10-q11}, [r0,:128]!
1692 vld1.64 {q12-q13}, [r0,:128]!
1695 vst1.8 {q0-q1}, [r8]!
1697 vld1.64 {q14-q15}, [r0,:128]!
1699 vst1.8 {q8-q9}, [r8]!
1702 vst1.8 {q10-q11}, [r8]!
1704 vst1.8 {q12-q13}, [r8]!
1706 vld1.64 {q8}, [r0,:128] @ next round tweak
1715 vldmia r2, {q5} @ load XTS magic
1716 vshr.s64 q7, q8, #63
1720 vst1.64 {q8}, [r0,:128]!
1722 vshr.s64 q6, q9, #63
1725 vadd.u64 q10, q9, q9
1726 vst1.64 {q9}, [r0,:128]!
1728 vshr.s64 q7, q10, #63
1734 vadd.u64 q11, q10, q10
1735 vst1.64 {q10}, [r0,:128]!
1737 vshr.s64 q6, q11, #63
1744 vadd.u64 q12, q11, q11
1745 vst1.64 {q11}, [r0,:128]!
1747 vshr.s64 q7, q12, #63
1754 vadd.u64 q13, q12, q12
1755 vst1.64 {q12}, [r0,:128]!
1757 vshr.s64 q6, q13, #63
1764 vadd.u64 q14, q13, q13
1765 vst1.64 {q13}, [r0,:128]!
1767 vshr.s64 q7, q14, #63
1774 vadd.u64 q15, q14, q14
1775 vst1.64 {q14}, [r0,:128]!
1777 vshr.s64 q6, q15, #63
1785 vst1.64 {q15}, [r0,:128] @ next round tweak
1789 #ifndef BSAES_ASM_EXTENDED_KEY
1790 add r4, sp, #0x90 @ pass key schedule
1792 add r4, r10, #248 @ pass key schedule
1795 mov r5, r1 @ pass rounds
1800 vld1.64 {q8-q9}, [r0,:128]!
1801 vld1.64 {q10-q11}, [r0,:128]!
1803 vld1.64 {q12-q13}, [r0,:128]!
1806 vst1.8 {q0-q1}, [r8]!
1808 vld1.64 {q14}, [r0,:128]!
1810 vst1.8 {q8-q9}, [r8]!
1813 vst1.8 {q10-q11}, [r8]!
1816 vld1.64 {q8}, [r0,:128] @ next round tweak
1820 vst1.64 {q14}, [r0,:128] @ next round tweak
1823 #ifndef BSAES_ASM_EXTENDED_KEY
1824 add r4, sp, #0x90 @ pass key schedule
1826 add r4, r10, #248 @ pass key schedule
1829 mov r5, r1 @ pass rounds
1834 vld1.64 {q8-q9}, [r0,:128]!
1835 vld1.64 {q10-q11}, [r0,:128]!
1837 vld1.64 {q12-q13}, [r0,:128]!
1840 vst1.8 {q0-q1}, [r8]!
1843 vst1.8 {q8-q9}, [r8]!
1845 vst1.8 {q10-q11}, [r8]!
1847 vld1.64 {q8}, [r0,:128] @ next round tweak
1850 @ put this in range for both ARM and Thumb mode adr instructions
1857 vst1.64 {q13}, [r0,:128] @ next round tweak
1860 #ifndef BSAES_ASM_EXTENDED_KEY
1861 add r4, sp, #0x90 @ pass key schedule
1863 add r4, r10, #248 @ pass key schedule
1866 mov r5, r1 @ pass rounds
1871 vld1.64 {q8-q9}, [r0,:128]!
1872 vld1.64 {q10-q11}, [r0,:128]!
1874 vld1.64 {q12}, [r0,:128]!
1877 vst1.8 {q0-q1}, [r8]!
1880 vst1.8 {q8-q9}, [r8]!
1883 vld1.64 {q8}, [r0,:128] @ next round tweak
1887 vst1.64 {q12}, [r0,:128] @ next round tweak
1890 #ifndef BSAES_ASM_EXTENDED_KEY
1891 add r4, sp, #0x90 @ pass key schedule
1893 add r4, r10, #248 @ pass key schedule
1896 mov r5, r1 @ pass rounds
1901 vld1.64 {q8-q9}, [r0,:128]!
1902 vld1.64 {q10-q11}, [r0,:128]!
1906 vst1.8 {q0-q1}, [r8]!
1908 vst1.8 {q8-q9}, [r8]!
1910 vld1.64 {q8}, [r0,:128] @ next round tweak
1914 vst1.64 {q11}, [r0,:128] @ next round tweak
1917 #ifndef BSAES_ASM_EXTENDED_KEY
1918 add r4, sp, #0x90 @ pass key schedule
1920 add r4, r10, #248 @ pass key schedule
1923 mov r5, r1 @ pass rounds
1928 vld1.64 {q8-q9}, [r0,:128]!
1929 vld1.64 {q10}, [r0,:128]!
1933 vst1.8 {q0-q1}, [r8]!
1936 vld1.64 {q8}, [r0,:128] @ next round tweak
1940 vst1.64 {q10}, [r0,:128] @ next round tweak
1943 #ifndef BSAES_ASM_EXTENDED_KEY
1944 add r4, sp, #0x90 @ pass key schedule
1946 add r4, r10, #248 @ pass key schedule
1949 mov r5, r1 @ pass rounds
1954 vld1.64 {q8-q9}, [r0,:128]!
1957 vst1.8 {q0-q1}, [r8]!
1959 vld1.64 {q8}, [r0,:128] @ next round tweak
1966 vst1.8 {q0}, [sp,:128]
1968 mov r4, r3 @ preserve fp
1972 vld1.8 {q0}, [sp,:128]
1977 vmov q8, q9 @ next round tweak
1980 #ifndef XTS_CHAIN_TWEAK
1987 ldrb r1, [r8, #-0x10]
1988 strb r0, [r8, #-0x10]
1998 vst1.8 {q0}, [sp,:128]
2000 mov r4, r3 @ preserve fp
2004 vld1.8 {q0}, [sp,:128]
2014 #ifdef XTS_CHAIN_TWEAK
2015 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2017 .Lxts_enc_bzero: @ wipe key schedule [if any]
2023 #ifdef XTS_CHAIN_TWEAK
2027 ldmia sp!, {r4-r10, pc} @ return
2029 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
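@
@ The vshr.s64 #63 / "XTS magic" / vadd.u64 / veor sequences in the tweak
@ schedule above multiply the 128-bit tweak by x in GF(2^128) modulo
@ x^128 + x^7 + x^2 + x + 1 (the 0x87 constant); the arithmetic shift
@ turns each lane's sign bit into a carry mask. Equivalent C sketch with
@ the tweak held as two little-endian 64-bit words, t[0] low and t[1]
@ high (sketch only, not part of this module):
@
@   #include <stdint.h>
@
@   static inline void xts_tweak_double(uint64_t t[2])
@   {
@       uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
@
@       t[1] = (t[1] << 1) | (t[0] >> 63);
@       t[0] = (t[0] << 1) ^ carry;
@   }
@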
2031 .globl bsaes_xts_decrypt
2032 .type bsaes_xts_decrypt,%function
2036 stmdb sp!, {r4-r10, lr} @ 0x20
2038 mov r6, sp @ future r3
2045 sub r0, sp, #0x10 @ 0x10
2046 bic r0, #0xf @ align at 16 bytes
2049 #ifdef XTS_CHAIN_TWEAK
2050 ldr r0, [ip] @ pointer to input tweak
2052 @ generate initial tweak
2053 ldr r0, [ip, #4] @ iv[]
2055 ldr r2, [ip, #0] @ key2
2057 mov r0, sp @ pointer to initial tweak
2060 ldr r1, [r10, #240] @ get # of rounds
2062 #ifndef BSAES_ASM_EXTENDED_KEY
2063 @ allocate the key schedule on the stack
2064 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2065 @ add r12, #96 @ size of bit-sliced key schedule
2066 sub r12, #48 @ place for tweak[9]
2068 @ populate the key schedule
2069 mov r4, r10 @ pass key
2070 mov r5, r1 @ pass # of rounds
2072 add r12, #0x90 @ pass key schedule
2073 bl _bsaes_key_convert
2076 vstmia r12, {q15} @ save last round key
2077 veor q7, q7, q6 @ fix up round 0 key
2080 ldr r12, [r10, #244]
2084 str r12, [r10, #244]
2085 mov r4, r10 @ pass key
2086 mov r5, r1 @ pass # of rounds
2087 add r12, r10, #248 @ pass key schedule
2088 bl _bsaes_key_convert
2091 vstmia r12, {q15} @ save last round key
2092 veor q7, q7, q6 @ fix up round 0 key
2096 0: sub sp, #0x90 @ place for tweak[9]
2098 vld1.8 {q8}, [r0] @ initial tweak
2101 tst r9, #0xf @ if not multiple of 16
2102 it ne @ Thumb2 thing, sanity check in ARM
2103 subne r9, #0x10 @ subtract another 16 bytes
2111 vldmia r2, {q5} @ load XTS magic
2112 vshr.s64 q6, q8, #63
2116 vst1.64 {q8}, [r0,:128]!
2118 vshr.s64 q7, q9, #63
2121 vadd.u64 q10, q9, q9
2122 vst1.64 {q9}, [r0,:128]!
2124 vshr.s64 q6, q10, #63
2128 vadd.u64 q11, q10, q10
2129 vst1.64 {q10}, [r0,:128]!
2131 vshr.s64 q7, q11, #63
2136 vadd.u64 q12, q11, q11
2137 vst1.64 {q11}, [r0,:128]!
2139 vshr.s64 q6, q12, #63
2144 vadd.u64 q13, q12, q12
2145 vst1.64 {q12}, [r0,:128]!
2147 vshr.s64 q7, q13, #63
2152 vadd.u64 q14, q13, q13
2153 vst1.64 {q13}, [r0,:128]!
2155 vshr.s64 q6, q14, #63
2160 vadd.u64 q15, q14, q14
2161 vst1.64 {q14}, [r0,:128]!
2163 vshr.s64 q7, q15, #63
2168 vadd.u64 q8, q15, q15
2169 vst1.64 {q15}, [r0,:128]!
2172 vst1.64 {q8}, [r0,:128] @ next round tweak
2174 vld1.8 {q6-q7}, [r7]!
2176 #ifndef BSAES_ASM_EXTENDED_KEY
2177 add r4, sp, #0x90 @ pass key schedule
2179 add r4, r10, #248 @ pass key schedule
2182 mov r5, r1 @ pass rounds
2188 vld1.64 {q8-q9}, [r0,:128]!
2189 vld1.64 {q10-q11}, [r0,:128]!
2191 vld1.64 {q12-q13}, [r0,:128]!
2194 vst1.8 {q0-q1}, [r8]!
2196 vld1.64 {q14-q15}, [r0,:128]!
2198 vst1.8 {q8-q9}, [r8]!
2201 vst1.8 {q10-q11}, [r8]!
2203 vst1.8 {q12-q13}, [r8]!
2205 vld1.64 {q8}, [r0,:128] @ next round tweak
2214 vldmia r2, {q5} @ load XTS magic
2215 vshr.s64 q7, q8, #63
2219 vst1.64 {q8}, [r0,:128]!
2221 vshr.s64 q6, q9, #63
2224 vadd.u64 q10, q9, q9
2225 vst1.64 {q9}, [r0,:128]!
2227 vshr.s64 q7, q10, #63
2233 vadd.u64 q11, q10, q10
2234 vst1.64 {q10}, [r0,:128]!
2236 vshr.s64 q6, q11, #63
2243 vadd.u64 q12, q11, q11
2244 vst1.64 {q11}, [r0,:128]!
2246 vshr.s64 q7, q12, #63
2253 vadd.u64 q13, q12, q12
2254 vst1.64 {q12}, [r0,:128]!
2256 vshr.s64 q6, q13, #63
2263 vadd.u64 q14, q13, q13
2264 vst1.64 {q13}, [r0,:128]!
2266 vshr.s64 q7, q14, #63
2273 vadd.u64 q15, q14, q14
2274 vst1.64 {q14}, [r0,:128]!
2276 vshr.s64 q6, q15, #63
2284 vst1.64 {q15}, [r0,:128] @ next round tweak
2288 #ifndef BSAES_ASM_EXTENDED_KEY
2289 add r4, sp, #0x90 @ pass key schedule
2291 add r4, r10, #248 @ pass key schedule
2294 mov r5, r1 @ pass rounds
2299 vld1.64 {q8-q9}, [r0,:128]!
2300 vld1.64 {q10-q11}, [r0,:128]!
2302 vld1.64 {q12-q13}, [r0,:128]!
2305 vst1.8 {q0-q1}, [r8]!
2307 vld1.64 {q14}, [r0,:128]!
2309 vst1.8 {q8-q9}, [r8]!
2312 vst1.8 {q10-q11}, [r8]!
2315 vld1.64 {q8}, [r0,:128] @ next round tweak
2319 vst1.64 {q14}, [r0,:128] @ next round tweak
2322 #ifndef BSAES_ASM_EXTENDED_KEY
2323 add r4, sp, #0x90 @ pass key schedule
2325 add r4, r10, #248 @ pass key schedule
2328 mov r5, r1 @ pass rounds
2333 vld1.64 {q8-q9}, [r0,:128]!
2334 vld1.64 {q10-q11}, [r0,:128]!
2336 vld1.64 {q12-q13}, [r0,:128]!
2339 vst1.8 {q0-q1}, [r8]!
2342 vst1.8 {q8-q9}, [r8]!
2344 vst1.8 {q10-q11}, [r8]!
2346 vld1.64 {q8}, [r0,:128] @ next round tweak
2350 vst1.64 {q13}, [r0,:128] @ next round tweak
2353 #ifndef BSAES_ASM_EXTENDED_KEY
2354 add r4, sp, #0x90 @ pass key schedule
2356 add r4, r10, #248 @ pass key schedule
2359 mov r5, r1 @ pass rounds
2364 vld1.64 {q8-q9}, [r0,:128]!
2365 vld1.64 {q10-q11}, [r0,:128]!
2367 vld1.64 {q12}, [r0,:128]!
2370 vst1.8 {q0-q1}, [r8]!
2373 vst1.8 {q8-q9}, [r8]!
2376 vld1.64 {q8}, [r0,:128] @ next round tweak
2380 vst1.64 {q12}, [r0,:128] @ next round tweak
2383 #ifndef BSAES_ASM_EXTENDED_KEY
2384 add r4, sp, #0x90 @ pass key schedule
2386 add r4, r10, #248 @ pass key schedule
2389 mov r5, r1 @ pass rounds
2394 vld1.64 {q8-q9}, [r0,:128]!
2395 vld1.64 {q10-q11}, [r0,:128]!
2399 vst1.8 {q0-q1}, [r8]!
2401 vst1.8 {q8-q9}, [r8]!
2403 vld1.64 {q8}, [r0,:128] @ next round tweak
2407 vst1.64 {q11}, [r0,:128] @ next round tweak
2410 #ifndef BSAES_ASM_EXTENDED_KEY
2411 add r4, sp, #0x90 @ pass key schedule
2413 add r4, r10, #248 @ pass key schedule
2416 mov r5, r1 @ pass rounds
2421 vld1.64 {q8-q9}, [r0,:128]!
2422 vld1.64 {q10}, [r0,:128]!
2426 vst1.8 {q0-q1}, [r8]!
2429 vld1.64 {q8}, [r0,:128] @ next round tweak
2433 vst1.64 {q10}, [r0,:128] @ next round tweak
2436 #ifndef BSAES_ASM_EXTENDED_KEY
2437 add r4, sp, #0x90 @ pass key schedule
2439 add r4, r10, #248 @ pass key schedule
2442 mov r5, r1 @ pass rounds
2447 vld1.64 {q8-q9}, [r0,:128]!
2450 vst1.8 {q0-q1}, [r8]!
2452 vld1.64 {q8}, [r0,:128] @ next round tweak
2459 vst1.8 {q0}, [sp,:128]
2461 mov r4, r3 @ preserve fp
2462 mov r5, r2 @ preserve magic
2466 vld1.8 {q0}, [sp,:128]
2472 vmov q8, q9 @ next round tweak
2475 #ifndef XTS_CHAIN_TWEAK
2479 @ calculate one round of extra tweak for the stolen ciphertext
2481 vshr.s64 q6, q8, #63
2487 @ perform the final decryption with the last tweak value
2492 vst1.8 {q0}, [sp,:128]
2494 mov r4, r3 @ preserve fp
2498 vld1.8 {q0}, [sp,:128]
2506 strb r1, [r8, #0x10]
2516 vst1.8 {q0}, [sp,:128]
2521 vld1.8 {q0}, [sp,:128]
2531 #ifdef XTS_CHAIN_TWEAK
2532 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2534 .Lxts_dec_bzero: @ wipe key schedule [if any]
2540 #ifdef XTS_CHAIN_TWEAK
2544 ldmia sp!, {r4-r10, pc} @ return
2546 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
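@
@ When the input length is not a multiple of 16 bytes, bsaes_xts_decrypt
@ performs ciphertext stealing (the .Lxts_dec_done path above): the last
@ full ciphertext block is decrypted with the tweak of the following
@ partial block, the missing tail bytes are "stolen" from that
@ intermediate result, and the reassembled block is decrypted with the
@ previous tweak. Hedged C sketch; aes_xts_decrypt_block() is a
@ hypothetical helper meaning "XOR tweak, ECB-decrypt with key1, XOR
@ tweak again":
@
@   #include <stdint.h>
@   #include <string.h>
@
@   void aes_xts_decrypt_block(const void *key1, const uint64_t tweak[2],
@                              const uint8_t in[16], uint8_t out[16]);
@
@   static void xts_dec_steal(const uint8_t c_full[16], const uint8_t *c_tail,
@                             size_t r,          /* 1..15 tail bytes */
@                             uint8_t p_full[16], uint8_t *p_tail,
@                             const void *key1, const uint64_t tweak[2])
@   {
@       uint64_t next[2];
@       uint8_t pp[16], cc[16];
@
@       /* one extra tweak doubling for the stolen block */
@       next[1] = (tweak[1] << 1) | (tweak[0] >> 63);
@       next[0] = (tweak[0] << 1) ^
@                 ((uint64_t)((int64_t)tweak[1] >> 63) & 0x87);
@
@       aes_xts_decrypt_block(key1, next, c_full, pp);
@       memcpy(p_tail, pp, r);                /* plaintext tail */
@       memcpy(cc, c_tail, r);                /* steal the rest of pp */
@       memcpy(cc + r, pp + r, 16 - r);
@       aes_xts_decrypt_block(key1, tweak, cc, p_full);
@   }
@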