2 @ ====================================================================
3 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4 @ project. The module is, however, dual licensed under OpenSSL and
5 @ CRYPTOGAMS licenses depending on where you obtain it. For further
6 @ details see http://www.openssl.org/~appro/cryptogams/.
8 @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
9 @ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is granted.
11 @ ====================================================================
13 @ Bit-sliced AES for ARM NEON
17 @ This implementation is a direct adaptation of the bsaes-x86_64 module
18 @ for ARM NEON, except that this module is endian-neutral [in the sense
19 @ that it can be compiled for either endianness] courtesy of vld1.8's
20 @ neutrality. The initial version doesn't implement an interface to
21 @ OpenSSL, only low-level primitives and unsupported entry points, just
22 @ enough to collect performance results, which for the Cortex-A8 core are:
24 @ encrypt 19.5 cycles per byte processed with 128-bit key
25 @ decrypt 22.1 cycles per byte processed with 128-bit key
26 @ key conv. 440 cycles per 128-bit key/0.18 of 8x block
28 @ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
29 @ which is [much] worse than anticipated (for further details see
30 @ http://www.openssl.org/~appro/Snapdragon-S4.html).
32 @ Cortex-A15 manages 14.2/16.1 cycles [where the integer-only code
33 @ manages 20.0 cycles].
35 @ When comparing to x86_64 results, keep in mind that the NEON unit is
36 @ [mostly] single-issue and thus can't [fully] benefit from
37 @ instruction-level parallelism. And when comparing to aes-armv4
38 @ results, keep in mind the key schedule conversion overhead (see
39 @ bsaes-x86_64.pl for further details)...
45 @ Add CBC, CTR and XTS subroutines, adapt for kernel use.
47 @ <ard.biesheuvel@linaro.org>
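@
@ Illustrative C sketch (not part of the generated code) of the data layout
@ the routines below operate on: eight 16-byte blocks are loaded into q0-q7
@ and transposed so that register i holds bit i of every byte of all eight
@ blocks; SubBytes, ShiftRows and MixColumns then become plain boolean and
@ word operations on those eight registers.  Type names are made up for the
@ sketch:
@
@   #include <stdint.h>
@   typedef struct { uint64_t w[2]; } u128;      /* one NEON q register       */
@   typedef struct { u128 bit[8]; } bs_state;    /* bit-planes of 8 AES blocks */
@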
50 # include "arm_arch.h"
52 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
53 # define VFP_ABI_POP vldmia sp!,{d8-d15}
54 # define VFP_ABI_FRAME 0x40
58 # define VFP_ABI_FRAME 0
59 # define BSAES_ASM_EXTENDED_KEY
60 # define XTS_CHAIN_TWEAK
61 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
62 # define __ARM_MAX_ARCH__ 7
69 #if __ARM_MAX_ARCH__>=7
74 .syntax unified @ ARMv7-capable assembler is expected to handle this
81 .type _bsaes_decrypt8,%function
84 adr r6,_bsaes_decrypt8
85 vldmia r4!, {q9} @ round 0 key
86 add r6,r6,#.LM0ISR-_bsaes_decrypt8
88 vldmia r6!, {q8} @ .LM0ISR
89 veor q10, q0, q9 @ xor with round0 key
100 vtbl.8 d6, {q13}, d16
101 vtbl.8 d7, {q13}, d17
103 vtbl.8 d8, {q14}, d16
104 vtbl.8 d9, {q14}, d17
106 vtbl.8 d10, {q15}, d16
107 vtbl.8 d11, {q15}, d17
109 vtbl.8 d12, {q10}, d16
110 vtbl.8 d13, {q10}, d17
111 vtbl.8 d14, {q11}, d16
112 vtbl.8 d15, {q11}, d17
113 vmov.i8 q8,#0x55 @ compose .LBS0
114 vmov.i8 q9,#0x33 @ compose .LBS1
122 vshl.u64 q10, q10, #1
124 vshl.u64 q11, q11, #1
134 vshl.u64 q10, q10, #1
136 vshl.u64 q11, q11, #1
139 vmov.i8 q8,#0x0f @ compose .LBS2
147 vshl.u64 q10, q10, #2
149 vshl.u64 q11, q11, #2
159 vshl.u64 q10, q10, #2
161 vshl.u64 q11, q11, #2
171 vshl.u64 q10, q10, #4
173 vshl.u64 q11, q11, #4
183 vshl.u64 q10, q10, #4
185 vshl.u64 q11, q11, #4
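@ The mask/shift/xor pattern above is the classic "swapmove" step used to
@ transpose the eight blocks into bit-planes (and back again at the end of
@ the routine).  A C sketch of one step on 64-bit words, illustrative only;
@ the NEON code does the same on q registers with masks 0x55../0x33../0x0f..
@ and shift counts 1, 2 and 4:
@
@   #include <stdint.h>
@   /* exchange the bits selected by mask m between b and (a >> n) */
@   static inline void swapmove(uint64_t *a, uint64_t *b, unsigned n, uint64_t m)
@   {
@       uint64_t t = ((*a >> n) ^ *b) & m;
@       *b ^= t;
@       *a ^= t << n;
@   }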
203 vtbl.8 d4, {q10}, d24
204 vtbl.8 d5, {q10}, d25
206 vtbl.8 d6, {q11}, d24
207 vtbl.8 d7, {q11}, d25
214 vtbl.8 d10, {q9}, d24
215 vtbl.8 d11, {q9}, d25
217 vtbl.8 d12, {q10}, d24
218 vtbl.8 d13, {q10}, d25
219 vtbl.8 d14, {q11}, d24
220 vtbl.8 d15, {q11}, d25
273 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
275 @ new smaller inversion
282 veor q14, q8, q14 @ q14=q15
371 @ multiplication by 0x05-0x00-0x04-0x00
372 vext.8 q8, q0, q0, #8
373 vext.8 q14, q3, q3, #8
374 vext.8 q15, q5, q5, #8
376 vext.8 q9, q1, q1, #8
378 vext.8 q10, q6, q6, #8
380 vext.8 q11, q4, q4, #8
382 vext.8 q12, q2, q2, #8
384 vext.8 q13, q7, q7, #8
403 vext.8 q8, q0, q0, #12 @ x0 <<< 32
404 vext.8 q9, q1, q1, #12
405 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
406 vext.8 q10, q6, q6, #12
408 vext.8 q11, q4, q4, #12
410 vext.8 q12, q2, q2, #12
412 vext.8 q13, q7, q7, #12
414 vext.8 q14, q3, q3, #12
416 vext.8 q15, q5, q5, #12
421 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
425 vext.8 q1, q1, q1, #8
430 vext.8 q8, q2, q2, #8
432 vext.8 q9, q7, q7, #8
434 vext.8 q2, q4, q4, #8
436 vext.8 q7, q5, q5, #8
438 vext.8 q4, q3, q3, #8
440 vext.8 q3, q6, q6, #8
449 vldmia r6, {q12} @ .LISR
450 ite eq @ Thumb2 thing, sanity check in ARM
453 vldmia r6, {q12} @ .LISRM0
457 vmov.i8 q8,#0x55 @ compose .LBS0
458 vmov.i8 q9,#0x33 @ compose .LBS1
466 vshl.u64 q10, q10, #1
468 vshl.u64 q11, q11, #1
478 vshl.u64 q10, q10, #1
480 vshl.u64 q11, q11, #1
483 vmov.i8 q8,#0x0f @ compose .LBS2
491 vshl.u64 q10, q10, #2
493 vshl.u64 q11, q11, #2
503 vshl.u64 q10, q10, #2
505 vshl.u64 q11, q11, #2
515 vshl.u64 q10, q10, #4
517 vshl.u64 q11, q11, #4
527 vshl.u64 q10, q10, #4
529 vshl.u64 q11, q11, #4
532 vldmia r4, {q8} @ last round key
542 .size _bsaes_decrypt8,.-_bsaes_decrypt8
544 .type _bsaes_const,%object
547 .LM0ISR: @ InvShiftRows constants
548 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
550 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
552 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
553 .LM0SR: @ ShiftRows constants
554 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
556 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
558 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
560 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
562 .quad 0x090d01050c000408, 0x03070b0f060a0e02
563 .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
565 .size _bsaes_const,.-_bsaes_const
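@ The .quad values above are byte indices consumed by vtbl.8: each 16-byte
@ table encodes the permutation that applies ShiftRows (.LM0SR/.LSR) or
@ InvShiftRows (.LM0ISR/.LISR) to the packed state; .LREVM0SR, used by the
@ CTR path, additionally byte-reverses the 32-bit words.  A C sketch of such
@ a table-driven permutation, illustrative only:
@
@   #include <stdint.h>
@   static void permute16(uint8_t out[16], const uint8_t in[16],
@                         const uint8_t idx[16])
@   {
@       for (int i = 0; i < 16; i++)
@           out[i] = in[idx[i]];      /* what vtbl.8 does for each lane */
@   }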
567 .type _bsaes_encrypt8,%function
570 adr r6,_bsaes_encrypt8
571 vldmia r4!, {q9} @ round 0 key
572 sub r6,r6,#_bsaes_encrypt8-.LM0SR
574 vldmia r6!, {q8} @ .LM0SR
576 veor q10, q0, q9 @ xor with round0 key
578 vtbl.8 d0, {q10}, d16
579 vtbl.8 d1, {q10}, d17
581 vtbl.8 d2, {q11}, d16
582 vtbl.8 d3, {q11}, d17
584 vtbl.8 d4, {q12}, d16
585 vtbl.8 d5, {q12}, d17
587 vtbl.8 d6, {q13}, d16
588 vtbl.8 d7, {q13}, d17
590 vtbl.8 d8, {q14}, d16
591 vtbl.8 d9, {q14}, d17
593 vtbl.8 d10, {q15}, d16
594 vtbl.8 d11, {q15}, d17
596 vtbl.8 d12, {q10}, d16
597 vtbl.8 d13, {q10}, d17
598 vtbl.8 d14, {q11}, d16
599 vtbl.8 d15, {q11}, d17
600 _bsaes_encrypt8_bitslice:
601 vmov.i8 q8,#0x55 @ compose .LBS0
602 vmov.i8 q9,#0x33 @ compose .LBS1
610 vshl.u64 q10, q10, #1
612 vshl.u64 q11, q11, #1
622 vshl.u64 q10, q10, #1
624 vshl.u64 q11, q11, #1
627 vmov.i8 q8,#0x0f @ compose .LBS2
635 vshl.u64 q10, q10, #2
637 vshl.u64 q11, q11, #2
647 vshl.u64 q10, q10, #2
649 vshl.u64 q11, q11, #2
659 vshl.u64 q10, q10, #4
661 vshl.u64 q11, q11, #4
671 vshl.u64 q10, q10, #4
673 vshl.u64 q11, q11, #4
691 vtbl.8 d4, {q10}, d24
692 vtbl.8 d5, {q10}, d25
694 vtbl.8 d6, {q11}, d24
695 vtbl.8 d7, {q11}, d25
702 vtbl.8 d10, {q9}, d24
703 vtbl.8 d11, {q9}, d25
705 vtbl.8 d12, {q10}, d24
706 vtbl.8 d13, {q10}, d25
707 vtbl.8 d14, {q11}, d24
708 vtbl.8 d15, {q11}, d25
762 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
764 @ new smaller inversion
771 veor q14, q8, q14 @ q14=q15
858 vext.8 q8, q0, q0, #12 @ x0 <<< 32
859 vext.8 q9, q1, q1, #12
860 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
861 vext.8 q10, q4, q4, #12
863 vext.8 q11, q6, q6, #12
865 vext.8 q12, q3, q3, #12
867 vext.8 q13, q7, q7, #12
869 vext.8 q14, q2, q2, #12
871 vext.8 q15, q5, q5, #12
876 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
880 vext.8 q1, q1, q1, #8
885 vext.8 q8, q3, q3, #8
887 vext.8 q9, q7, q7, #8
889 vext.8 q3, q6, q6, #8
891 vext.8 q7, q5, q5, #8
893 vext.8 q6, q2, q2, #8
895 vext.8 q2, q4, q4, #8
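@ The vext/veor run above is the MixColumns step evaluated directly on the
@ bit-sliced planes: vext by #12 and #8 produce the "<<< 32" and "<<< 64"
@ rotations noted in the comments, and the xors combine them.  For reference,
@ the byte-wise MixColumns this corresponds to (illustrative C; the module
@ itself never leaves the bit-sliced form):
@
@   #include <stdint.h>
@   static uint8_t xtime(uint8_t x)              /* multiply by 2 in GF(2^8) */
@   {
@       return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b));
@   }
@   static void mixcolumn(uint8_t c[4])
@   {
@       uint8_t t = c[0] ^ c[1] ^ c[2] ^ c[3], c0 = c[0];
@       c[0] ^= t ^ xtime(c[0] ^ c[1]);
@       c[1] ^= t ^ xtime(c[1] ^ c[2]);
@       c[2] ^= t ^ xtime(c[2] ^ c[3]);
@       c[3] ^= t ^ xtime(c[3] ^ c0);
@   }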
904 vldmia r6, {q12} @ .LSR
905 ite eq @ Thumb2 thing, sanity check in ARM
908 vldmia r6, {q12} @ .LSRM0
912 vmov.i8 q8,#0x55 @ compose .LBS0
913 vmov.i8 q9,#0x33 @ compose .LBS1
921 vshl.u64 q10, q10, #1
923 vshl.u64 q11, q11, #1
933 vshl.u64 q10, q10, #1
935 vshl.u64 q11, q11, #1
938 vmov.i8 q8,#0x0f @ compose .LBS2
946 vshl.u64 q10, q10, #2
948 vshl.u64 q11, q11, #2
958 vshl.u64 q10, q10, #2
960 vshl.u64 q11, q11, #2
970 vshl.u64 q10, q10, #4
972 vshl.u64 q11, q11, #4
982 vshl.u64 q10, q10, #4
984 vshl.u64 q11, q11, #4
987 vldmia r4, {q8} @ last round key
997 .size _bsaes_encrypt8,.-_bsaes_encrypt8
998 .type _bsaes_key_convert,%function
1001 adr r6,_bsaes_key_convert
1002 vld1.8 {q7}, [r4]! @ load round 0 key
1003 sub r6,r6,#_bsaes_key_convert-.LM0
1004 vld1.8 {q15}, [r4]! @ load round 1 key
1006 vmov.i8 q8, #0x01 @ bit masks
1012 vldmia r6, {q14} @ .LM0
1019 vstmia r12!, {q7} @ save round 0 key
1024 vtbl.8 d14,{q15},d28
1025 vtbl.8 d15,{q15},d29
1037 vld1.8 {q15}, [r4]! @ load next round key
1038 vmvn q0, q0 @ "pnot"
1046 vstmia r12!,{q0-q7} @ write bit-sliced round key
1049 vmov.i8 q7,#0x63 @ compose .L63
1050 @ don't save last round key
1052 .size _bsaes_key_convert,.-_bsaes_key_convert
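@ Conceptually, the conversion above turns each 16-byte round key into eight
@ 16-byte masks, one per bit position, so that adding a round key to the
@ bit-sliced state costs eight veor instructions; this is also why the
@ callers below reserve 128 bytes per inner round key (r10, lsl#7).  A rough
@ C sketch of the expansion, illustrative only (the real code additionally
@ permutes the bytes through .LM0 and folds the S-box constant 0x63 into
@ selected planes):
@
@   #include <stdint.h>
@   static void bitslice_key(uint8_t planes[8][16], const uint8_t rk[16])
@   {
@       for (int bit = 0; bit < 8; bit++)
@           for (int i = 0; i < 16; i++)
@               planes[bit][i] = (rk[i] >> bit) & 1 ? 0xff : 0x00;
@   }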
1053 .extern AES_cbc_encrypt
1056 .global bsaes_cbc_encrypt
1057 .type bsaes_cbc_encrypt,%function
1071 @ it is up to the caller to make sure we are called with enc == 0
1074 stmdb sp!, {r4-r10, lr}
1076 ldr r8, [ip] @ IV is 1st arg on the stack
1077 mov r2, r2, lsr#4 @ len in 16 byte blocks
1078 sub sp, #0x10 @ scratch space to carry over the IV
1079 mov r9, sp @ save sp
1081 ldr r10, [r3, #240] @ get # of rounds
1082 #ifndef BSAES_ASM_EXTENDED_KEY
1083 @ allocate the key schedule on the stack
1084 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1085 add r12, #96 @ size of bit-sliced key schedule
1087 @ populate the key schedule
1088 mov r4, r3 @ pass key
1089 mov r5, r10 @ pass # of rounds
1090 mov sp, r12 @ sp now points at the key schedule
1091 bl _bsaes_key_convert
1093 vstmia r12, {q15} @ save last round key
1094 veor q7, q7, q6 @ fix up round 0 key
1101 @ populate the key schedule
1103 mov r4, r3 @ pass key
1104 mov r5, r10 @ pass # of rounds
1105 add r12, r3, #248 @ pass key schedule
1106 bl _bsaes_key_convert
1109 vstmia r12, {q15} @ save last round key
1110 veor q7, q7, q6 @ fix up round 0 key
1117 vld1.8 {q15}, [r8] @ load IV
1123 bmi .Lcbc_dec_loop_finish
1125 vld1.8 {q0-q1}, [r0]! @ load input
1126 vld1.8 {q2-q3}, [r0]!
1127 #ifndef BSAES_ASM_EXTENDED_KEY
1128 mov r4, sp @ pass the key
1132 vld1.8 {q4-q5}, [r0]!
1134 vld1.8 {q6-q7}, [r0]
1136 vstmia r9, {q15} @ put aside IV
1140 vldmia r9, {q14} @ reload IV
1141 vld1.8 {q8-q9}, [r0]! @ reload input
1142 veor q0, q0, q14 @ ^= IV
1143 vld1.8 {q10-q11}, [r0]!
1146 vld1.8 {q12-q13}, [r0]!
1149 vld1.8 {q14-q15}, [r0]!
1151 vst1.8 {q0-q1}, [r1]! @ write output
1163 .Lcbc_dec_loop_finish:
1167 vld1.8 {q0}, [r0]! @ load input
1171 #ifndef BSAES_ASM_EXTENDED_KEY
1172 mov r4, sp @ pass the key
1177 vstmia r9, {q15} @ put aside IV
1194 vldmia r9, {q14} @ reload IV
1195 vld1.8 {q8-q9}, [r0]! @ reload input
1196 veor q0, q0, q14 @ ^= IV
1197 vld1.8 {q10-q11}, [r0]!
1200 vld1.8 {q12-q13}, [r0]!
1205 vst1.8 {q0-q1}, [r1]! @ write output
1217 vldmia r9,{q14} @ reload IV
1218 vld1.8 {q8-q9}, [r0]! @ reload input
1219 veor q0, q0, q14 @ ^= IV
1220 vld1.8 {q10-q11}, [r0]!
1228 vst1.8 {q0-q1}, [r1]! @ write output
1238 vldmia r9, {q14} @ reload IV
1239 vld1.8 {q8-q9}, [r0]! @ reload input
1240 veor q0, q0, q14 @ ^= IV
1241 vld1.8 {q10-q11}, [r0]!
1246 vst1.8 {q0-q1}, [r1]! @ write output
1256 vldmia r9, {q14} @ reload IV
1257 vld1.8 {q8-q9}, [r0]! @ reload input
1258 veor q0, q0, q14 @ ^= IV
1264 vst1.8 {q0-q1}, [r1]! @ write output
1272 vldmia r9, {q14} @ reload IV
1273 vld1.8 {q8-q9}, [r0]! @ reload input
1274 veor q0, q0, q14 @ ^= IV
1278 vst1.8 {q0-q1}, [r1]! @ write output
1285 vldmia r9, {q14} @ reload IV
1286 vld1.8 {q8}, [r0]! @ reload input
1287 veor q0, q0, q14 @ ^= IV
1288 vld1.8 {q15}, [r0]! @ reload input
1290 vst1.8 {q0-q1}, [r1]! @ write output
1295 mov r10, r1 @ save original out pointer
1296 mov r1, r9 @ use the iv scratch space as out buffer
1298 vmov q4,q15 @ just in case ensure that IV
1299 vmov q5,q0 @ and input are preserved
1301 vld1.8 {q0}, [r9,:64] @ load result
1302 veor q0, q0, q4 @ ^= IV
1303 vmov q15, q5 @ q5 holds input
1304 vst1.8 {q0}, [r10] @ write output
1307 #ifndef BSAES_ASM_EXTENDED_KEY
1310 .Lcbc_dec_bzero: @ wipe key schedule [if any]
1317 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1318 vst1.8 {q15}, [r8] @ return IV
1320 ldmia sp!, {r4-r10, pc}
1321 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
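@ A C sketch of the CBC decryption bsaes_cbc_encrypt performs, illustrative
@ only: aes_decrypt_block is a hypothetical single-block helper standing in
@ for the eight-block bit-sliced core, and the chaining and IV write-back
@ mirror the assembly above:
@
@   #include <stdint.h>
@   #include <string.h>
@   void aes_decrypt_block(uint8_t *out, const uint8_t *in, const void *key);
@
@   static void cbc_decrypt(uint8_t *out, const uint8_t *in, size_t blocks,
@                           const void *key, uint8_t iv[16])
@   {
@       uint8_t prev[16], tmp[16];
@       memcpy(prev, iv, 16);
@       for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
@           memcpy(tmp, in, 16);               /* keep C[i] for chaining   */
@           aes_decrypt_block(out, in, key);
@           for (int j = 0; j < 16; j++)
@               out[j] ^= prev[j];             /* P[i] = D(C[i]) ^ C[i-1]  */
@           memcpy(prev, tmp, 16);
@       }
@       memcpy(iv, prev, 16);                  /* return last C[i] as IV   */
@   }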
1323 .global bsaes_ctr32_encrypt_blocks
1324 .type bsaes_ctr32_encrypt_blocks,%function
1326 bsaes_ctr32_encrypt_blocks:
1327 cmp r2, #8 @ use plain AES for
1328 blo .Lctr_enc_short @ small sizes
1331 stmdb sp!, {r4-r10, lr}
1333 ldr r8, [ip] @ ctr is 1st arg on the stack
1334 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1335 mov r9, sp @ save sp
1337 ldr r10, [r3, #240] @ get # of rounds
1338 #ifndef BSAES_ASM_EXTENDED_KEY
1339 @ allocate the key schedule on the stack
1340 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1341 add r12, #96 @ size of bit-sliced key schedule
1343 @ populate the key schedule
1344 mov r4, r3 @ pass key
1345 mov r5, r10 @ pass # of rounds
1346 mov sp, r12 @ sp now points at the key schedule
1347 bl _bsaes_key_convert
1348 veor q7,q7,q15 @ fix up last round key
1349 vstmia r12, {q7} @ save last round key
1351 vld1.8 {q0}, [r8] @ load counter
1352 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1353 vldmia sp, {q4} @ load round0 key
1359 @ populate the key schedule
1361 mov r4, r3 @ pass key
1362 mov r5, r10 @ pass # of rounds
1363 add r12, r3, #248 @ pass key schedule
1364 bl _bsaes_key_convert
1365 veor q7,q7,q15 @ fix up last round key
1366 vstmia r12, {q7} @ save last round key
1369 0: add r12, r3, #248
1370 vld1.8 {q0}, [r8] @ load counter
1371 adrl r8, .LREVM0SR @ borrow r8
1372 vldmia r12, {q4} @ load round0 key
1373 sub sp, #0x10 @ place for adjusted round0 key
1376 vmov.i32 q8,#1 @ compose 1<<96
1381 vadd.u32 q9,q8,q8 @ compose 2<<96
1382 vstmia sp, {q4} @ save adjusted round0 key
1387 vadd.u32 q10, q8, q9 @ compose 3<<96
1388 vadd.u32 q1, q0, q8 @ +1
1389 vadd.u32 q2, q0, q9 @ +2
1390 vadd.u32 q3, q0, q10 @ +3
1391 vadd.u32 q4, q1, q10
1392 vadd.u32 q5, q2, q10
1393 vadd.u32 q6, q3, q10
1394 vadd.u32 q7, q4, q10
1395 vadd.u32 q10, q5, q10 @ next counter
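@ At this point q1-q7 hold the counter plus 1..7 and q10 carries the counter
@ plus 8 into the next iteration; the byte order of the 32-bit counter word
@ is handled by the .LREVM0SR permutation in the borrowed prologue below.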
1397 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1398 @ to flip byte order in 32-bit counter
1400 vldmia sp, {q9} @ load round0 key
1401 #ifndef BSAES_ASM_EXTENDED_KEY
1402 add r4, sp, #0x10 @ pass next round key
1406 vldmia r8, {q8} @ .LREVM0SR
1407 mov r5, r10 @ pass rounds
1408 vstmia r9, {q10} @ save next counter
1409 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1411 bl _bsaes_encrypt8_alt
1414 blo .Lctr_enc_loop_done
1416 vld1.8 {q8-q9}, [r0]! @ load input
1417 vld1.8 {q10-q11}, [r0]!
1420 vld1.8 {q12-q13}, [r0]!
1423 vld1.8 {q14-q15}, [r0]!
1425 vst1.8 {q0-q1}, [r1]! @ write output
1431 vmov.i32 q8, #1 @ compose 1<<96
1435 vext.8 q8, q9, q8, #4
1437 vadd.u32 q9,q8,q8 @ compose 2<<96
1439 vldmia r9, {q0} @ load counter
1445 .Lctr_enc_loop_done:
1447 vld1.8 {q8}, [r0]! @ load input
1449 vst1.8 {q0}, [r1]! @ write output
1481 #ifndef BSAES_ASM_EXTENDED_KEY
1482 .Lctr_enc_bzero: @ wipe key schedule [if any]
1491 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1493 ldmia sp!, {r4-r10, pc} @ return
1497 ldr ip, [sp] @ ctr pointer is passed on stack
1498 stmdb sp!, {r4-r8, lr}
1500 mov r4, r0 @ copy arguments
1504 ldr r8, [ip, #12] @ load counter LSW
1505 vld1.8 {q1}, [ip] @ load whole counter value
1510 vst1.8 {q1}, [sp,:64] @ copy counter value
1513 .Lctr_enc_short_loop:
1514 add r0, sp, #0x10 @ input counter value
1515 mov r1, sp @ output on the stack
1520 vld1.8 {q0}, [r4]! @ load input
1521 vld1.8 {q1}, [sp,:64] @ load encrypted counter
1525 str r0, [sp, #0x1c] @ next counter value
1527 str r8, [sp, #0x1c] @ next counter value
1530 vst1.8 {q0}, [r5]! @ store output
1532 bne .Lctr_enc_short_loop
1538 ldmia sp!, {r4-r8, pc}
1539 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
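@ A C sketch of what bsaes_ctr32_encrypt_blocks computes, illustrative only:
@ aes_encrypt_block is a hypothetical single-block helper, and only the low
@ 32 bits of the counter are incremented (big-endian), as the ctr32 name
@ implies:
@
@   #include <stdint.h>
@   #include <string.h>
@   void aes_encrypt_block(uint8_t *out, const uint8_t *in, const void *key);
@
@   static void ctr32_encrypt(uint8_t *out, const uint8_t *in, size_t blocks,
@                             const void *key, uint8_t ctr[16])
@   {
@       uint8_t ks[16];
@       for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
@           aes_encrypt_block(ks, ctr, key);       /* keystream block */
@           for (int j = 0; j < 16; j++)
@               out[j] = in[j] ^ ks[j];
@           for (int j = 15; j >= 12 && ++ctr[j] == 0; j--)
@               ;                                  /* 32-bit big-endian increment */
@       }
@   }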
1540 .globl bsaes_xts_encrypt
1541 .type bsaes_xts_encrypt,%function
1545 stmdb sp!, {r4-r10, lr} @ 0x20
1547 mov r6, sp @ future r3
1554 sub r0, sp, #0x10 @ 0x10
1555 bic r0, #0xf @ align at 16 bytes
1558 #ifdef XTS_CHAIN_TWEAK
1559 ldr r0, [ip] @ pointer to input tweak
1561 @ generate initial tweak
1562 ldr r0, [ip, #4] @ iv[]
1564 ldr r2, [ip, #0] @ key2
1566 mov r0,sp @ pointer to initial tweak
1569 ldr r1, [r10, #240] @ get # of rounds
1571 #ifndef BSAES_ASM_EXTENDED_KEY
1572 @ allocate the key schedule on the stack
1573 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1574 @ add r12, #96 @ size of bit-sliced key schedule
1575 sub r12, #48 @ place for tweak[9]
1577 @ populate the key schedule
1578 mov r4, r10 @ pass key
1579 mov r5, r1 @ pass # of rounds
1581 add r12, #0x90 @ pass key schedule
1582 bl _bsaes_key_convert
1583 veor q7, q7, q15 @ fix up last round key
1584 vstmia r12, {q7} @ save last round key
1586 ldr r12, [r10, #244]
1590 str r12, [r10, #244]
1591 mov r4, r10 @ pass key
1592 mov r5, r1 @ pass # of rounds
1593 add r12, r10, #248 @ pass key schedule
1594 bl _bsaes_key_convert
1595 veor q7, q7, q15 @ fix up last round key
1599 0: sub sp, #0x90 @ place for tweak[9]
1602 vld1.8 {q8}, [r0] @ initial tweak
1611 vldmia r2, {q5} @ load XTS magic
1612 vshr.s64 q6, q8, #63
1616 vst1.64 {q8}, [r0,:128]!
1618 vshr.s64 q7, q9, #63
1621 vadd.u64 q10, q9, q9
1622 vst1.64 {q9}, [r0,:128]!
1624 vshr.s64 q6, q10, #63
1628 vadd.u64 q11, q10, q10
1629 vst1.64 {q10}, [r0,:128]!
1631 vshr.s64 q7, q11, #63
1636 vadd.u64 q12, q11, q11
1637 vst1.64 {q11}, [r0,:128]!
1639 vshr.s64 q6, q12, #63
1644 vadd.u64 q13, q12, q12
1645 vst1.64 {q12}, [r0,:128]!
1647 vshr.s64 q7, q13, #63
1652 vadd.u64 q14, q13, q13
1653 vst1.64 {q13}, [r0,:128]!
1655 vshr.s64 q6, q14, #63
1660 vadd.u64 q15, q14, q14
1661 vst1.64 {q14}, [r0,:128]!
1663 vshr.s64 q7, q15, #63
1668 vadd.u64 q8, q15, q15
1669 vst1.64 {q15}, [r0,:128]!
1672 vst1.64 {q8}, [r0,:128] @ next round tweak
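@ Each vshr.s64/vadd.u64/veor step above multiplies the running tweak by x in
@ GF(2^128), with the "XTS magic" constant supplying the 0x87 reduction; the
@ chain yields the eight per-block tweaks plus the tweak carried into the
@ next iteration.  A C sketch of one doubling, illustrative only:
@
@   #include <stdint.h>
@   static void xts_mul_x(uint8_t t[16])       /* tweak bytes, P1619 order */
@   {
@       uint8_t carry = 0;
@       for (int i = 0; i < 16; i++) {
@           uint8_t c = t[i] >> 7;
@           t[i] = (uint8_t)((t[i] << 1) | carry);
@           carry = c;
@       }
@       if (carry)
@           t[0] ^= 0x87;                      /* x^128 = x^7 + x^2 + x + 1 */
@   }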
1674 vld1.8 {q6-q7}, [r7]!
1676 #ifndef BSAES_ASM_EXTENDED_KEY
1677 add r4, sp, #0x90 @ pass key schedule
1679 add r4, r10, #248 @ pass key schedule
1682 mov r5, r1 @ pass rounds
1688 vld1.64 {q8-q9}, [r0,:128]!
1689 vld1.64 {q10-q11}, [r0,:128]!
1691 vld1.64 {q12-q13}, [r0,:128]!
1694 vst1.8 {q0-q1}, [r8]!
1696 vld1.64 {q14-q15}, [r0,:128]!
1698 vst1.8 {q8-q9}, [r8]!
1701 vst1.8 {q10-q11}, [r8]!
1703 vst1.8 {q12-q13}, [r8]!
1705 vld1.64 {q8}, [r0,:128] @ next round tweak
1714 vldmia r2, {q5} @ load XTS magic
1715 vshr.s64 q7, q8, #63
1719 vst1.64 {q8}, [r0,:128]!
1721 vshr.s64 q6, q9, #63
1724 vadd.u64 q10, q9, q9
1725 vst1.64 {q9}, [r0,:128]!
1727 vshr.s64 q7, q10, #63
1733 vadd.u64 q11, q10, q10
1734 vst1.64 {q10}, [r0,:128]!
1736 vshr.s64 q6, q11, #63
1743 vadd.u64 q12, q11, q11
1744 vst1.64 {q11}, [r0,:128]!
1746 vshr.s64 q7, q12, #63
1753 vadd.u64 q13, q12, q12
1754 vst1.64 {q12}, [r0,:128]!
1756 vshr.s64 q6, q13, #63
1763 vadd.u64 q14, q13, q13
1764 vst1.64 {q13}, [r0,:128]!
1766 vshr.s64 q7, q14, #63
1773 vadd.u64 q15, q14, q14
1774 vst1.64 {q14}, [r0,:128]!
1776 vshr.s64 q6, q15, #63
1784 vst1.64 {q15}, [r0,:128] @ next round tweak
1788 #ifndef BSAES_ASM_EXTENDED_KEY
1789 add r4, sp, #0x90 @ pass key schedule
1791 add r4, r10, #248 @ pass key schedule
1794 mov r5, r1 @ pass rounds
1799 vld1.64 {q8-q9}, [r0,:128]!
1800 vld1.64 {q10-q11}, [r0,:128]!
1802 vld1.64 {q12-q13}, [r0,:128]!
1805 vst1.8 {q0-q1}, [r8]!
1807 vld1.64 {q14}, [r0,:128]!
1809 vst1.8 {q8-q9}, [r8]!
1812 vst1.8 {q10-q11}, [r8]!
1815 vld1.64 {q8}, [r0,:128] @ next round tweak
1819 vst1.64 {q14}, [r0,:128] @ next round tweak
1822 #ifndef BSAES_ASM_EXTENDED_KEY
1823 add r4, sp, #0x90 @ pass key schedule
1825 add r4, r10, #248 @ pass key schedule
1828 mov r5, r1 @ pass rounds
1833 vld1.64 {q8-q9}, [r0,:128]!
1834 vld1.64 {q10-q11}, [r0,:128]!
1836 vld1.64 {q12-q13}, [r0,:128]!
1839 vst1.8 {q0-q1}, [r8]!
1842 vst1.8 {q8-q9}, [r8]!
1844 vst1.8 {q10-q11}, [r8]!
1846 vld1.64 {q8}, [r0,:128] @ next round tweak
1849 @ put this in range for both ARM and Thumb mode adr instructions
1856 vst1.64 {q13}, [r0,:128] @ next round tweak
1859 #ifndef BSAES_ASM_EXTENDED_KEY
1860 add r4, sp, #0x90 @ pass key schedule
1862 add r4, r10, #248 @ pass key schedule
1865 mov r5, r1 @ pass rounds
1870 vld1.64 {q8-q9}, [r0,:128]!
1871 vld1.64 {q10-q11}, [r0,:128]!
1873 vld1.64 {q12}, [r0,:128]!
1876 vst1.8 {q0-q1}, [r8]!
1879 vst1.8 {q8-q9}, [r8]!
1882 vld1.64 {q8}, [r0,:128] @ next round tweak
1886 vst1.64 {q12}, [r0,:128] @ next round tweak
1889 #ifndef BSAES_ASM_EXTENDED_KEY
1890 add r4, sp, #0x90 @ pass key schedule
1892 add r4, r10, #248 @ pass key schedule
1895 mov r5, r1 @ pass rounds
1900 vld1.64 {q8-q9}, [r0,:128]!
1901 vld1.64 {q10-q11}, [r0,:128]!
1905 vst1.8 {q0-q1}, [r8]!
1907 vst1.8 {q8-q9}, [r8]!
1909 vld1.64 {q8}, [r0,:128] @ next round tweak
1913 vst1.64 {q11}, [r0,:128] @ next round tweak
1916 #ifndef BSAES_ASM_EXTENDED_KEY
1917 add r4, sp, #0x90 @ pass key schedule
1919 add r4, r10, #248 @ pass key schedule
1922 mov r5, r1 @ pass rounds
1927 vld1.64 {q8-q9}, [r0,:128]!
1928 vld1.64 {q10}, [r0,:128]!
1932 vst1.8 {q0-q1}, [r8]!
1935 vld1.64 {q8}, [r0,:128] @ next round tweak
1939 vst1.64 {q10}, [r0,:128] @ next round tweak
1942 #ifndef BSAES_ASM_EXTENDED_KEY
1943 add r4, sp, #0x90 @ pass key schedule
1945 add r4, r10, #248 @ pass key schedule
1948 mov r5, r1 @ pass rounds
1953 vld1.64 {q8-q9}, [r0,:128]!
1956 vst1.8 {q0-q1}, [r8]!
1958 vld1.64 {q8}, [r0,:128] @ next round tweak
1965 vst1.8 {q0}, [sp,:128]
1967 mov r4, r3 @ preserve fp
1971 vld1.8 {q0}, [sp,:128]
1976 vmov q8, q9 @ next round tweak
1979 #ifndef XTS_CHAIN_TWEAK
1986 ldrb r1, [r8, #-0x10]
1987 strb r0, [r8, #-0x10]
1997 vst1.8 {q0}, [sp,:128]
1999 mov r4, r3 @ preserve fp
2003 vld1.8 {q0}, [sp,:128]
2013 #ifdef XTS_CHAIN_TWEAK
2014 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2016 .Lxts_enc_bzero: @ wipe key schedule [if any]
2022 #ifdef XTS_CHAIN_TWEAK
2026 ldmia sp!, {r4-r10, pc} @ return
2028 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
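@ A C sketch of the per-block XTS encryption flow implemented above,
@ illustrative only: aes_encrypt_block is a hypothetical single-block helper,
@ xts_mul_x is the tweak doubling sketched earlier, and the initial tweak is
@ the IV encrypted under the second key in the prologue:
@
@   #include <stddef.h>
@   #include <stdint.h>
@   void aes_encrypt_block(uint8_t *out, const uint8_t *in, const void *key);
@   void xts_mul_x(uint8_t t[16]);
@
@   static void xts_encrypt_blocks(uint8_t *out, const uint8_t *in,
@                                  size_t blocks, const void *key1,
@                                  uint8_t tweak[16])
@   {
@       uint8_t pp[16];
@       for (size_t i = 0; i < blocks; i++, in += 16, out += 16) {
@           for (int j = 0; j < 16; j++)
@               pp[j] = in[j] ^ tweak[j];      /* PP = P ^ T  */
@           aes_encrypt_block(out, pp, key1);  /* CC = E(PP)  */
@           for (int j = 0; j < 16; j++)
@               out[j] ^= tweak[j];            /* C  = CC ^ T */
@           xts_mul_x(tweak);                  /* next tweak  */
@       }
@   }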
2030 .globl bsaes_xts_decrypt
2031 .type bsaes_xts_decrypt,%function
2035 stmdb sp!, {r4-r10, lr} @ 0x20
2037 mov r6, sp @ future r3
2044 sub r0, sp, #0x10 @ 0x10
2045 bic r0, #0xf @ align at 16 bytes
2048 #ifdef XTS_CHAIN_TWEAK
2049 ldr r0, [ip] @ pointer to input tweak
2051 @ generate initial tweak
2052 ldr r0, [ip, #4] @ iv[]
2054 ldr r2, [ip, #0] @ key2
2056 mov r0, sp @ pointer to initial tweak
2059 ldr r1, [r10, #240] @ get # of rounds
2061 #ifndef BSAES_ASM_EXTENDED_KEY
2062 @ allocate the key schedule on the stack
2063 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2064 @ add r12, #96 @ size of bit-sliced key schedule
2065 sub r12, #48 @ place for tweak[9]
2067 @ populate the key schedule
2068 mov r4, r10 @ pass key
2069 mov r5, r1 @ pass # of rounds
2071 add r12, #0x90 @ pass key schedule
2072 bl _bsaes_key_convert
2075 vstmia r12, {q15} @ save last round key
2076 veor q7, q7, q6 @ fix up round 0 key
2079 ldr r12, [r10, #244]
2083 str r12, [r10, #244]
2084 mov r4, r10 @ pass key
2085 mov r5, r1 @ pass # of rounds
2086 add r12, r10, #248 @ pass key schedule
2087 bl _bsaes_key_convert
2090 vstmia r12, {q15} @ save last round key
2091 veor q7, q7, q6 @ fix up round 0 key
2095 0: sub sp, #0x90 @ place for tweak[9]
2097 vld1.8 {q8}, [r0] @ initial tweak
2100 #ifndef XTS_CHAIN_TWEAK
2101 tst r9, #0xf @ if not multiple of 16
2102 it ne @ Thumb2 thing, sanity check in ARM
2103 subne r9, #0x10 @ subtract another 16 bytes
2112 vldmia r2, {q5} @ load XTS magic
2113 vshr.s64 q6, q8, #63
2117 vst1.64 {q8}, [r0,:128]!
2119 vshr.s64 q7, q9, #63
2122 vadd.u64 q10, q9, q9
2123 vst1.64 {q9}, [r0,:128]!
2125 vshr.s64 q6, q10, #63
2129 vadd.u64 q11, q10, q10
2130 vst1.64 {q10}, [r0,:128]!
2132 vshr.s64 q7, q11, #63
2137 vadd.u64 q12, q11, q11
2138 vst1.64 {q11}, [r0,:128]!
2140 vshr.s64 q6, q12, #63
2145 vadd.u64 q13, q12, q12
2146 vst1.64 {q12}, [r0,:128]!
2148 vshr.s64 q7, q13, #63
2153 vadd.u64 q14, q13, q13
2154 vst1.64 {q13}, [r0,:128]!
2156 vshr.s64 q6, q14, #63
2161 vadd.u64 q15, q14, q14
2162 vst1.64 {q14}, [r0,:128]!
2164 vshr.s64 q7, q15, #63
2169 vadd.u64 q8, q15, q15
2170 vst1.64 {q15}, [r0,:128]!
2173 vst1.64 {q8}, [r0,:128] @ next round tweak
2175 vld1.8 {q6-q7}, [r7]!
2177 #ifndef BSAES_ASM_EXTENDED_KEY
2178 add r4, sp, #0x90 @ pass key schedule
2180 add r4, r10, #248 @ pass key schedule
2183 mov r5, r1 @ pass rounds
2189 vld1.64 {q8-q9}, [r0,:128]!
2190 vld1.64 {q10-q11}, [r0,:128]!
2192 vld1.64 {q12-q13}, [r0,:128]!
2195 vst1.8 {q0-q1}, [r8]!
2197 vld1.64 {q14-q15}, [r0,:128]!
2199 vst1.8 {q8-q9}, [r8]!
2202 vst1.8 {q10-q11}, [r8]!
2204 vst1.8 {q12-q13}, [r8]!
2206 vld1.64 {q8}, [r0,:128] @ next round tweak
2215 vldmia r2, {q5} @ load XTS magic
2216 vshr.s64 q7, q8, #63
2220 vst1.64 {q8}, [r0,:128]!
2222 vshr.s64 q6, q9, #63
2225 vadd.u64 q10, q9, q9
2226 vst1.64 {q9}, [r0,:128]!
2228 vshr.s64 q7, q10, #63
2234 vadd.u64 q11, q10, q10
2235 vst1.64 {q10}, [r0,:128]!
2237 vshr.s64 q6, q11, #63
2244 vadd.u64 q12, q11, q11
2245 vst1.64 {q11}, [r0,:128]!
2247 vshr.s64 q7, q12, #63
2254 vadd.u64 q13, q12, q12
2255 vst1.64 {q12}, [r0,:128]!
2257 vshr.s64 q6, q13, #63
2264 vadd.u64 q14, q13, q13
2265 vst1.64 {q13}, [r0,:128]!
2267 vshr.s64 q7, q14, #63
2274 vadd.u64 q15, q14, q14
2275 vst1.64 {q14}, [r0,:128]!
2277 vshr.s64 q6, q15, #63
2285 vst1.64 {q15}, [r0,:128] @ next round tweak
2289 #ifndef BSAES_ASM_EXTENDED_KEY
2290 add r4, sp, #0x90 @ pass key schedule
2292 add r4, r10, #248 @ pass key schedule
2295 mov r5, r1 @ pass rounds
2300 vld1.64 {q8-q9}, [r0,:128]!
2301 vld1.64 {q10-q11}, [r0,:128]!
2303 vld1.64 {q12-q13}, [r0,:128]!
2306 vst1.8 {q0-q1}, [r8]!
2308 vld1.64 {q14}, [r0,:128]!
2310 vst1.8 {q8-q9}, [r8]!
2313 vst1.8 {q10-q11}, [r8]!
2316 vld1.64 {q8}, [r0,:128] @ next round tweak
2320 vst1.64 {q14}, [r0,:128] @ next round tweak
2323 #ifndef BSAES_ASM_EXTENDED_KEY
2324 add r4, sp, #0x90 @ pass key schedule
2326 add r4, r10, #248 @ pass key schedule
2329 mov r5, r1 @ pass rounds
2334 vld1.64 {q8-q9}, [r0,:128]!
2335 vld1.64 {q10-q11}, [r0,:128]!
2337 vld1.64 {q12-q13}, [r0,:128]!
2340 vst1.8 {q0-q1}, [r8]!
2343 vst1.8 {q8-q9}, [r8]!
2345 vst1.8 {q10-q11}, [r8]!
2347 vld1.64 {q8}, [r0,:128] @ next round tweak
2351 vst1.64 {q13}, [r0,:128] @ next round tweak
2354 #ifndef BSAES_ASM_EXTENDED_KEY
2355 add r4, sp, #0x90 @ pass key schedule
2357 add r4, r10, #248 @ pass key schedule
2360 mov r5, r1 @ pass rounds
2365 vld1.64 {q8-q9}, [r0,:128]!
2366 vld1.64 {q10-q11}, [r0,:128]!
2368 vld1.64 {q12}, [r0,:128]!
2371 vst1.8 {q0-q1}, [r8]!
2374 vst1.8 {q8-q9}, [r8]!
2377 vld1.64 {q8}, [r0,:128] @ next round tweak
2381 vst1.64 {q12}, [r0,:128] @ next round tweak
2384 #ifndef BSAES_ASM_EXTENDED_KEY
2385 add r4, sp, #0x90 @ pass key schedule
2387 add r4, r10, #248 @ pass key schedule
2390 mov r5, r1 @ pass rounds
2395 vld1.64 {q8-q9}, [r0,:128]!
2396 vld1.64 {q10-q11}, [r0,:128]!
2400 vst1.8 {q0-q1}, [r8]!
2402 vst1.8 {q8-q9}, [r8]!
2404 vld1.64 {q8}, [r0,:128] @ next round tweak
2408 vst1.64 {q11}, [r0,:128] @ next round tweak
2411 #ifndef BSAES_ASM_EXTENDED_KEY
2412 add r4, sp, #0x90 @ pass key schedule
2414 add r4, r10, #248 @ pass key schedule
2417 mov r5, r1 @ pass rounds
2422 vld1.64 {q8-q9}, [r0,:128]!
2423 vld1.64 {q10}, [r0,:128]!
2427 vst1.8 {q0-q1}, [r8]!
2430 vld1.64 {q8}, [r0,:128] @ next round tweak
2434 vst1.64 {q10}, [r0,:128] @ next round tweak
2437 #ifndef BSAES_ASM_EXTENDED_KEY
2438 add r4, sp, #0x90 @ pass key schedule
2440 add r4, r10, #248 @ pass key schedule
2443 mov r5, r1 @ pass rounds
2448 vld1.64 {q8-q9}, [r0,:128]!
2451 vst1.8 {q0-q1}, [r8]!
2453 vld1.64 {q8}, [r0,:128] @ next round tweak
2460 vst1.8 {q0}, [sp,:128]
2462 mov r4, r3 @ preserve fp
2463 mov r5, r2 @ preserve magic
2467 vld1.8 {q0}, [sp,:128]
2473 vmov q8, q9 @ next round tweak
2476 #ifndef XTS_CHAIN_TWEAK
2480 @ calculate one round of extra tweak for the stolen ciphertext
2482 vshr.s64 q6, q8, #63
2488 @ perform the final decryption with the last tweak value
2493 vst1.8 {q0}, [sp,:128]
2495 mov r4, r3 @ preserve fp
2499 vld1.8 {q0}, [sp,:128]
2507 strb r1, [r8, #0x10]
2517 vst1.8 {q0}, [sp,:128]
2522 vld1.8 {q0}, [sp,:128]
2532 #ifdef XTS_CHAIN_TWEAK
2533 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2535 .Lxts_dec_bzero: @ wipe key schedule [if any]
2541 #ifdef XTS_CHAIN_TWEAK
2545 ldmia sp!, {r4-r10, pc} @ return
2547 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
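@ A C sketch of the ciphertext-stealing tail handled above when the length is
@ not a multiple of 16: decrypting the final partial block needs one tweak
@ beyond the last full-block tweak (the "extra tweak" computed above), and
@ the last full decryption then reuses the earlier tweak.  Illustrative only;
@ aes_decrypt_block and xts_mul_x are hypothetical helpers and r is the tail
@ length, 1..15:
@
@   #include <stdint.h>
@   #include <string.h>
@   void aes_decrypt_block(uint8_t *out, const uint8_t *in, const void *key);
@   void xts_mul_x(uint8_t t[16]);
@
@   /* in: last full block C[m-1] followed by the r stolen bytes C[m] */
@   static void xts_dec_steal(uint8_t *out, const uint8_t *in, size_t r,
@                             const void *key1, const uint8_t tweak[16])
@   {
@       uint8_t t1[16], buf[16], pp[16], cc[16];
@       memcpy(t1, tweak, 16);
@       xts_mul_x(t1);                          /* extra tweak T[m]       */
@       for (int j = 0; j < 16; j++)
@           buf[j] = in[j] ^ t1[j];
@       aes_decrypt_block(pp, buf, key1);
@       for (int j = 0; j < 16; j++)
@           pp[j] ^= t1[j];                     /* = P[m] || tail of CC   */
@       memcpy(out + 16, pp, r);                /* final partial P[m]     */
@       memcpy(cc, in + 16, r);                 /* stolen ciphertext      */
@       memcpy(cc + r, pp + r, 16 - r);         /* rebuild CC             */
@       for (int j = 0; j < 16; j++)
@           buf[j] = cc[j] ^ tweak[j];
@       aes_decrypt_block(out, buf, key1);
@       for (int j = 0; j < 16; j++)
@           out[j] ^= tweak[j];                 /* P[m-1] under T[m-1]    */
@   }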