@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
@ granted.
@ ====================================================================
@ Bit-sliced AES for ARM NEON
@
@ This implementation is a direct adaptation of the bsaes-x86_64 module
@ for ARM NEON, except that this module is endian-neutral [in the sense
@ that it can be compiled for either endianness] courtesy of vld1.8's
@ neutrality. The initial version doesn't implement an interface to
@ OpenSSL, only low-level primitives and unsupported entry points, just
@ enough to collect performance results, which for the Cortex-A8 core
@ are:
@
@ encrypt    19.5 cycles per byte processed with 128-bit key
@ decrypt    22.1 cycles per byte processed with 128-bit key
@ key conv.  440  cycles per 128-bit key/0.18 of 8x block
@
@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
@ which is [much] worse than anticipated (for further details see
@ http://www.openssl.org/~appro/Snapdragon-S4.html).
@
@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
@ manages in 20.0 cycles].
@
@ When comparing to x86_64 results keep in mind that the NEON unit is
@ [mostly] single-issue and thus can't [fully] benefit from
@ instruction-level parallelism. And when comparing to aes-armv4
@ results keep in mind the key schedule conversion overhead (see
@ bsaes-x86_64.pl for further details)...
@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
@
@ <ard.biesheuvel@linaro.org>
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define VFP_ABI_FRAME 0x40
#else
# define VFP_ABI_FRAME 0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ 7
#endif
.syntax unified @ ARMv7-capable assembler is expected to handle this
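
@ _bsaes_decrypt8 and _bsaes_encrypt8 below operate on eight AES blocks
@ at a time, held in q0-q7. The callers in this file pass the key
@ schedule in r4 and the round count in r5, and each bit-sliced round
@ key occupies 128 bytes (eight q registers), as the "128 bytes per
@ inner round key" stack allocations further down indicate. Instead of
@ table lookups, every round evaluates the AES S-box as a boolean
@ circuit across the eight registers, so all eight blocks share its
@ cost.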
.type _bsaes_decrypt8,%function
adr r6,_bsaes_decrypt8
vldmia r4!, {q9} @ round 0 key
add r6,r6,#.LM0ISR-_bsaes_decrypt8
vldmia r6!, {q8} @ .LM0ISR
veor q10, q0, q9 @ xor with round0 key
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
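@ The mask-and-shift sequence that follows is the bit-slicing
@ transpose: using the 0x55/0x33/0x0f constants (.LBS0/.LBS1/.LBS2)
@ with shifts of 1, 2 and 4, pairs of registers exchange single bits,
@ then 2-bit and then 4-bit groups, so that afterwards each of q0-q7
@ holds one bit position of the state for all eight blocks instead of
@ one whole block. The same sequence near the end of the routine
@ undoes the transform.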
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
veor q14, q8, q14 @ q14=q15
@ multiplication by 0x05-0x00-0x04-0x00
vext.8 q8, q0, q0, #8
vext.8 q14, q3, q3, #8
vext.8 q15, q5, q5, #8
vext.8 q9, q1, q1, #8
vext.8 q10, q6, q6, #8
vext.8 q11, q4, q4, #8
vext.8 q12, q2, q2, #8
vext.8 q13, q7, q7, #8
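@ In the bit-sliced domain a rotation of the AES state columns becomes
@ a rotation of each 128-bit slice, so (Inv)MixColumns needs no
@ per-byte GF(2^8) multiplications. For decryption, the block above
@ labelled "multiplication by 0x05-0x00-0x04-0x00" pre-multiplies the
@ state by that circulant so that the ordinary MixColumns sequence
@ below (the "x ^ (x <<< 32)" and "<<< 64" rotations) yields
@ InvMixColumns overall.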
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q6, q6, #12
vext.8 q11, q4, q4, #12
vext.8 q12, q2, q2, #12
vext.8 q13, q7, q7, #12
vext.8 q14, q3, q3, #12
vext.8 q15, q5, q5, #12
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
vext.8 q1, q1, q1, #8
vext.8 q8, q2, q2, #8
vext.8 q9, q7, q7, #8
vext.8 q2, q4, q4, #8
vext.8 q7, q5, q5, #8
vext.8 q4, q3, q3, #8
vext.8 q3, q6, q6, #8
vldmia r6, {q12} @ .LISR
ite eq @ Thumb2 thing, sanity check in ARM
vldmia r6, {q12} @ .LISRM0
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4, {q8} @ last round key
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
.LM0ISR: @ InvShiftRows constants
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR: @ ShiftRows constants
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
.size _bsaes_const,.-_bsaes_const
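
@ The .quad pairs in _bsaes_const are byte-index tables for vtbl.8
@ rather than ordinary data: .LM0ISR/.LM0SR fold the first
@ InvShiftRows/ShiftRows into the initial byte reordering of the
@ incoming blocks, while the related .LISR/.LSR and .LISRM0/.LSRM0
@ tables referenced from the round loops handle the per-round and
@ final permutations. The table is kept next to the code so the
@ adr/add address computations above can reach it from both routines.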
.type _bsaes_encrypt8,%function
adr r6,_bsaes_encrypt8
vldmia r4!, {q9} @ round 0 key
sub r6,r6,#_bsaes_encrypt8-.LM0SR
vldmia r6!, {q8} @ .LM0SR
veor q10, q0, q9 @ xor with round0 key
vtbl.8 d0, {q10}, d16
vtbl.8 d1, {q10}, d17
vtbl.8 d2, {q11}, d16
vtbl.8 d3, {q11}, d17
vtbl.8 d4, {q12}, d16
vtbl.8 d5, {q12}, d17
vtbl.8 d6, {q13}, d16
vtbl.8 d7, {q13}, d17
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
_bsaes_encrypt8_bitslice:
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
veor q14, q8, q14 @ q14=q15
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q4, q4, #12
vext.8 q11, q6, q6, #12
vext.8 q12, q3, q3, #12
vext.8 q13, q7, q7, #12
vext.8 q14, q2, q2, #12
vext.8 q15, q5, q5, #12
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
vext.8 q1, q1, q1, #8
vext.8 q8, q3, q3, #8
vext.8 q9, q7, q7, #8
vext.8 q3, q6, q6, #8
vext.8 q7, q5, q5, #8
vext.8 q6, q2, q2, #8
vext.8 q2, q4, q4, #8
vldmia r6, {q12} @ .LSR
ite eq @ Thumb2 thing, sanity check in ARM
vldmia r6, {q12} @ .LSRM0
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4, {q8} @ last round key
.size _bsaes_encrypt8,.-_bsaes_encrypt8
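
@ _bsaes_key_convert below rewrites a standard AES key schedule into
@ the bit-sliced layout the routines above expect: round 0 is stored
@ as-is, and every later round key is byte-permuted through .LM0 and
@ expanded against the 0x01/0x02/... bit masks into eight q registers,
@ i.e. 128 bytes per round; this is the key schedule conversion
@ overhead mentioned in the header. The routine returns with the raw
@ last round key in q15 and the 0x63 S-box constant in q7, and each
@ caller finishes the job: the "fix up round 0/last round key" and
@ "save last round key" sequences after every bl _bsaes_key_convert
@ fold the 0x63 constant into a neighbouring round key so the
@ bit-sliced S-box circuit can omit it.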
.type _bsaes_key_convert,%function
adr r6,_bsaes_key_convert
vld1.8 {q7}, [r4]! @ load round 0 key
sub r6,r6,#_bsaes_key_convert-.LM0
vld1.8 {q15}, [r4]! @ load round 1 key
vmov.i8 q8, #0x01 @ bit masks
vldmia r6, {q14} @ .LM0
vstmia r12!, {q7} @ save round 0 key
vtbl.8 d14,{q15},d28
vtbl.8 d15,{q15},d29
vld1.8 {q15}, [r4]! @ load next round key
vmvn q0, q0 @ "pnot"
vstmia r12!,{q0-q7} @ write bit-sliced round key
vmov.i8 q7,#0x63 @ compose .L63
@ don't save last round key
.size _bsaes_key_convert,.-_bsaes_key_convert
.extern AES_cbc_encrypt
.global bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
@ it is up to the caller to make sure we are called with enc == 0
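@
@ CBC decryption computes P[i] = D(K, C[i]) ^ C[i-1], with C[-1] = IV.
@ A minimal C sketch of what the eight-block loop below implements
@ (helper names are illustrative only, not taken from this file):
@
@   static void cbc_decrypt(const void *key, uint8_t *out,
@                           const uint8_t *in, size_t nblocks,
@                           uint8_t iv[16])
@   {
@       uint8_t prev[16], tmp[16];
@       memcpy(prev, iv, 16);
@       for (size_t i = 0; i < nblocks; i++) {
@           memcpy(tmp, in + 16 * i, 16);        /* keep C[i]        */
@           aes_decrypt_block(key, out + 16 * i, in + 16 * i);
@           for (int j = 0; j < 16; j++)
@               out[16 * i + j] ^= prev[j];      /* ^= C[i-1] or IV  */
@           memcpy(prev, tmp, 16);               /* becomes next IV  */
@       }
@       memcpy(iv, prev, 16);                    /* carry IV over    */
@   }
@
@ The code below does the same thing eight blocks at a time: it stashes
@ the running IV in the 16-byte scratch slot addressed by r9, decrypts
@ q0-q7 with _bsaes_decrypt8, then XORs each result with the preceding
@ ciphertext block reloaded from the input stream.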
stmdb sp!, {r4-r10, lr}
ldr r8, [ip] @ IV is 1st arg on the stack
mov r2, r2, lsr#4 @ len in 16 byte blocks
sub sp, #0x10 @ scratch space to carry over the IV
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp is sp
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vld1.8 {q15}, [r8] @ load IV
bmi .Lcbc_dec_loop_finish
vld1.8 {q0-q1}, [r0]! @ load input
vld1.8 {q2-q3}, [r0]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
vld1.8 {q4-q5}, [r0]!
vld1.8 {q6-q7}, [r0]
vstmia r9, {q15} @ put aside IV
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
vld1.8 {q12-q13}, [r0]!
vld1.8 {q14-q15}, [r0]!
vst1.8 {q0-q1}, [r1]! @ write output
.Lcbc_dec_loop_finish:
vld1.8 {q0}, [r0]! @ load input
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
vstmia r9, {q15} @ put aside IV
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
vld1.8 {q12-q13}, [r0]!
vst1.8 {q0-q1}, [r1]! @ write output
vldmia r9,{q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
vst1.8 {q0-q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10-q11}, [r0]!
vst1.8 {q0-q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vst1.8 {q0-q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8-q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vst1.8 {q0-q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q15}, [r0]! @ reload input
vst1.8 {q0-q1}, [r1]! @ write output
mov r10, r1 @ save original out pointer
mov r1, r9 @ use the iv scratch space as out buffer
vmov q4,q15 @ just in case ensure that IV
vmov q5,q0 @ and input are preserved
vld1.8 {q0}, [r9,:64] @ load result
veor q0, q0, q4 @ ^= IV
vmov q15, q5 @ q5 holds input
vst1.8 {q0}, [r10] @ write output
#ifndef BSAES_ASM_EXTENDED_KEY
.Lcbc_dec_bzero: @ wipe key schedule [if any]
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
vst1.8 {q15}, [r8] @ return IV
ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
bsaes_ctr32_encrypt_blocks:
cmp r2, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
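@
@ CTR mode encrypts a 16-byte counter block and XORs the result into
@ the data; only the last 32-bit word of the counter is treated as a
@ big-endian block counter (hence "ctr32").  Roughly, in C (helper
@ names are illustrative only):
@
@   for (size_t i = 0; i < nblocks; i++) {
@       uint8_t ks[16];
@       aes_encrypt_block(key, ks, ctr);             /* keystream    */
@       for (int j = 0; j < 16; j++)
@           out[16 * i + j] = in[16 * i + j] ^ ks[j];
@       put_be32(ctr + 12, get_be32(ctr + 12) + 1);  /* bump counter */
@   }
@
@ Requests of fewer than eight blocks branch to .Lctr_enc_short and use
@ the plain (non-bit-sliced) AES code one block at a time; the bulk
@ path below generates eight keystream blocks per iteration.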
stmdb sp!, {r4-r10, lr}
ldr r8, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp is sp
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
vld1.8 {q0}, [r8] @ load counter
add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
vldmia sp, {q4} @ load round0 key
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
0: add r12, r3, #248
vld1.8 {q0}, [r8] @ load counter
adrl r8, .LREVM0SR @ borrow r8
vldmia r12, {q4} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
vmov.i32 q8,#1 @ compose 1<<96
vadd.u32 q9,q8,q8 @ compose 2<<96
vstmia sp, {q4} @ save adjusted round0 key
vadd.u32 q10, q8, q9 @ compose 3<<96
vadd.u32 q1, q0, q8 @ +1
vadd.u32 q2, q0, q9 @ +2
vadd.u32 q3, q0, q10 @ +3
vadd.u32 q4, q1, q10
vadd.u32 q5, q2, q10
vadd.u32 q6, q3, q10
vadd.u32 q7, q4, q10
vadd.u32 q10, q5, q10 @ next counter
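@ After the additions above, q1-q7 hold counter+1 ... counter+7 and q10
@ holds counter+8, which is saved as the starting value for the next
@ pass.  The counter word is kept in a lane where a plain vadd.u32 can
@ bump it (the "1<<96" constant is non-zero only in that lane); the
@ byte order of the 32-bit counter is corrected as part of the input
@ permutation, via .LREVM0SR, inside _bsaes_encrypt8_alt, as the
@ comment below notes.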
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia sp, {q9} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x10 @ pass next round key
vldmia r8, {q8} @ .LREVM0SR
mov r5, r10 @ pass rounds
vstmia r9, {q10} @ save next counter
sub r6, r8, #.LREVM0SR-.LSR @ pass constants
bl _bsaes_encrypt8_alt
blo .Lctr_enc_loop_done
vld1.8 {q8-q9}, [r0]! @ load input
vld1.8 {q10-q11}, [r0]!
vld1.8 {q12-q13}, [r0]!
vld1.8 {q14-q15}, [r0]!
vst1.8 {q0-q1}, [r1]! @ write output
vmov.i32 q8, #1 @ compose 1<<96
vext.8 q8, q9, q8, #4
vadd.u32 q9,q8,q8 @ compose 2<<96
vldmia r9, {q0} @ load counter
.Lctr_enc_loop_done:
vld1.8 {q8}, [r0]! @ load input
vst1.8 {q0}, [r1]! @ write output
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
ldmia sp!, {r4-r10, pc} @ return
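
@ .Lctr_enc_short: runs of fewer than eight blocks are handled one
@ block at a time with the non-bit-sliced AES code.  The counter value
@ is kept at sp+0x10 and the encrypted counter (the keystream block) at
@ sp; each input block is XORed with that keystream and only the low
@ 32-bit counter word at sp+0x1c is incremented between blocks,
@ matching the bulk path above.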
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, r0 @ copy arguments
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {q1}, [ip] @ load whole counter value
vst1.8 {q1}, [sp,:64] @ copy counter value
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
vld1.8 {q0}, [r4]! @ load input
vld1.8 {q1}, [sp,:64] @ load encrypted counter
str r0, [sp, #0x1c] @ next counter value
str r8, [sp, #0x1c] @ next counter value
vst1.8 {q0}, [r5]! @ store output
bne .Lctr_enc_short_loop
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
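
@ XTS computes C[i] = E(K1, P[i] ^ T[i]) ^ T[i], where the initial
@ tweak is the encrypted sector IV, T[0] = E(K2, IV), and each
@ following tweak is the previous one multiplied by x in GF(2^128)
@ modulo x^128 + x^7 + x^2 + x + 1.  A C sketch of that tweak update,
@ which the vshr/vadd/veor sequences below implement in NEON
@ (illustrative code, not part of this file):
@
@   void xts_mul_x(uint8_t t[16])   /* t is little-endian */
@   {
@       int carry = t[15] >> 7;     /* bit shifted off the top */
@       for (int i = 15; i > 0; i--)
@           t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
@       t[0] = (uint8_t)((t[0] << 1) ^ (carry ? 0x87 : 0x00));
@   }
@
@ The routine precomputes eight consecutive tweaks per pass into the
@ tweak[9] stack area, XORs them into eight plaintext blocks, encrypts
@ the batch with _bsaes_encrypt8 and XORs the same tweaks into the
@ result.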
stmdb sp!, {r4-r10, lr} @ 0x20
mov r6, sp @ future r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
ldr r2, [ip, #0] @ key2
mov r0,sp @ pointer to initial tweak
ldr r1, [r10, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
ldr r12, [r10, #244]
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
0: sub sp, #0x90 @ place for tweak[9]
vld1.8 {q8}, [r0] @ initial tweak
vldmia r2, {q5} @ load XTS magic
vshr.s64 q6, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q7, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q6, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q7, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q6, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q7, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q6, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q7, q15, #63
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vst1.64 {q8}, [r0,:128] @ next round tweak
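@ Each tweak above is the previous one doubled in GF(2^128), as in the
@ xts_mul_x sketch: vadd.u64 shifts both 64-bit halves left by one, and
@ vshr.s64 #63 turns the bit falling off each half into an all-ones
@ mask which, combined with the "XTS magic" constant loaded from r2,
@ supplies the cross-half carry and the 0x87 reduction.  Eight tweaks
@ plus the following "next round tweak" are parked on the stack for the
@ block loop that follows.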
vld1.8 {q6-q7}, [r7]!
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q14-q15}, [r0,:128]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vst1.8 {q12-q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q6, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q7, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q6, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q7, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q6, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q7, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q6, q15, #63
vst1.64 {q15}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q14}, [r0,:128]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q14}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
@ put this in range for both ARM and Thumb mode adr instructions
vst1.64 {q13}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q12}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q11}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q10}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
vmov q8, q9 @ next round tweak
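@ Tail handling: a trailing partial block is produced by ciphertext
@ stealing.  The last full block is encrypted as usual, the ldrb/strb
@ loop below swaps the leftover plaintext bytes with the leading bytes
@ of that ciphertext block (at r8 - 0x10), and the rebuilt full block
@ is encrypted once more with the next tweak, so the output is exactly
@ as long as the input and no padding is written.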
#ifndef XTS_CHAIN_TWEAK
ldrb r1, [r8, #-0x10]
strb r0, [r8, #-0x10]
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
.Lxts_enc_bzero: @ wipe key schedule [if any]
#ifdef XTS_CHAIN_TWEAK
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
stmdb sp!, {r4-r10, lr} @ 0x20
mov r6, sp @ future r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
ldr r2, [ip, #0] @ key2
mov r0, sp @ pointer to initial tweak
ldr r1, [r10, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
ldr r12, [r10, #244]
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
0: sub sp, #0x90 @ place for tweak[9]
vld1.8 {q8}, [r0] @ initial tweak
tst r9, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne r9, #0x10 @ subtract another 16 bytes
vldmia r2, {q5} @ load XTS magic
vshr.s64 q6, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q7, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q6, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q7, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q6, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q7, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q6, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q7, q15, #63
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vst1.64 {q8}, [r0,:128] @ next round tweak
vld1.8 {q6-q7}, [r7]!
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q14-q15}, [r0,:128]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vst1.8 {q12-q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q6, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q7, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q6, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q7, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q6, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q7, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q6, q15, #63
vst1.64 {q15}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q14}, [r0,:128]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q14}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12-q13}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vst1.8 {q10-q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q13}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vld1.64 {q12}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q12}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10-q11}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vst1.8 {q8-q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q11}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q10}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8-q9}, [r0,:128]!
vst1.8 {q0-q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
mov r5, r2 @ preserve magic
vld1.8 {q0}, [sp,:128]
vmov q8, q9 @ next round tweak
#ifndef XTS_CHAIN_TWEAK
@ calculate one round of extra tweak for the stolen ciphertext
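@ For decryption the ciphertext-stealing tweaks are consumed out of
@ order: the last full ciphertext block has to be decrypted with the
@ tweak that comes after the one belonging to the partial block, so one
@ extra GF(2^128) doubling is done here before the two final
@ single-block decryptions.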
vshr.s64 q6, q8, #63
@ perform the final decryption with the last tweak value
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
strb r1, [r8, #0x10]
vst1.8 {q0}, [sp,:128]
vld1.8 {q0}, [sp,:128]
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
.Lxts_dec_bzero: @ wipe key schedule [if any]
#ifdef XTS_CHAIN_TWEAK
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt