3 @ ====================================================================
4 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 @ project. The module is, however, dual licensed under OpenSSL and
6 @ CRYPTOGAMS licenses depending on where you obtain it. For further
7 @ details see http://www.openssl.org/~appro/cryptogams/.
9 @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
10 @ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is granted.
12 @ ====================================================================
14 @ Bit-sliced AES for ARM NEON
18 @ This implementation is a direct adaptation of the bsaes-x86_64 module for
19 @ ARM NEON, except that this module is endian-neutral [in the sense that
20 @ it can be compiled for either endianness] by courtesy of vld1.8's
21 @ neutrality. The initial version doesn't implement an interface to OpenSSL,
22 @ only low-level primitives and unsupported entry points, just enough
23 @ to collect performance results, which for the Cortex-A8 core are:
25 @ encrypt 19.5 cycles per byte processed with 128-bit key
26 @ decrypt 22.1 cycles per byte processed with 128-bit key
27 @ key conv. 440 cycles per 128-bit key/0.18 of 8x block
29 @ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
30 @ which is [much] worse than anticipated (for further details see
31 @ http://www.openssl.org/~appro/Snapdragon-S4.html).
33 @ Cortex-A15 manages 14.2/16.1 cycles [where the integer-only code
34 @ manages 20.0 cycles].
36 @ When comparing to x86_64 results, keep in mind that the NEON unit is
37 @ [mostly] single-issue and thus can't [fully] benefit from
38 @ instruction-level parallelism. And when comparing to aes-armv4
39 @ results, keep in mind the key schedule conversion overhead (see
40 @ bsaes-x86_64.pl for further details)...
46 @ Add CBC, CTR and XTS subroutines, adapt for kernel use.
48 @ <ard.biesheuvel@linaro.org>
52 # include <openssl/arm_arch.h>
54 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
55 # define VFP_ABI_POP vldmia sp!,{d8-d15}
56 # define VFP_ABI_FRAME 0x40
60 # define VFP_ABI_FRAME 0
61 # define BSAES_ASM_EXTENDED_KEY
62 # define XTS_CHAIN_TWEAK
63 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
64 # define __ARM_MAX_ARCH__ 7
71 #if __ARM_MAX_ARCH__>=7
76 .syntax unified @ ARMv7-capable assembler is expected to handle this
77 #if defined(__thumb2__) && !defined(__APPLE__)
83 .type _bsaes_decrypt8,%function
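@ _bsaes_decrypt8: decrypt the eight 16-byte blocks held in q0-q7 in
@ parallel. r4 points at the bit-sliced key schedule produced by
@ _bsaes_key_convert, r5 holds the number of rounds; r6 is clobbered
@ (used to address the constants below).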
86 adr r6,_bsaes_decrypt8
87 vldmia r4!, {q9} @ round 0 key
91 add r6,r6,#.LM0ISR-_bsaes_decrypt8
94 vldmia r6!, {q8} @ .LM0ISR
95 veor q10, q0, q9 @ xor with round0 key
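@ permute the bytes of each state through .LM0ISR (the bit-slicing
@ input byte order with InvShiftRows folded in)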
100 vtbl.8 d2, {q11}, d16
101 vtbl.8 d3, {q11}, d17
103 vtbl.8 d4, {q12}, d16
104 vtbl.8 d5, {q12}, d17
106 vtbl.8 d6, {q13}, d16
107 vtbl.8 d7, {q13}, d17
109 vtbl.8 d8, {q14}, d16
110 vtbl.8 d9, {q14}, d17
112 vtbl.8 d10, {q15}, d16
113 vtbl.8 d11, {q15}, d17
115 vtbl.8 d12, {q10}, d16
116 vtbl.8 d13, {q10}, d17
117 vtbl.8 d14, {q11}, d16
118 vtbl.8 d15, {q11}, d17
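@ bit-slice the eight states: the swapmove steps below (masks 0x55, 0x33,
@ 0x0f combined with shifts by 1, 2 and 4) transpose the data so that
@ q<i> ends up holding bit i of every byte of all eight blocks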
119 vmov.i8 q8,#0x55 @ compose .LBS0
120 vmov.i8 q9,#0x33 @ compose .LBS1
128 vshl.u64 q10, q10, #1
130 vshl.u64 q11, q11, #1
140 vshl.u64 q10, q10, #1
142 vshl.u64 q11, q11, #1
145 vmov.i8 q8,#0x0f @ compose .LBS2
153 vshl.u64 q10, q10, #2
155 vshl.u64 q11, q11, #2
165 vshl.u64 q10, q10, #2
167 vshl.u64 q11, q11, #2
177 vshl.u64 q10, q10, #4
179 vshl.u64 q11, q11, #4
189 vshl.u64 q10, q10, #4
191 vshl.u64 q11, q11, #4
198 vldmia r4!, {q8,q9,q10,q11}
209 vtbl.8 d4, {q10}, d24
210 vtbl.8 d5, {q10}, d25
212 vtbl.8 d6, {q11}, d24
213 vtbl.8 d7, {q11}, d25
220 vtbl.8 d10, {q9}, d24
221 vtbl.8 d11, {q9}, d25
223 vtbl.8 d12, {q10}, d24
224 vtbl.8 d13, {q10}, d25
225 vtbl.8 d14, {q11}, d24
226 vtbl.8 d15, {q11}, d25
279 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
281 @ new smaller inversion
288 veor q14, q8, q14 @ q14=q15
377 @ multiplication by 0x05-0x00-0x04-0x00
378 vext.8 q8, q0, q0, #8
379 vext.8 q14, q3, q3, #8
380 vext.8 q15, q5, q5, #8
382 vext.8 q9, q1, q1, #8
384 vext.8 q10, q6, q6, #8
386 vext.8 q11, q4, q4, #8
388 vext.8 q12, q2, q2, #8
390 vext.8 q13, q7, q7, #8
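@ MixColumns, expressed as byte rotations of each bit-slice; together
@ with the 0x05-0x00-0x04-0x00 multiplication above this implements
@ InvMixColumns (0x0e-0x0b-0x0d-0x09)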
409 vext.8 q8, q0, q0, #12 @ x0 <<< 32
410 vext.8 q9, q1, q1, #12
411 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
412 vext.8 q10, q6, q6, #12
414 vext.8 q11, q4, q4, #12
416 vext.8 q12, q2, q2, #12
418 vext.8 q13, q7, q7, #12
420 vext.8 q14, q3, q3, #12
422 vext.8 q15, q5, q5, #12
427 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
431 vext.8 q1, q1, q1, #8
436 vext.8 q8, q2, q2, #8
438 vext.8 q9, q7, q7, #8
440 vext.8 q2, q4, q4, #8
442 vext.8 q7, q5, q5, #8
444 vext.8 q4, q3, q3, #8
446 vext.8 q3, q6, q6, #8
455 vldmia r6, {q12} @ .LISR
456 ite eq @ Thumb2 thing, sanity check in ARM
459 vldmia r6, {q12} @ .LISRM0
463 vmov.i8 q8,#0x55 @ compose .LBS0
464 vmov.i8 q9,#0x33 @ compose .LBS1
472 vshl.u64 q10, q10, #1
474 vshl.u64 q11, q11, #1
484 vshl.u64 q10, q10, #1
486 vshl.u64 q11, q11, #1
489 vmov.i8 q8,#0x0f @ compose .LBS2
497 vshl.u64 q10, q10, #2
499 vshl.u64 q11, q11, #2
509 vshl.u64 q10, q10, #2
511 vshl.u64 q11, q11, #2
521 vshl.u64 q10, q10, #4
523 vshl.u64 q11, q11, #4
533 vshl.u64 q10, q10, #4
535 vshl.u64 q11, q11, #4
538 vldmia r4, {q8} @ last round key
548 .size _bsaes_decrypt8,.-_bsaes_decrypt8
550 .type _bsaes_const,%object
553 .LM0ISR: @ InvShiftRows constants
554 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
556 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
558 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
559 .LM0SR: @ ShiftRows constants
560 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
562 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
564 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
566 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
568 .quad 0x090d01050c000408, 0x03070b0f060a0e02
569 .byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
572 .size _bsaes_const,.-_bsaes_const
574 .type _bsaes_encrypt8,%function
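@ _bsaes_encrypt8: encrypt the eight blocks in q0-q7 in parallel, with
@ the same register convention as _bsaes_decrypt8 (r4 = bit-sliced key
@ schedule, r5 = rounds). The _bsaes_encrypt8_alt entry is used by the
@ CTR code, which pre-loads q9 (round-0 key) and q8 (input permutation)
@ itself.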
577 adr r6,_bsaes_encrypt8
578 vldmia r4!, {q9} @ round 0 key
582 sub r6,r6,#_bsaes_encrypt8-.LM0SR
585 vldmia r6!, {q8} @ .LM0SR
587 veor q10, q0, q9 @ xor with round0 key
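@ permute the bytes of each state through q8: .LM0SR (bit-slicing input
@ order with ShiftRows folded in) on the normal entry, .LREVM0SR when
@ entered via _bsaes_encrypt8_alt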
589 vtbl.8 d0, {q10}, d16
590 vtbl.8 d1, {q10}, d17
592 vtbl.8 d2, {q11}, d16
593 vtbl.8 d3, {q11}, d17
595 vtbl.8 d4, {q12}, d16
596 vtbl.8 d5, {q12}, d17
598 vtbl.8 d6, {q13}, d16
599 vtbl.8 d7, {q13}, d17
601 vtbl.8 d8, {q14}, d16
602 vtbl.8 d9, {q14}, d17
604 vtbl.8 d10, {q15}, d16
605 vtbl.8 d11, {q15}, d17
607 vtbl.8 d12, {q10}, d16
608 vtbl.8 d13, {q10}, d17
609 vtbl.8 d14, {q11}, d16
610 vtbl.8 d15, {q11}, d17
611 _bsaes_encrypt8_bitslice:
612 vmov.i8 q8,#0x55 @ compose .LBS0
613 vmov.i8 q9,#0x33 @ compose .LBS1
621 vshl.u64 q10, q10, #1
623 vshl.u64 q11, q11, #1
633 vshl.u64 q10, q10, #1
635 vshl.u64 q11, q11, #1
638 vmov.i8 q8,#0x0f @ compose .LBS2
646 vshl.u64 q10, q10, #2
648 vshl.u64 q11, q11, #2
658 vshl.u64 q10, q10, #2
660 vshl.u64 q11, q11, #2
670 vshl.u64 q10, q10, #4
672 vshl.u64 q11, q11, #4
682 vshl.u64 q10, q10, #4
684 vshl.u64 q11, q11, #4
691 vldmia r4!, {q8,q9,q10,q11}
702 vtbl.8 d4, {q10}, d24
703 vtbl.8 d5, {q10}, d25
705 vtbl.8 d6, {q11}, d24
706 vtbl.8 d7, {q11}, d25
713 vtbl.8 d10, {q9}, d24
714 vtbl.8 d11, {q9}, d25
716 vtbl.8 d12, {q10}, d24
717 vtbl.8 d13, {q10}, d25
718 vtbl.8 d14, {q11}, d24
719 vtbl.8 d15, {q11}, d25
773 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
775 @ new smaller inversion
782 veor q14, q8, q14 @ q14=q15
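@ MixColumns (encrypt direction): same rotate-and-xor scheme as in
@ _bsaes_decrypt8, but without the 0x05-0x00-0x04-0x00 pre-multiplication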
869 vext.8 q8, q0, q0, #12 @ x0 <<< 32
870 vext.8 q9, q1, q1, #12
871 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
872 vext.8 q10, q4, q4, #12
874 vext.8 q11, q6, q6, #12
876 vext.8 q12, q3, q3, #12
878 vext.8 q13, q7, q7, #12
880 vext.8 q14, q2, q2, #12
882 vext.8 q15, q5, q5, #12
887 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
891 vext.8 q1, q1, q1, #8
896 vext.8 q8, q3, q3, #8
898 vext.8 q9, q7, q7, #8
900 vext.8 q3, q6, q6, #8
902 vext.8 q7, q5, q5, #8
904 vext.8 q6, q2, q2, #8
906 vext.8 q2, q4, q4, #8
915 vldmia r6, {q12} @ .LSR
916 ite eq @ Thumb2 thing, sanity check in ARM
919 vldmia r6, {q12} @ .LSRM0
923 vmov.i8 q8,#0x55 @ compose .LBS0
924 vmov.i8 q9,#0x33 @ compose .LBS1
932 vshl.u64 q10, q10, #1
934 vshl.u64 q11, q11, #1
944 vshl.u64 q10, q10, #1
946 vshl.u64 q11, q11, #1
949 vmov.i8 q8,#0x0f @ compose .LBS2
957 vshl.u64 q10, q10, #2
959 vshl.u64 q11, q11, #2
969 vshl.u64 q10, q10, #2
971 vshl.u64 q11, q11, #2
981 vshl.u64 q10, q10, #4
983 vshl.u64 q11, q11, #4
993 vshl.u64 q10, q10, #4
995 vshl.u64 q11, q11, #4
998 vldmia r4, {q8} @ last round key
1008 .size _bsaes_encrypt8,.-_bsaes_encrypt8
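@ _bsaes_key_convert: convert the standard AES key schedule at r4
@ (r5 = rounds) into bit-sliced form at r12, 128 bytes per inner round
@ key. The S-box affine constant 0x63 is folded into the round keys
@ (.L63), which is why the callers "fix up" the round 0 and last round
@ keys afterwards.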
1009 .type _bsaes_key_convert,%function
1012 adr r6,_bsaes_key_convert
1013 vld1.8 {q7}, [r4]! @ load round 0 key
1017 sub r6,r6,#_bsaes_key_convert-.LM0
1019 vld1.8 {q15}, [r4]! @ load round 1 key
1021 vmov.i8 q8, #0x01 @ bit masks
1027 vldmia r6, {q14} @ .LM0
1034 vstmia r12!, {q7} @ save round 0 key
1039 vtbl.8 d14,{q15},d28
1040 vtbl.8 d15,{q15},d29
1052 vld1.8 {q15}, [r4]! @ load next round key
1053 vmvn q0, q0 @ "pnot"
1061 vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
1064 vmov.i8 q7,#0x63 @ compose .L63
1065 @ don't save last round key
1067 .size _bsaes_key_convert,.-_bsaes_key_convert
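@ Assumed C prototype, per the usual OpenSSL bsaes interface:
@ void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
@                        size_t length, const AES_KEY *key,
@                        unsigned char ivec[16], int enc);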
1071 .globl bsaes_cbc_encrypt
1072 .hidden bsaes_cbc_encrypt
1073 .type bsaes_cbc_encrypt,%function
1087 @ it is up to the caller to make sure we are called with enc == 0
1090 stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
1092 ldr r8, [ip] @ IV is 1st arg on the stack
1093 mov r2, r2, lsr#4 @ len in 16 byte blocks
1094 sub sp, #0x10 @ scratch space to carry over the IV
1095 mov r9, sp @ save sp
1097 ldr r10, [r3, #240] @ get # of rounds
1098 #ifndef BSAES_ASM_EXTENDED_KEY
1099 @ allocate the key schedule on the stack
1100 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1101 add r12, #96 @ size of bit-sliced key schedule
1103 @ populate the key schedule
1104 mov r4, r3 @ pass key
1105 mov r5, r10 @ pass # of rounds
1106 mov sp, r12 @ sp now points at the key schedule
1107 bl _bsaes_key_convert
1109 vstmia r12, {q15} @ save last round key
1110 veor q7, q7, q6 @ fix up round 0 key
1117 @ populate the key schedule
1119 mov r4, r3 @ pass key
1120 mov r5, r10 @ pass # of rounds
1121 add r12, r3, #248 @ pass key schedule
1122 bl _bsaes_key_convert
1125 vstmia r12, {q15} @ save last round key
1126 veor q7, q7, q6 @ fix up round 0 key
1133 vld1.8 {q15}, [r8] @ load IV
1139 bmi .Lcbc_dec_loop_finish
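@ main loop: decrypt eight blocks at a time with _bsaes_decrypt8, then
@ xor each result with the preceding ciphertext block (the IV for the
@ first block of the batch)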
1141 vld1.8 {q0,q1}, [r0]! @ load input
1142 vld1.8 {q2,q3}, [r0]!
1143 #ifndef BSAES_ASM_EXTENDED_KEY
1144 mov r4, sp @ pass the key
1148 vld1.8 {q4,q5}, [r0]!
1150 vld1.8 {q6,q7}, [r0]
1152 vstmia r9, {q15} @ put aside IV
1156 vldmia r9, {q14} @ reload IV
1157 vld1.8 {q8,q9}, [r0]! @ reload input
1158 veor q0, q0, q14 @ ^= IV
1159 vld1.8 {q10,q11}, [r0]!
1162 vld1.8 {q12,q13}, [r0]!
1165 vld1.8 {q14,q15}, [r0]!
1167 vst1.8 {q0,q1}, [r1]! @ write output
1179 .Lcbc_dec_loop_finish:
1183 vld1.8 {q0}, [r0]! @ load input
1187 #ifndef BSAES_ASM_EXTENDED_KEY
1188 mov r4, sp @ pass the key
1193 vstmia r9, {q15} @ put aside IV
1210 vldmia r9, {q14} @ reload IV
1211 vld1.8 {q8,q9}, [r0]! @ reload input
1212 veor q0, q0, q14 @ ^= IV
1213 vld1.8 {q10,q11}, [r0]!
1216 vld1.8 {q12,q13}, [r0]!
1221 vst1.8 {q0,q1}, [r1]! @ write output
1233 vldmia r9,{q14} @ reload IV
1234 vld1.8 {q8,q9}, [r0]! @ reload input
1235 veor q0, q0, q14 @ ^= IV
1236 vld1.8 {q10,q11}, [r0]!
1244 vst1.8 {q0,q1}, [r1]! @ write output
1254 vldmia r9, {q14} @ reload IV
1255 vld1.8 {q8,q9}, [r0]! @ reload input
1256 veor q0, q0, q14 @ ^= IV
1257 vld1.8 {q10,q11}, [r0]!
1262 vst1.8 {q0,q1}, [r1]! @ write output
1272 vldmia r9, {q14} @ reload IV
1273 vld1.8 {q8,q9}, [r0]! @ reload input
1274 veor q0, q0, q14 @ ^= IV
1280 vst1.8 {q0,q1}, [r1]! @ write output
1288 vldmia r9, {q14} @ reload IV
1289 vld1.8 {q8,q9}, [r0]! @ reload input
1290 veor q0, q0, q14 @ ^= IV
1294 vst1.8 {q0,q1}, [r1]! @ write output
1301 vldmia r9, {q14} @ reload IV
1302 vld1.8 {q8}, [r0]! @ reload input
1303 veor q0, q0, q14 @ ^= IV
1304 vld1.8 {q15}, [r0]! @ reload input
1306 vst1.8 {q0,q1}, [r1]! @ write output
1311 mov r10, r1 @ save original out pointer
1312 mov r1, r9 @ use the iv scratch space as out buffer
1314 vmov q4,q15 @ just in case ensure that IV
1315 vmov q5,q0 @ and input are preserved
1317 vld1.8 {q0}, [r9,:64] @ load result
1318 veor q0, q0, q4 @ ^= IV
1319 vmov q15, q5 @ q5 holds input
1320 vst1.8 {q0}, [r10] @ write output
1323 #ifndef BSAES_ASM_EXTENDED_KEY
1326 .Lcbc_dec_bzero: @ wipe key schedule [if any]
1333 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1334 vst1.8 {q15}, [r8] @ return IV
1336 ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
1337 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
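@ Assumed C prototype, per the usual OpenSSL bsaes interface:
@ void bsaes_ctr32_encrypt_blocks(const unsigned char *in,
@                                 unsigned char *out, size_t len,
@                                 const AES_KEY *key,
@                                 const unsigned char ivec[16]);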
1339 .globl bsaes_ctr32_encrypt_blocks
1340 .hidden bsaes_ctr32_encrypt_blocks
1341 .type bsaes_ctr32_encrypt_blocks,%function
1343 bsaes_ctr32_encrypt_blocks:
1344 cmp r2, #8 @ use plain AES for
1345 blo .Lctr_enc_short @ small sizes
1348 stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
1350 ldr r8, [ip] @ ctr is 1st arg on the stack
1351 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1352 mov r9, sp @ save sp
1354 ldr r10, [r3, #240] @ get # of rounds
1355 #ifndef BSAES_ASM_EXTENDED_KEY
1356 @ allocate the key schedule on the stack
1357 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1358 add r12, #96 @ size of bit-sliced key schedule
1360 @ populate the key schedule
1361 mov r4, r3 @ pass key
1362 mov r5, r10 @ pass # of rounds
1363 mov sp, r12 @ sp now points at the key schedule
1364 bl _bsaes_key_convert
1365 veor q7,q7,q15 @ fix up last round key
1366 vstmia r12, {q7} @ save last round key
1368 vld1.8 {q0}, [r8] @ load counter
1370 mov r8, #:lower16:(.LREVM0SR-.LM0)
1373 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1375 vldmia sp, {q4} @ load round0 key
1381 @ populate the key schedule
1383 mov r4, r3 @ pass key
1384 mov r5, r10 @ pass # of rounds
1385 add r12, r3, #248 @ pass key schedule
1386 bl _bsaes_key_convert
1387 veor q7,q7,q15 @ fix up last round key
1388 vstmia r12, {q7} @ save last round key
1392 vld1.8 {q0}, [r8] @ load counter
1393 adrl r8, .LREVM0SR @ borrow r8
1394 vldmia r12, {q4} @ load round0 key
1395 sub sp, #0x10 @ place for adjusted round0 key
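@ compose the next eight counter values (+1..+7 in the top 32-bit word,
@ hence the 1<<96, 2<<96, 3<<96 constants); the byte order of the
@ counter word is flipped back on the way into _bsaes_encrypt8_alt
@ via the .LREVM0SR input permutation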
1398 vmov.i32 q8,#1 @ compose 1<<96
1403 vadd.u32 q9,q8,q8 @ compose 2<<96
1404 vstmia sp, {q4} @ save adjusted round0 key
1409 vadd.u32 q10, q8, q9 @ compose 3<<96
1410 vadd.u32 q1, q0, q8 @ +1
1411 vadd.u32 q2, q0, q9 @ +2
1412 vadd.u32 q3, q0, q10 @ +3
1413 vadd.u32 q4, q1, q10
1414 vadd.u32 q5, q2, q10
1415 vadd.u32 q6, q3, q10
1416 vadd.u32 q7, q4, q10
1417 vadd.u32 q10, q5, q10 @ next counter
1419 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1420 @ to flip byte order in 32-bit counter
1422 vldmia sp, {q9} @ load round0 key
1423 #ifndef BSAES_ASM_EXTENDED_KEY
1424 add r4, sp, #0x10 @ pass next round key
1428 vldmia r8, {q8} @ .LREVM0SR
1429 mov r5, r10 @ pass rounds
1430 vstmia r9, {q10} @ save next counter
1432 mov r6, #:lower16:(.LREVM0SR-.LSR)
1435 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1438 bl _bsaes_encrypt8_alt
1441 blo .Lctr_enc_loop_done
1443 vld1.8 {q8,q9}, [r0]! @ load input
1444 vld1.8 {q10,q11}, [r0]!
1447 vld1.8 {q12,q13}, [r0]!
1450 vld1.8 {q14,q15}, [r0]!
1452 vst1.8 {q0,q1}, [r1]! @ write output
1458 vmov.i32 q8, #1 @ compose 1<<96
1462 vext.8 q8, q9, q8, #4
1464 vadd.u32 q9,q8,q8 @ compose 2<<96
1466 vldmia r9, {q0} @ load counter
1472 .Lctr_enc_loop_done:
1474 vld1.8 {q8}, [r0]! @ load input
1476 vst1.8 {q0}, [r1]! @ write output
1508 #ifndef BSAES_ASM_EXTENDED_KEY
1509 .Lctr_enc_bzero: @ wipe key schedule [if any]
1518 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1520 ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
1524 ldr ip, [sp] @ ctr pointer is passed on stack
1525 stmdb sp!, {r4,r5,r6,r7,r8, lr}
1527 mov r4, r0 @ copy arguments
1531 ldr r8, [ip, #12] @ load counter LSW (least significant word)
1532 vld1.8 {q1}, [ip] @ load whole counter value
1537 vst1.8 {q1}, [sp] @ copy counter value
1540 .Lctr_enc_short_loop:
1541 add r0, sp, #0x10 @ input counter value
1542 mov r1, sp @ output on the stack
1547 vld1.8 {q0}, [r4]! @ load input
1548 vld1.8 {q1}, [sp] @ load encrypted counter
1552 str r0, [sp, #0x1c] @ next counter value
1554 str r8, [sp, #0x1c] @ next counter value
1557 vst1.8 {q0}, [r5]! @ store output
1559 bne .Lctr_enc_short_loop
1565 ldmia sp!, {r4,r5,r6,r7,r8, pc}
1566 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
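@ Assumed C prototype, per the usual OpenSSL bsaes interface:
@ void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
@                        size_t len, const AES_KEY *key1,
@                        const AES_KEY *key2, const unsigned char iv[16]);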
1567 .globl bsaes_xts_encrypt
1568 .hidden bsaes_xts_encrypt
1569 .type bsaes_xts_encrypt,%function
1573 stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20
1575 mov r6, sp @ future r3 (frame pointer)
1582 sub r0, sp, #0x10 @ 0x10
1583 bic r0, #0xf @ align at 16 bytes
1586 #ifdef XTS_CHAIN_TWEAK
1587 ldr r0, [ip] @ pointer to input tweak
1589 @ generate initial tweak
1590 ldr r0, [ip, #4] @ iv[]
1592 ldr r2, [ip, #0] @ key2
1594 mov r0,sp @ pointer to initial tweak
1597 ldr r1, [r10, #240] @ get # of rounds
1599 #ifndef BSAES_ASM_EXTENDED_KEY
1600 @ allocate the key schedule on the stack
1601 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1602 @ add r12, #96 @ size of bit-sliced key schedule
1603 sub r12, #48 @ place for tweak[9]
1605 @ populate the key schedule
1606 mov r4, r10 @ pass key
1607 mov r5, r1 @ pass # of rounds
1609 add r12, #0x90 @ pass key schedule
1610 bl _bsaes_key_convert
1611 veor q7, q7, q15 @ fix up last round key
1612 vstmia r12, {q7} @ save last round key
1614 ldr r12, [r10, #244]
1618 str r12, [r10, #244]
1619 mov r4, r10 @ pass key
1620 mov r5, r1 @ pass # of rounds
1621 add r12, r10, #248 @ pass key schedule
1622 bl _bsaes_key_convert
1623 veor q7, q7, q15 @ fix up last round key
1627 sub sp, #0x90 @ place for tweak[9]
1630 vld1.8 {q8}, [r0] @ initial tweak
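@ precompute the tweaks for the next eight blocks: each step multiplies
@ the current tweak by x in GF(2^128), i.e. doubles both 64-bit halves
@ and folds the carried-out bits back in with the 0x87 reduction
@ constant ("XTS magic") pointed to by r2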
1639 vldmia r2, {q5} @ load XTS magic
1640 vshr.s64 q6, q8, #63
1644 vst1.64 {q8}, [r0,:128]!
1646 vshr.s64 q7, q9, #63
1649 vadd.u64 q10, q9, q9
1650 vst1.64 {q9}, [r0,:128]!
1652 vshr.s64 q6, q10, #63
1656 vadd.u64 q11, q10, q10
1657 vst1.64 {q10}, [r0,:128]!
1659 vshr.s64 q7, q11, #63
1664 vadd.u64 q12, q11, q11
1665 vst1.64 {q11}, [r0,:128]!
1667 vshr.s64 q6, q12, #63
1672 vadd.u64 q13, q12, q12
1673 vst1.64 {q12}, [r0,:128]!
1675 vshr.s64 q7, q13, #63
1680 vadd.u64 q14, q13, q13
1681 vst1.64 {q13}, [r0,:128]!
1683 vshr.s64 q6, q14, #63
1688 vadd.u64 q15, q14, q14
1689 vst1.64 {q14}, [r0,:128]!
1691 vshr.s64 q7, q15, #63
1696 vadd.u64 q8, q15, q15
1697 vst1.64 {q15}, [r0,:128]!
1700 vst1.64 {q8}, [r0,:128] @ next round tweak
1702 vld1.8 {q6,q7}, [r7]!
1704 #ifndef BSAES_ASM_EXTENDED_KEY
1705 add r4, sp, #0x90 @ pass key schedule
1707 add r4, r10, #248 @ pass key schedule
1710 mov r5, r1 @ pass rounds
1716 vld1.64 {q8,q9}, [r0,:128]!
1717 vld1.64 {q10,q11}, [r0,:128]!
1719 vld1.64 {q12,q13}, [r0,:128]!
1722 vst1.8 {q0,q1}, [r8]!
1724 vld1.64 {q14,q15}, [r0,:128]!
1726 vst1.8 {q8,q9}, [r8]!
1729 vst1.8 {q10,q11}, [r8]!
1731 vst1.8 {q12,q13}, [r8]!
1733 vld1.64 {q8}, [r0,:128] @ next round tweak
1742 vldmia r2, {q5} @ load XTS magic
1743 vshr.s64 q7, q8, #63
1747 vst1.64 {q8}, [r0,:128]!
1749 vshr.s64 q6, q9, #63
1752 vadd.u64 q10, q9, q9
1753 vst1.64 {q9}, [r0,:128]!
1755 vshr.s64 q7, q10, #63
1761 vadd.u64 q11, q10, q10
1762 vst1.64 {q10}, [r0,:128]!
1764 vshr.s64 q6, q11, #63
1771 vadd.u64 q12, q11, q11
1772 vst1.64 {q11}, [r0,:128]!
1774 vshr.s64 q7, q12, #63
1781 vadd.u64 q13, q12, q12
1782 vst1.64 {q12}, [r0,:128]!
1784 vshr.s64 q6, q13, #63
1791 vadd.u64 q14, q13, q13
1792 vst1.64 {q13}, [r0,:128]!
1794 vshr.s64 q7, q14, #63
1801 vadd.u64 q15, q14, q14
1802 vst1.64 {q14}, [r0,:128]!
1804 vshr.s64 q6, q15, #63
1812 vst1.64 {q15}, [r0,:128] @ next round tweak
1816 #ifndef BSAES_ASM_EXTENDED_KEY
1817 add r4, sp, #0x90 @ pass key schedule
1819 add r4, r10, #248 @ pass key schedule
1822 mov r5, r1 @ pass rounds
1827 vld1.64 {q8,q9}, [r0,:128]!
1828 vld1.64 {q10,q11}, [r0,:128]!
1830 vld1.64 {q12,q13}, [r0,:128]!
1833 vst1.8 {q0,q1}, [r8]!
1835 vld1.64 {q14}, [r0,:128]!
1837 vst1.8 {q8,q9}, [r8]!
1840 vst1.8 {q10,q11}, [r8]!
1843 vld1.64 {q8}, [r0,:128] @ next round tweak
1847 vst1.64 {q14}, [r0,:128] @ next round tweak
1850 #ifndef BSAES_ASM_EXTENDED_KEY
1851 add r4, sp, #0x90 @ pass key schedule
1853 add r4, r10, #248 @ pass key schedule
1856 mov r5, r1 @ pass rounds
1861 vld1.64 {q8,q9}, [r0,:128]!
1862 vld1.64 {q10,q11}, [r0,:128]!
1864 vld1.64 {q12,q13}, [r0,:128]!
1867 vst1.8 {q0,q1}, [r8]!
1870 vst1.8 {q8,q9}, [r8]!
1872 vst1.8 {q10,q11}, [r8]!
1874 vld1.64 {q8}, [r0,:128] @ next round tweak
1877 @ put this in range for both ARM and Thumb mode adr instructions
1884 vst1.64 {q13}, [r0,:128] @ next round tweak
1887 #ifndef BSAES_ASM_EXTENDED_KEY
1888 add r4, sp, #0x90 @ pass key schedule
1890 add r4, r10, #248 @ pass key schedule
1893 mov r5, r1 @ pass rounds
1898 vld1.64 {q8,q9}, [r0,:128]!
1899 vld1.64 {q10,q11}, [r0,:128]!
1901 vld1.64 {q12}, [r0,:128]!
1904 vst1.8 {q0,q1}, [r8]!
1907 vst1.8 {q8,q9}, [r8]!
1910 vld1.64 {q8}, [r0,:128] @ next round tweak
1914 vst1.64 {q12}, [r0,:128] @ next round tweak
1917 #ifndef BSAES_ASM_EXTENDED_KEY
1918 add r4, sp, #0x90 @ pass key schedule
1920 add r4, r10, #248 @ pass key schedule
1923 mov r5, r1 @ pass rounds
1928 vld1.64 {q8,q9}, [r0,:128]!
1929 vld1.64 {q10,q11}, [r0,:128]!
1933 vst1.8 {q0,q1}, [r8]!
1935 vst1.8 {q8,q9}, [r8]!
1937 vld1.64 {q8}, [r0,:128] @ next round tweak
1941 vst1.64 {q11}, [r0,:128] @ next round tweak
1944 #ifndef BSAES_ASM_EXTENDED_KEY
1945 add r4, sp, #0x90 @ pass key schedule
1947 add r4, r10, #248 @ pass key schedule
1950 mov r5, r1 @ pass rounds
1955 vld1.64 {q8,q9}, [r0,:128]!
1956 vld1.64 {q10}, [r0,:128]!
1960 vst1.8 {q0,q1}, [r8]!
1963 vld1.64 {q8}, [r0,:128] @ next round tweak
1967 vst1.64 {q10}, [r0,:128] @ next round tweak
1970 #ifndef BSAES_ASM_EXTENDED_KEY
1971 add r4, sp, #0x90 @ pass key schedule
1973 add r4, r10, #248 @ pass key schedule
1976 mov r5, r1 @ pass rounds
1981 vld1.64 {q8,q9}, [r0,:128]!
1984 vst1.8 {q0,q1}, [r8]!
1986 vld1.64 {q8}, [r0,:128] @ next round tweak
1993 vst1.8 {q0}, [sp,:128]
1995 mov r4, r3 @ preserve fp
1999 vld1.8 {q0}, [sp,:128]
2004 vmov q8, q9 @ next round tweak
2007 #ifndef XTS_CHAIN_TWEAK
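@ ciphertext stealing: swap the tail of the last full output block with
@ the remaining partial input bytes, then re-encrypt that block with the
@ final tweak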
2014 ldrb r1, [r8, #-0x10]
2015 strb r0, [r8, #-0x10]
2025 vst1.8 {q0}, [sp,:128]
2027 mov r4, r3 @ preserve fp
2031 vld1.8 {q0}, [sp,:128]
2041 #ifdef XTS_CHAIN_TWEAK
2042 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2044 .Lxts_enc_bzero: @ wipe key schedule [if any]
2050 #ifdef XTS_CHAIN_TWEAK
2054 ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
2056 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
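@ Assumed C prototype, per the usual OpenSSL bsaes interface:
@ void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
@                        size_t len, const AES_KEY *key1,
@                        const AES_KEY *key2, const unsigned char iv[16]);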
2058 .globl bsaes_xts_decrypt
2059 .hidden bsaes_xts_decrypt
2060 .type bsaes_xts_decrypt,%function
2064 stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20
2066 mov r6, sp @ future r3 (frame pointer)
2073 sub r0, sp, #0x10 @ 0x10
2074 bic r0, #0xf @ align at 16 bytes
2077 #ifdef XTS_CHAIN_TWEAK
2078 ldr r0, [ip] @ pointer to input tweak
2080 @ generate initial tweak
2081 ldr r0, [ip, #4] @ iv[]
2083 ldr r2, [ip, #0] @ key2
2085 mov r0, sp @ pointer to initial tweak
2088 ldr r1, [r10, #240] @ get # of rounds
2090 #ifndef BSAES_ASM_EXTENDED_KEY
2091 @ allocate the key schedule on the stack
2092 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2093 @ add r12, #96 @ size of bit-sliced key schedule
2094 sub r12, #48 @ place for tweak[9]
2096 @ populate the key schedule
2097 mov r4, r10 @ pass key
2098 mov r5, r1 @ pass # of rounds
2100 add r12, #0x90 @ pass key schedule
2101 bl _bsaes_key_convert
2104 vstmia r12, {q15} @ save last round key
2105 veor q7, q7, q6 @ fix up round 0 key
2108 ldr r12, [r10, #244]
2112 str r12, [r10, #244]
2113 mov r4, r10 @ pass key
2114 mov r5, r1 @ pass # of rounds
2115 add r12, r10, #248 @ pass key schedule
2116 bl _bsaes_key_convert
2119 vstmia r12, {q15} @ save last round key
2120 veor q7, q7, q6 @ fix up round 0 key
2124 sub sp, #0x90 @ place for tweak[9]
2126 vld1.8 {q8}, [r0] @ initial tweak
2129 #ifndef XTS_CHAIN_TWEAK
2130 tst r9, #0xf @ if not multiple of 16
2131 it ne @ Thumb2 thing, sanity check in ARM
2132 subne r9, #0x10 @ subtract another 16 bytes
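@ same tweak schedule as in bsaes_xts_encrypt: multiply the tweak by x
@ in GF(2^128) eight times, saving the intermediate tweaks on the stack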
2141 vldmia r2, {q5} @ load XTS magic
2142 vshr.s64 q6, q8, #63
2146 vst1.64 {q8}, [r0,:128]!
2148 vshr.s64 q7, q9, #63
2151 vadd.u64 q10, q9, q9
2152 vst1.64 {q9}, [r0,:128]!
2154 vshr.s64 q6, q10, #63
2158 vadd.u64 q11, q10, q10
2159 vst1.64 {q10}, [r0,:128]!
2161 vshr.s64 q7, q11, #63
2166 vadd.u64 q12, q11, q11
2167 vst1.64 {q11}, [r0,:128]!
2169 vshr.s64 q6, q12, #63
2174 vadd.u64 q13, q12, q12
2175 vst1.64 {q12}, [r0,:128]!
2177 vshr.s64 q7, q13, #63
2182 vadd.u64 q14, q13, q13
2183 vst1.64 {q13}, [r0,:128]!
2185 vshr.s64 q6, q14, #63
2190 vadd.u64 q15, q14, q14
2191 vst1.64 {q14}, [r0,:128]!
2193 vshr.s64 q7, q15, #63
2198 vadd.u64 q8, q15, q15
2199 vst1.64 {q15}, [r0,:128]!
2202 vst1.64 {q8}, [r0,:128] @ next round tweak
2204 vld1.8 {q6,q7}, [r7]!
2206 #ifndef BSAES_ASM_EXTENDED_KEY
2207 add r4, sp, #0x90 @ pass key schedule
2209 add r4, r10, #248 @ pass key schedule
2212 mov r5, r1 @ pass rounds
2218 vld1.64 {q8,q9}, [r0,:128]!
2219 vld1.64 {q10,q11}, [r0,:128]!
2221 vld1.64 {q12,q13}, [r0,:128]!
2224 vst1.8 {q0,q1}, [r8]!
2226 vld1.64 {q14,q15}, [r0,:128]!
2228 vst1.8 {q8,q9}, [r8]!
2231 vst1.8 {q10,q11}, [r8]!
2233 vst1.8 {q12,q13}, [r8]!
2235 vld1.64 {q8}, [r0,:128] @ next round tweak
2244 vldmia r2, {q5} @ load XTS magic
2245 vshr.s64 q7, q8, #63
2249 vst1.64 {q8}, [r0,:128]!
2251 vshr.s64 q6, q9, #63
2254 vadd.u64 q10, q9, q9
2255 vst1.64 {q9}, [r0,:128]!
2257 vshr.s64 q7, q10, #63
2263 vadd.u64 q11, q10, q10
2264 vst1.64 {q10}, [r0,:128]!
2266 vshr.s64 q6, q11, #63
2273 vadd.u64 q12, q11, q11
2274 vst1.64 {q11}, [r0,:128]!
2276 vshr.s64 q7, q12, #63
2283 vadd.u64 q13, q12, q12
2284 vst1.64 {q12}, [r0,:128]!
2286 vshr.s64 q6, q13, #63
2293 vadd.u64 q14, q13, q13
2294 vst1.64 {q13}, [r0,:128]!
2296 vshr.s64 q7, q14, #63
2303 vadd.u64 q15, q14, q14
2304 vst1.64 {q14}, [r0,:128]!
2306 vshr.s64 q6, q15, #63
2314 vst1.64 {q15}, [r0,:128] @ next round tweak
2318 #ifndef BSAES_ASM_EXTENDED_KEY
2319 add r4, sp, #0x90 @ pass key schedule
2321 add r4, r10, #248 @ pass key schedule
2324 mov r5, r1 @ pass rounds
2329 vld1.64 {q8,q9}, [r0,:128]!
2330 vld1.64 {q10,q11}, [r0,:128]!
2332 vld1.64 {q12,q13}, [r0,:128]!
2335 vst1.8 {q0,q1}, [r8]!
2337 vld1.64 {q14}, [r0,:128]!
2339 vst1.8 {q8,q9}, [r8]!
2342 vst1.8 {q10,q11}, [r8]!
2345 vld1.64 {q8}, [r0,:128] @ next round tweak
2349 vst1.64 {q14}, [r0,:128] @ next round tweak
2352 #ifndef BSAES_ASM_EXTENDED_KEY
2353 add r4, sp, #0x90 @ pass key schedule
2355 add r4, r10, #248 @ pass key schedule
2358 mov r5, r1 @ pass rounds
2363 vld1.64 {q8,q9}, [r0,:128]!
2364 vld1.64 {q10,q11}, [r0,:128]!
2366 vld1.64 {q12,q13}, [r0,:128]!
2369 vst1.8 {q0,q1}, [r8]!
2372 vst1.8 {q8,q9}, [r8]!
2374 vst1.8 {q10,q11}, [r8]!
2376 vld1.64 {q8}, [r0,:128] @ next round tweak
2380 vst1.64 {q13}, [r0,:128] @ next round tweak
2383 #ifndef BSAES_ASM_EXTENDED_KEY
2384 add r4, sp, #0x90 @ pass key schedule
2386 add r4, r10, #248 @ pass key schedule
2389 mov r5, r1 @ pass rounds
2394 vld1.64 {q8,q9}, [r0,:128]!
2395 vld1.64 {q10,q11}, [r0,:128]!
2397 vld1.64 {q12}, [r0,:128]!
2400 vst1.8 {q0,q1}, [r8]!
2403 vst1.8 {q8,q9}, [r8]!
2406 vld1.64 {q8}, [r0,:128] @ next round tweak
2410 vst1.64 {q12}, [r0,:128] @ next round tweak
2413 #ifndef BSAES_ASM_EXTENDED_KEY
2414 add r4, sp, #0x90 @ pass key schedule
2416 add r4, r10, #248 @ pass key schedule
2419 mov r5, r1 @ pass rounds
2424 vld1.64 {q8,q9}, [r0,:128]!
2425 vld1.64 {q10,q11}, [r0,:128]!
2429 vst1.8 {q0,q1}, [r8]!
2431 vst1.8 {q8,q9}, [r8]!
2433 vld1.64 {q8}, [r0,:128] @ next round tweak
2437 vst1.64 {q11}, [r0,:128] @ next round tweak
2440 #ifndef BSAES_ASM_EXTENDED_KEY
2441 add r4, sp, #0x90 @ pass key schedule
2443 add r4, r10, #248 @ pass key schedule
2446 mov r5, r1 @ pass rounds
2451 vld1.64 {q8,q9}, [r0,:128]!
2452 vld1.64 {q10}, [r0,:128]!
2456 vst1.8 {q0,q1}, [r8]!
2459 vld1.64 {q8}, [r0,:128] @ next round tweak
2463 vst1.64 {q10}, [r0,:128] @ next round tweak
2466 #ifndef BSAES_ASM_EXTENDED_KEY
2467 add r4, sp, #0x90 @ pass key schedule
2469 add r4, r10, #248 @ pass key schedule
2472 mov r5, r1 @ pass rounds
2477 vld1.64 {q8,q9}, [r0,:128]!
2480 vst1.8 {q0,q1}, [r8]!
2482 vld1.64 {q8}, [r0,:128] @ next round tweak
2489 vst1.8 {q0}, [sp,:128]
2491 mov r4, r3 @ preserve fp
2492 mov r5, r2 @ preserve magic
2496 vld1.8 {q0}, [sp,:128]
2502 vmov q8, q9 @ next round tweak
2505 #ifndef XTS_CHAIN_TWEAK
2509 @ calculate one round of extra tweak for the stolen ciphertext
2511 vshr.s64 q6, q8, #63
2517 @ perform the final decryption with the last tweak value
2522 vst1.8 {q0}, [sp,:128]
2524 mov r4, r3 @ preserve fp
2528 vld1.8 {q0}, [sp,:128]
2536 strb r1, [r8, #0x10]
2546 vst1.8 {q0}, [sp,:128]
2551 vld1.8 {q0}, [sp,:128]
2561 #ifdef XTS_CHAIN_TWEAK
2562 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2564 .Lxts_dec_bzero: @ wipe key schedule [if any]
2570 #ifdef XTS_CHAIN_TWEAK
2574 ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
2576 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt