@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.
@ This code is ~4.5 (four and a half) times faster than code generated
@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
@ Xscale PXA250 core].
@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
@ Cortex A8 core and ~40 cycles per processed byte.
@ Profiler-assisted and platform-specific optimization resulted in 7%
@ improvement on Cortex A8 core and ~38 cycles per byte.
@ Add NEON implementation. On Cortex A8 it was measured to process
@ one byte in 23.3 cycles or ~60% faster than integer-only code.
@ Improve NEON performance by 12% on Snapdragon S4. In absolute
@ terms it's 22.6 cycles per byte, which is a disappointing result.
@ Technical writers asserted that the 3-way S4 pipeline can sustain
@ multiple NEON instructions per cycle, but dual NEON issue could
@ not be observed; see https://www.openssl.org/~appro/Snapdragon-S4.html
@ for further details. On a side note, Cortex-A15 processes one byte in
@ 16 cycles.
@ Byte order [in]dependence. =========================================
@ Originally the caller was expected to maintain a specific *dword* order in
@ h[0-7], namely with the most significant dword at the *lower* address, which
@ was reflected in the two parameters below as 0 and 4. Now the caller is
@ expected to maintain native byte order for whole 64-bit values.
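@
@ Illustrative sketch (not part of the generated code): with native byte
@ order the context is simply an array of 64-bit words, and which half of
@ each word sits at the lower address follows from the target endianness:
@
@	/* hypothetical C view of the hash state */
@	uint64_t h[8];	/* a,b,c,d,e,f,g,h in native byte order */
@	/* little-endian (__ARMEL__): low  32 bits at offset 8*i, high at 8*i+4 */
@	/* big-endian    (__ARMEB__): high 32 bits at offset 8*i, low  at 8*i+4 */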
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
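@ For illustration (not emitted by the macro itself): the first constant
@ below, K[0] = 0x428a2f98d728ae22, expands to
@	.word 0xd728ae22,0x428a2f98, ...	(little-endian variant)
@	.word 0x428a2f98,0xd728ae22, ...	(big-endian variant)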
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word OPENSSL_armcap_P-sha512_block_data_order
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
adr r3,.Lsha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
add r2,r1,r2,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub r14,r3,#672 @ K512
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
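@
@ Reference sketch (illustration only): a 64-bit rotate on the 32-bit
@ register pair lo:hi, with x = ((uint64_t)hi << 32) | lo, is
@
@	/* ROTR64(x,n) for 0 < n < 32 */
@	res_lo = (lo >> n) | (hi << (32 - n));
@	res_hi = (hi >> n) | (lo << (32 - n));
@	/* for n > 32 swap the roles of lo and hi and rotate by n - 32, */
@	/* e.g. ROTR64(x,41): res_lo = (hi >> 9) | (lo << 23);          */
@	/*                    res_hi = (lo >> 9) | (hi << 23);          */
@
@ which is exactly the LO/HI expansion listed above.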
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
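@
@ Reference sketch (illustration only): the plain shift term in the small
@ sigmas splits across the halves as
@
@	/* x >> s on the lo:hi pair, e.g. s = 6 for sigma1 */
@	res_lo = (lo >> s) | (hi << (32 - s));
@	res_hi =  hi >> s;	/* no bits flow into the high word */
@
@ which accounts for the lone hi>>7 / hi>>6 terms in the HI rows above.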
eor r10,r10,r11,lsl#13
eor r10,r10,r11,lsr#29
eor r10,r10,r12,lsl#3
eor r10,r10,r12,lsr#6
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
ittt eq @ Thumb2 thing, sanity check in ARM
ldreq r10,[sp,#184+4]
add sp,sp,#8*9 @ destroy frame
ldmia sp!,{r4-r12,pc}
ldmia sp!,{r4-r12,lr}
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
#if __ARM_MAX_ARCH__>=7
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add r2,r1,r2,lsl#7 @ len to point at the end of inp
adr r3,.Lsha512_block_data_order
sub r3,r3,.Lsha512_block_data_order-K512
vldmia r0,{d16-d23} @ load context
vshr.u64 d24,d20,#14 @ 0
vld1.64 {d0},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 0<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 1
vld1.64 {d1},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 1<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 2
vld1.64 {d2},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 2<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 3
vld1.64 {d3},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 3<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 4
vld1.64 {d4},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 4<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 5
vld1.64 {d5},[r1]! @ handles unaligned
vadd.i64 d19,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 5<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 6
vld1.64 {d6},[r1]! @ handles unaligned
vadd.i64 d18,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 6<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 7
vld1.64 {d7},[r1]! @ handles unaligned
vadd.i64 d17,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 7<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vshr.u64 d24,d20,#14 @ 8
vld1.64 {d8},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 8<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 9
vld1.64 {d9},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 9<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 10
vld1.64 {d10},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 10<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 11
vld1.64 {d11},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 11<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 12
vld1.64 {d12},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vshr.u64 d26,d16,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 12<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 13
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 13<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 14
vld1.64 {d14},[r1]! @ handles unaligned
vshr.u64 d25,d22,#18
vadd.i64 d18,d30 @ h+=Maj from the past
vshr.u64 d26,d22,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 14<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 15
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 15<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q0,q1,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q4,q5,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 16<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 17
vld1.64 {d1},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 17<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q1,q2,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q5,q6,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 18<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 19
vld1.64 {d3},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 19<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q2,q3,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q6,q7,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 20<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 21
vld1.64 {d5},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 21<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q3,q4,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q7,q0,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 22<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 23
vld1.64 {d7},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 23<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q4,q5,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q0,q1,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 24<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 25
vld1.64 {d9},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 25<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q5,q6,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q1,q2,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 26<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 27
vld1.64 {d11},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 27<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q6,q7,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q2,q3,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 28<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 29
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 29<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q7,q0,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q3,q4,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 30<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 31
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 31<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vldmia r0,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vstmia r0,{d16-d23} @ save context
sub r3,#640 @ rewind K512
bx lr @ .word 0xe12fff1e
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4