@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.
@ This code is ~4.5 (four and a half) times faster than code generated
@ by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
@ Xscale PXA250 core].
@ Rescheduling for a dual-issue pipeline resulted in a 6% improvement on
@ the Cortex-A8 core and ~40 cycles per processed byte.
@ Profiler-assisted and platform-specific optimization resulted in a 7%
@ improvement on the Cortex-A8 core and ~38 cycles per byte.
@ Add NEON implementation. On the Cortex-A8 it was measured to process
@ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
@ Improve NEON performance by 12% on Snapdragon S4. In absolute
@ terms it's 22.6 cycles per byte, which is a disappointing result.
@ Technical writers asserted that the 3-way S4 pipeline can sustain
@ multiple NEON instructions per cycle, but dual NEON issue could
@ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
@ for further details. On a side note, the Cortex-A15 processes one byte in
@ Byte order [in]dependence. =========================================
@ Originally the caller was expected to maintain a specific *dword* order in
@ h[0-7], namely with the most significant dword at the *lower* address, which
@ was reflected in the two parameters below as 0 and 4. Now the caller is
@ expected to maintain native byte order for whole 64-bit values.
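@ For example, with native byte order the SHA-512 IV word h[0] =
@ 0x6a09e667f3bcc908 is stored as an ordinary 64-bit integer, i.e. on
@ little-endian ARM its low word 0xf3bcc908 sits at the lower address and
@ the high word 0x6a09e667 at the higher one.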
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
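@ For example, the lo-first (little-endian) variant of WORD64 above turns
@ WORD64(0x428a2f98,0xd728ae22, ...) into .word 0xd728ae22,0x428a2f98, ...,
@ so each 64-bit constant such as 0x428a2f98d728ae22 lands in memory in the
@ platform's native 64-bit layout.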
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word OPENSSL_armcap_P-sha512_block_data_order
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
adr r3,sha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
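@ .LOPENSSL_armcap holds the link-time offset of OPENSSL_armcap_P from
@ sha512_block_data_order, so adding it to r3 (which holds this function's
@ own address) recovers the address of the capability word at run time;
@ its flags are then used to decide whether the NEON path can be taken.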
add r2,r1,r2,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub r14,r3,#672 @ K512
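@ r2 arrives as the number of 128-byte SHA-512 blocks, so the "lsl#7" above
@ scales it to a byte count when forming the end-of-input pointer; r14 is
@ then pointed back at the K512 constant table emitted before this function.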
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
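@ On 32-bit ARM a 64-bit rotate is split across the register pair {hi,lo}.
@ As a sketch in the C-like notation above, for 0 < n < 32:
@   ROTR64(x,n).lo = (lo >> n) | (hi << (32-n))
@   ROTR64(x,n).hi = (hi >> n) | (lo << (32-n))
@ and for n >= 32 the halves swap roles with n-32 (e.g. 41 becomes 9),
@ which is how the 14/18/41 terms decompose into the LO/HI lines above.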
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
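@ The (x)>>n term of the small sigmas is a plain 64-bit logical shift, which
@ likewise splits across the halves; as a sketch:
@   SHR64(x,6).lo = (lo >> 6) | (hi << 26)
@   SHR64(x,6).hi =  hi >> 6
@ giving the third term of the sigma1 LO/HI lines (and similarly >>7 for sigma0).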
eor r10,r10,r11,lsl#13
eor r10,r10,r11,lsr#29
eor r10,r10,r12,lsl#3
eor r10,r10,r12,lsr#6
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
ittt eq @ Thumb2 thing, sanity check in ARM
ldreq r10,[sp,#184+4]
add sp,sp,#8*9 @ destroy frame
ldmia sp!,{r4-r12,pc}
ldmia sp!,{r4-r12,lr}
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
#if __ARM_MAX_ARCH__>=7
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add r2,r1,r2,lsl#7 @ len to point at the end of inp
vldmia r0,{d16-d23} @ load context
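@ After this load d16-d23 hold the eight working variables a..h; the
@ unrolled rounds below rotate that assignment by one register per round
@ instead of moving data, which is why successive rounds name different d
@ registers for e, f, g and so on.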
vshr.u64 d24,d20,#14 @ 0
vld1.64 {d0},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 0<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 1
vld1.64 {d1},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 1<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 2
vld1.64 {d2},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 2<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 3
vld1.64 {d3},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 3<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 4
vld1.64 {d4},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 4<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 5
vld1.64 {d5},[r1]! @ handles unaligned
vadd.i64 d19,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 5<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 6
vld1.64 {d6},[r1]! @ handles unaligned
vadd.i64 d18,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 6<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 7
vld1.64 {d7},[r1]! @ handles unaligned
vadd.i64 d17,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 7<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vshr.u64 d24,d20,#14 @ 8
vld1.64 {d8},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 8<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 9
vld1.64 {d9},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 9<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 10
vld1.64 {d10},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 10<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 11
vld1.64 {d11},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 11<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 12
vld1.64 {d12},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vshr.u64 d26,d16,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 12<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 13
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 13<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 14
vld1.64 {d14},[r1]! @ handles unaligned
vshr.u64 d25,d22,#18
vadd.i64 d18,d30 @ h+=Maj from the past
vshr.u64 d26,d22,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 14<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 15
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 15<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q0,q1,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q4,q5,#8 @ X[i+9]
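@ The vext/sigma arithmetic here implements the SHA-512 message schedule on
@ a 16-entry circular buffer, two 64-bit words (one q register) at a time;
@ in C-like form:
@   X[i&15] += sigma0(X[(i+1)&15]) + X[(i+9)&15] + sigma1(X[(i+14)&15])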
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 16<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 17
vld1.64 {d1},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 17<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q1,q2,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q5,q6,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 18<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 19
vld1.64 {d3},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 19<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q2,q3,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q6,q7,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 20<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 21
vld1.64 {d5},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 21<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q3,q4,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q7,q0,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 22<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 23
vld1.64 {d7},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 23<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q4,q5,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q0,q1,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 24<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 25
vld1.64 {d9},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 25<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q5,q6,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q1,q2,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 26<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 27
vld1.64 {d11},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 27<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q6,q7,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q2,q3,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 28<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 29
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 29<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q7,q0,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q3,q4,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 30<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 31
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 31<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vldmia r0,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vstmia r0,{d16-d23} @ save context
sub r3,#640 @ rewind K512
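@ The K512 table has 80 eight-byte entries, so after one pass the
@ post-incremented r3 has advanced by 640 bytes; subtracting 640 rewinds it
@ to the start of the table for the next block.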
bx lr @ .word 0xe12fff1e
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4