@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.
@ This code is ~4.5 (four and a half) times faster than code generated
@ by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
@ Xscale PXA250 core].
@ Rescheduling for the dual-issue pipeline resulted in a 6% improvement on
@ the Cortex A8 core and ~40 cycles per processed byte.
@ Profiler-assisted and platform-specific optimization resulted in a 7%
@ improvement on the Cortex A8 core and ~38 cycles per byte.
@ Add NEON implementation. On Cortex A8 it was measured to process
@ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
@ Improve NEON performance by 12% on Snapdragon S4. In absolute
@ terms it's 22.6 cycles per byte, which is a disappointing result.
@ Technical writers asserted that the 3-way S4 pipeline can sustain
@ multiple NEON instructions per cycle, but dual NEON issue could
@ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
@ for further details. On a side note, Cortex-A15 processes one byte in
@ Byte order [in]dependence. =========================================
@ Originally the caller was expected to maintain a specific *dword* order
@ in h[0-7], namely with the most significant dword at the *lower* address,
@ which was reflected in the two parameters below as 0 and 4. Now the
@ caller is expected to maintain native byte order for whole 64-bit values.
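@ A minimal C-level sketch of that contract (illustrative comment only, not
@ assembled; the exact prototype is an assumption, not taken from this file):
@
@   uint64_t h[8];                        /* native byte order, whole 64-bit words */
@   sha512_block_data_order(h, inp, num); /* r0=h, r1=input, r2=number of 128-byte blocks */
@
@ i.e. each h[i] is read and written as one 64-bit value; the caller no
@ longer swaps dwords.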
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#if __ARM_ARCH__<7 || defined(__APPLE__)
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word OPENSSL_armcap_P-.Lsha512_block_data_order
.globl sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
adr r3,sha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
add r2,r1,r2,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
sub r14,r3,#672 @ K512
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
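@ For reference, a hedged C sketch (comment only, not assembled) of how a
@ 64-bit rotate splits over the two 32-bit halves used above:
@
@   /* for 0 < n < 32:  ROTR64(x,n) gives  lo' = lo>>n ^ hi<<(32-n),
@                                          hi' = hi>>n ^ lo<<(32-n);
@      for n >= 32 the halves swap roles and n becomes n-32, e.g. 41 -> 9 */
@   uint32_t s1_lo = (lo>>14 ^ hi<<18) ^ (lo>>18 ^ hi<<14) ^ (hi>>9 ^ lo<<23);
@   uint32_t s1_hi = (hi>>14 ^ lo<<18) ^ (hi>>18 ^ lo<<14) ^ (lo>>9 ^ hi<<23);
@
@ which matches the LO/HI lines above; the integer code below builds each
@ half with exactly such shift/eor chains.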
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
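@ Hedged C view of the n>=32 cases in Sigma0 (comment only, not assembled):
@ ROTR64 by 34 is a half-swap followed by a rotate by 2, and by 39 a half-swap
@ followed by a rotate by 7, which is why hi and lo trade places in the last
@ two terms above:
@
@   uint32_t S0_lo = (lo>>28 ^ hi<<4) ^ (hi>>2 ^ lo<<30) ^ (hi>>7 ^ lo<<25);
@   uint32_t S0_hi = (hi>>28 ^ lo<<4) ^ (lo>>2 ^ hi<<30) ^ (lo>>7 ^ hi<<25);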
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
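@ The third term of sigma0 is a plain shift rather than a rotate, so only the
@ low half picks up bits from the high half; hedged C view (comment only, not
@ assembled):
@
@   /* SHR64(x,7):  lo' = lo>>7 ^ hi<<25,  hi' = hi>>7  (nothing shifts in) */
@   uint32_t s0_lo = (lo>>1 ^ hi<<31) ^ (lo>>8 ^ hi<<24) ^ (lo>>7 ^ hi<<25);
@   uint32_t s0_hi = (hi>>1 ^ lo<<31) ^ (hi>>8 ^ lo<<24) ^ (hi>>7);
@
@ sigma1 below follows the same pattern with rotates by 19 and 61 and a plain
@ shift by 6.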
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
eor r10,r10,r11,lsl#13
eor r10,r10,r11,lsr#29
eor r10,r10,r12,lsl#3
eor r10,r10,r12,lsr#6
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
ittt eq @ Thumb2 thing, sanity check in ARM
ldreq r10,[sp,#184+4]
add sp,sp,#8*9 @ destroy frame
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
#if __ARM_MAX_ARCH__>=7
.globl sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add r2,r1,r2,lsl#7 @ len to point at the end of inp
vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
vshr.u64 d24,d20,#14 @ 0
vld1.64 {d0},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 0<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 1
vld1.64 {d1},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 1<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 2
vld1.64 {d2},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 2<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 3
vld1.64 {d3},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 3<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 4
vld1.64 {d4},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 4<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 5
vld1.64 {d5},[r1]! @ handles unaligned
vadd.i64 d19,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 5<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 6
vld1.64 {d6},[r1]! @ handles unaligned
vadd.i64 d18,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 6<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 7
vld1.64 {d7},[r1]! @ handles unaligned
vadd.i64 d17,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 7<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vshr.u64 d24,d20,#14 @ 8
vld1.64 {d8},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 8<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 9
vld1.64 {d9},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 9<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 10
vld1.64 {d10},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 10<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 11
vld1.64 {d11},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 11<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 12
vld1.64 {d12},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 12<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 13
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 13<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 14
vld1.64 {d14},[r1]! @ handles unaligned
vshr.u64 d25,d22,#18
vadd.i64 d18,d30 @ h+=Maj from the past
vshr.u64 d26,d22,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 14<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 15
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 15<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q0,q1,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q4,q5,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 16<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 17
vld1.64 {d1},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 17<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q1,q2,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q5,q6,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 18<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 19
vld1.64 {d3},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 19<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q2,q3,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q6,q7,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 20<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 21
vld1.64 {d5},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 21<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q3,q4,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q7,q0,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 22<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 23
vld1.64 {d7},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 23<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q4,q5,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q0,q1,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 24<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 25
vld1.64 {d9},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 25<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q5,q6,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q1,q2,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 26<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 27
vld1.64 {d11},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 27<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q6,q7,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q2,q3,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 28<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 29
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 29<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q7,q0,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q3,q4,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 30<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 31
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 31<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
sub r3,#640 @ rewind K512
bx lr @ .word 0xe12fff1e
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P