2 @ ====================================================================
3 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4 @ project. The module is, however, dual licensed under OpenSSL and
5 @ CRYPTOGAMS licenses depending on where you obtain it. For further
6 @ details see http://www.openssl.org/~appro/cryptogams/.
8 @ Permission to use under GPL terms is granted.
9 @ ====================================================================
11 @ SHA512 block procedure for ARMv4. September 2007.
13 @ This code is ~4.5 (four and a half) times faster than code generated
14 @ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
15 @ Xscale PXA250 core].
19 @ Rescheduling for dual-issue pipeline resulted in 6% improvement on
20 @ Cortex A8 core and ~40 cycles per processed byte.
24 @ Profiler-assisted and platform-specific optimization resulted in 7%
25 @ improvement on Cortex A8 core and ~38 cycles per byte.
29 @ Add NEON implementation. On Cortex A8 it was measured to process
30 @ one byte in 23.3 cycles or ~60% faster than integer-only code.
34 @ Improve NEON performance by 12% on Snapdragon S4. In absolute
35 @ terms it's 22.6 cycles per byte, which is a disappointing result.
36 @ Technical writers asserted that 3-way S4 pipeline can sustain
37 @ multiple NEON instructions per cycle, but dual NEON issue could
38 @ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
39 @ for further details. On a side note, Cortex-A15 processes one byte in 16 cycles.
42 @ Byte order [in]dependence. =========================================
44 @ Originally the caller was expected to maintain a specific *dword* order in
45 @ h[0-7], namely with the most significant dword at the *lower* address, which
46 @ was reflected in the two parameters below as 0 and 4. Now the caller is
47 @ expected to maintain native byte order for whole 64-bit values.
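@
@ A minimal C-level sketch of the convention above (the prototype shown is
@ an assumption for illustration, not the exact OpenSSL declaration):
@ r0 points at eight native-order 64-bit values, r1 at the input and r2
@ holds the number of 128-byte blocks, which is why the entry code below
@ computes the end pointer as inp + (num << 7).
@
@     #include <stdint.h>
@     #include <stddef.h>
@
@     /* h[0..7] kept in native byte order as whole 64-bit values */
@     extern void sha512_block_data_order(uint64_t h[8],
@                                         const void *inp, size_t num);
@
@     static void hash_blocks(uint64_t h[8], const uint8_t *p, size_t nblocks)
@     {
@         sha512_block_data_order(h, p, nblocks); /* consumes nblocks*128 bytes */
@     }
@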
49 # include "arm_arch.h"
50 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
51 # define VFP_ABI_POP vldmia sp!,{d8-d15}
53 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
54 # define __ARM_MAX_ARCH__ 7
62 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
66 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
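@
@ For reference, a hedged C picture of what WORD64 emits (k0 and k0_value
@ are illustrative names): each 64-bit K[i] is stored as two 32-bit words
@ in whichever order lets the loads below reassemble a native 64-bit value.
@
@     #include <stdint.h>
@
@     /* how the first constant, 0x428a2f98d728ae22, lands in memory */
@     #if defined(__ARMEL__)                        /* little-endian */
@     static const uint32_t k0[2] = { 0xd728ae22, 0x428a2f98 }; /* lo, hi */
@     #else                                         /* big-endian    */
@     static const uint32_t k0[2] = { 0x428a2f98, 0xd728ae22 }; /* hi, lo */
@     #endif
@     static const uint64_t k0_value = 0x428a2f98d728ae22ULL;   /* same bits */
@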
85 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
86 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
87 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
88 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
89 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
90 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
91 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
92 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
93 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
94 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
95 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
96 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
97 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
98 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
99 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
100 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
101 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
102 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
103 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
104 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
105 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
106 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
107 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
108 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
109 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
110 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
111 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
112 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
113 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
114 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
115 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
116 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
117 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
118 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
119 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
120 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
121 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
122 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
123 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
124 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
126 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
128 .word OPENSSL_armcap_P-sha512_block_data_order
134 .global sha512_block_data_order
135 .type sha512_block_data_order,%function
136 sha512_block_data_order:
138 sub r3,pc,#8 @ sha512_block_data_order
140 adr r3,sha512_block_data_order
142 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
143 ldr r12,.LOPENSSL_armcap
144 ldr r12,[r3,r12] @ OPENSSL_armcap_P
148 add r2,r1,r2,lsl#7 @ len to point at the end of inp
149 stmdb sp!,{r4-r12,lr}
150 sub r14,r3,#672 @ K512
207 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
208 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
209 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
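@
@ A hedged C sketch of the 32-bit decomposition used throughout this code:
@ ARMv4 has no 64-bit rotate, so each ROTR of hi:lo is built from shifts of
@ the two halves (helper names are illustrative only).
@
@     #include <stdint.h>
@
@     /* ROTR64(hi:lo, n) for 0 < n < 32, e.g. the ROTR 14 and ROTR 18 terms */
@     static void rotr_lt32(uint32_t hi, uint32_t lo, unsigned n,
@                           uint32_t *rhi, uint32_t *rlo)
@     {
@         *rlo = (lo >> n) | (hi << (32 - n));
@         *rhi = (hi >> n) | (lo << (32 - n));
@     }
@
@     /* ROTR64(hi:lo, 32+m) for 0 < m < 32, e.g. ROTR 41 with m = 9 */
@     static void rotr_ge32(uint32_t hi, uint32_t lo, unsigned m,
@                           uint32_t *rhi, uint32_t *rlo)
@     {
@         *rlo = (hi >> m) | (lo << (32 - m));
@         *rhi = (lo >> m) | (hi << (32 - m));
@     }
@
@ XORing the three rotates computed this way gives exactly the LO/HI
@ expansion of Sigma1 shown above.
@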
215 ldr r11,[sp,#56+0] @ h.lo
216 eor r10,r10,r7,lsl#18
217 ldr r12,[sp,#56+4] @ h.hi
219 eor r10,r10,r8,lsr#18
221 eor r10,r10,r7,lsl#14
225 eor r10,r10,r8,lsl#23 @ Sigma1(e)
227 ldr r9,[sp,#40+0] @ f.lo
228 adc r4,r4,r10 @ T += Sigma1(e)
229 ldr r10,[sp,#40+4] @ f.hi
231 ldr r11,[sp,#48+0] @ g.lo
232 adc r4,r4,r12 @ T += h
233 ldr r12,[sp,#48+4] @ g.hi
244 ldr r11,[r14,#LO] @ K[i].lo
245 eor r10,r10,r12 @ Ch(e,f,g)
246 ldr r12,[r14,#HI] @ K[i].hi
249 ldr r7,[sp,#24+0] @ d.lo
250 adc r4,r4,r10 @ T += Ch(e,f,g)
251 ldr r8,[sp,#24+4] @ d.hi
254 adc r4,r4,r12 @ T += K[i]
256 ldr r11,[sp,#8+0] @ b.lo
257 adc r8,r8,r4 @ d += T
260 ldr r12,[sp,#16+0] @ c.lo
262 it eq @ Thumb2 thing, sanity check in ARM
265 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
266 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
267 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
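@
@ For cross-checking, a hedged 64-bit reference of Sigma0; the LO/HI lines
@ above are its 32-bit expansion via the rotate helpers sketched earlier
@ (ROTR 34 and ROTR 39 fall in the 32+m case, with m = 2 and m = 7).
@
@     #include <stdint.h>
@
@     static uint64_t rotr64(uint64_t x, unsigned n)
@     {
@         return (x >> n) | (x << (64 - n));
@     }
@
@     static uint64_t Sigma0(uint64_t x)
@     {
@         return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39);
@     }
@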
275 eor r10,r10,r6,lsl#30
279 eor r10,r10,r6,lsl#25 @ Sigma0(a)
282 adc r4,r4,r10 @ T += Sigma0(a)
284 ldr r10,[sp,#8+4] @ b.hi
286 ldr r11,[sp,#16+4] @ c.hi
290 orr r5,r5,r9 @ Maj(a,b,c).lo
293 orr r6,r6,r12 @ Maj(a,b,c).hi
295 adc r6,r6,r4 @ h += T
304 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
305 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
306 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
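@
@ Hedged 64-bit reference of sigma0: the last term is a plain shift rather
@ than a rotate, which is why the HI line above ends with a bare hi>>7.
@
@     static uint64_t sigma0(uint64_t x)
@     {
@         return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); /* rotr64 as above */
@     }
@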
321 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
322 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
323 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
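@
@ And the matching hedged 64-bit reference of sigma1 (ROTR 61 is the 32+m
@ case with m = 29, hence the >>29 and <<3 terms above):
@
@     static uint64_t sigma1(uint64_t x)
@     {
@         return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); /* rotr64 as above */
@     }
@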
327 eor r10,r10,r11,lsl#13
329 eor r10,r10,r11,lsr#29
331 eor r10,r10,r12,lsl#3
333 eor r10,r10,r12,lsr#6
347 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
348 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
349 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
355 ldr r11,[sp,#56+0] @ h.lo
356 eor r10,r10,r7,lsl#18
357 ldr r12,[sp,#56+4] @ h.hi
359 eor r10,r10,r8,lsr#18
361 eor r10,r10,r7,lsl#14
365 eor r10,r10,r8,lsl#23 @ Sigma1(e)
367 ldr r9,[sp,#40+0] @ f.lo
368 adc r4,r4,r10 @ T += Sigma1(e)
369 ldr r10,[sp,#40+4] @ f.hi
371 ldr r11,[sp,#48+0] @ g.lo
372 adc r4,r4,r12 @ T += h
373 ldr r12,[sp,#48+4] @ g.hi
384 ldr r11,[r14,#LO] @ K[i].lo
385 eor r10,r10,r12 @ Ch(e,f,g)
386 ldr r12,[r14,#HI] @ K[i].hi
389 ldr r7,[sp,#24+0] @ d.lo
390 adc r4,r4,r10 @ T += Ch(e,f,g)
391 ldr r8,[sp,#24+4] @ d.hi
394 adc r4,r4,r12 @ T += K[i]
396 ldr r11,[sp,#8+0] @ b.lo
397 adc r8,r8,r4 @ d += T
400 ldr r12,[sp,#16+0] @ c.lo
402 it eq @ Thumb2 thing, sanity check in ARM
405 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
406 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
407 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
415 eor r10,r10,r6,lsl#30
419 eor r10,r10,r6,lsl#25 @ Sigma0(a)
422 adc r4,r4,r10 @ T += Sigma0(a)
424 ldr r10,[sp,#8+4] @ b.hi
426 ldr r11,[sp,#16+4] @ c.hi
430 orr r5,r5,r9 @ Maj(a,b,c).lo
433 orr r6,r6,r12 @ Maj(a,b,c).hi
435 adc r6,r6,r4 @ h += T
439 ittt eq @ Thumb2 thing, sanity check in ARM
442 ldreq r10,[sp,#184+4]
516 add sp,sp,#8*9 @ destroy frame
518 ldmia sp!,{r4-r12,pc}
520 ldmia sp!,{r4-r12,lr}
522 moveq pc,lr @ be binary compatible with V4, yet
523 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
525 .size sha512_block_data_order,.-sha512_block_data_order
526 #if __ARM_MAX_ARCH__>=7
530 .global sha512_block_data_order_neon
531 .type sha512_block_data_order_neon,%function
533 sha512_block_data_order_neon:
535 dmb @ errata #451034 on early Cortex A8
536 add r2,r1,r2,lsl#7 @ len to point at the end of inp
539 vldmia r0,{d16-d23} @ load context
541 vshr.u64 d24,d20,#14 @ 0
543 vld1.64 {d0},[r1]! @ handles unaligned
547 vadd.i64 d16,d30 @ h+=Maj from the past
550 vld1.64 {d28},[r3,:64]! @ K[i++]
555 #if 0<16 && defined(__ARMEL__)
559 vbsl d29,d21,d22 @ Ch(e,f,g)
561 veor d26,d25 @ Sigma1(e)
573 vbsl d30,d18,d17 @ Maj(a,b,c)
574 veor d23,d26 @ Sigma0(a)
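@
@ A hedged note on the NEON round above: vbsl (bitwise select) computes Ch
@ and Maj in one instruction each, assuming the destination register already
@ holds e (for Ch) or a^b (for Maj), while Sigma1/Sigma0 are assembled from
@ paired right/left shifts of e and a XORed together. In C terms:
@
@     #include <stdint.h>
@
@     /* vbsl d,f,g with d = e selects f where e has 1-bits, g elsewhere */
@     static uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
@     {
@         return (e & f) ^ (~e & g);
@     }
@
@     /* same select reused: with d = a^b, vbsl d,c,b yields Maj(a,b,c) */
@     static uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
@     {
@         return (a & b) ^ (a & c) ^ (b & c);
@     }
@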
578 vshr.u64 d24,d19,#14 @ 1
580 vld1.64 {d1},[r1]! @ handles unaligned
584 vadd.i64 d23,d30 @ h+=Maj from the past
587 vld1.64 {d28},[r3,:64]! @ K[i++]
592 #if 1<16 && defined(__ARMEL__)
596 vbsl d29,d20,d21 @ Ch(e,f,g)
598 veor d26,d25 @ Sigma1(e)
610 vbsl d30,d17,d16 @ Maj(a,b,c)
611 veor d22,d26 @ Sigma0(a)
615 vshr.u64 d24,d18,#14 @ 2
617 vld1.64 {d2},[r1]! @ handles unaligned
621 vadd.i64 d22,d30 @ h+=Maj from the past
624 vld1.64 {d28},[r3,:64]! @ K[i++]
629 #if 2<16 && defined(__ARMEL__)
633 vbsl d29,d19,d20 @ Ch(e,f,g)
635 veor d26,d25 @ Sigma1(e)
647 vbsl d30,d16,d23 @ Maj(a,b,c)
648 veor d21,d26 @ Sigma0(a)
652 vshr.u64 d24,d17,#14 @ 3
654 vld1.64 {d3},[r1]! @ handles unaligned
658 vadd.i64 d21,d30 @ h+=Maj from the past
661 vld1.64 {d28},[r3,:64]! @ K[i++]
666 #if 3<16 && defined(__ARMEL__)
670 vbsl d29,d18,d19 @ Ch(e,f,g)
672 veor d26,d25 @ Sigma1(e)
684 vbsl d30,d23,d22 @ Maj(a,b,c)
685 veor d20,d26 @ Sigma0(a)
689 vshr.u64 d24,d16,#14 @ 4
691 vld1.64 {d4},[r1]! @ handles unaligned
695 vadd.i64 d20,d30 @ h+=Maj from the past
698 vld1.64 {d28},[r3,:64]! @ K[i++]
703 #if 4<16 && defined(__ARMEL__)
707 vbsl d29,d17,d18 @ Ch(e,f,g)
709 veor d26,d25 @ Sigma1(e)
721 vbsl d30,d22,d21 @ Maj(a,b,c)
722 veor d19,d26 @ Sigma0(a)
726 vshr.u64 d24,d23,#14 @ 5
728 vld1.64 {d5},[r1]! @ handles unaligned
732 vadd.i64 d19,d30 @ h+=Maj from the past
735 vld1.64 {d28},[r3,:64]! @ K[i++]
740 #if 5<16 && defined(__ARMEL__)
744 vbsl d29,d16,d17 @ Ch(e,f,g)
746 veor d26,d25 @ Sigma1(e)
758 vbsl d30,d21,d20 @ Maj(a,b,c)
759 veor d18,d26 @ Sigma0(a)
763 vshr.u64 d24,d22,#14 @ 6
765 vld1.64 {d6},[r1]! @ handles unaligned
769 vadd.i64 d18,d30 @ h+=Maj from the past
772 vld1.64 {d28},[r3,:64]! @ K[i++]
777 #if 6<16 && defined(__ARMEL__)
781 vbsl d29,d23,d16 @ Ch(e,f,g)
783 veor d26,d25 @ Sigma1(e)
795 vbsl d30,d20,d19 @ Maj(a,b,c)
796 veor d17,d26 @ Sigma0(a)
800 vshr.u64 d24,d21,#14 @ 7
802 vld1.64 {d7},[r1]! @ handles unaligned
806 vadd.i64 d17,d30 @ h+=Maj from the past
809 vld1.64 {d28},[r3,:64]! @ K[i++]
814 #if 7<16 && defined(__ARMEL__)
818 vbsl d29,d22,d23 @ Ch(e,f,g)
820 veor d26,d25 @ Sigma1(e)
832 vbsl d30,d19,d18 @ Maj(a,b,c)
833 veor d16,d26 @ Sigma0(a)
837 vshr.u64 d24,d20,#14 @ 8
839 vld1.64 {d8},[r1]! @ handles unaligned
843 vadd.i64 d16,d30 @ h+=Maj from the past
846 vld1.64 {d28},[r3,:64]! @ K[i++]
851 #if 8<16 && defined(__ARMEL__)
855 vbsl d29,d21,d22 @ Ch(e,f,g)
857 veor d26,d25 @ Sigma1(e)
869 vbsl d30,d18,d17 @ Maj(a,b,c)
870 veor d23,d26 @ Sigma0(a)
874 vshr.u64 d24,d19,#14 @ 9
876 vld1.64 {d9},[r1]! @ handles unaligned
880 vadd.i64 d23,d30 @ h+=Maj from the past
883 vld1.64 {d28},[r3,:64]! @ K[i++]
888 #if 9<16 && defined(__ARMEL__)
892 vbsl d29,d20,d21 @ Ch(e,f,g)
894 veor d26,d25 @ Sigma1(e)
906 vbsl d30,d17,d16 @ Maj(a,b,c)
907 veor d22,d26 @ Sigma0(a)
911 vshr.u64 d24,d18,#14 @ 10
913 vld1.64 {d10},[r1]! @ handles unaligned
917 vadd.i64 d22,d30 @ h+=Maj from the past
920 vld1.64 {d28},[r3,:64]! @ K[i++]
925 #if 10<16 && defined(__ARMEL__)
929 vbsl d29,d19,d20 @ Ch(e,f,g)
931 veor d26,d25 @ Sigma1(e)
943 vbsl d30,d16,d23 @ Maj(a,b,c)
944 veor d21,d26 @ Sigma0(a)
948 vshr.u64 d24,d17,#14 @ 11
950 vld1.64 {d11},[r1]! @ handles unaligned
954 vadd.i64 d21,d30 @ h+=Maj from the past
957 vld1.64 {d28},[r3,:64]! @ K[i++]
962 #if 11<16 && defined(__ARMEL__)
966 vbsl d29,d18,d19 @ Ch(e,f,g)
968 veor d26,d25 @ Sigma1(e)
980 vbsl d30,d23,d22 @ Maj(a,b,c)
981 veor d20,d26 @ Sigma0(a)
985 vshr.u64 d24,d16,#14 @ 12
987 vld1.64 {d12},[r1]! @ handles unaligned
991 vadd.i64 d20,d30 @ h+=Maj from the past
994 vld1.64 {d28},[r3,:64]! @ K[i++]
999 #if 12<16 && defined(__ARMEL__)
1003 vbsl d29,d17,d18 @ Ch(e,f,g)
1004 vshr.u64 d24,d20,#28
1005 veor d26,d25 @ Sigma1(e)
1006 vadd.i64 d27,d29,d19
1007 vshr.u64 d25,d20,#34
1010 vshr.u64 d26,d20,#39
1017 vbsl d30,d22,d21 @ Maj(a,b,c)
1018 veor d19,d26 @ Sigma0(a)
1022 vshr.u64 d24,d23,#14 @ 13
1024 vld1.64 {d13},[r1]! @ handles unaligned
1026 vshr.u64 d25,d23,#18
1028 vadd.i64 d19,d30 @ h+=Maj from the past
1030 vshr.u64 d26,d23,#41
1031 vld1.64 {d28},[r3,:64]! @ K[i++]
1036 #if 13<16 && defined(__ARMEL__)
1040 vbsl d29,d16,d17 @ Ch(e,f,g)
1041 vshr.u64 d24,d19,#28
1042 veor d26,d25 @ Sigma1(e)
1043 vadd.i64 d27,d29,d18
1044 vshr.u64 d25,d19,#34
1047 vshr.u64 d26,d19,#39
1054 vbsl d30,d21,d20 @ Maj(a,b,c)
1055 veor d18,d26 @ Sigma0(a)
1059 vshr.u64 d24,d22,#14 @ 14
1061 vld1.64 {d14},[r1]! @ handles unaligned
1063 vshr.u64 d25,d22,#18
1065 vadd.i64 d18,d30 @ h+=Maj from the past
1067 vshr.u64 d26,d22,#41
1068 vld1.64 {d28},[r3,:64]! @ K[i++]
1073 #if 14<16 && defined(__ARMEL__)
1077 vbsl d29,d23,d16 @ Ch(e,f,g)
1078 vshr.u64 d24,d18,#28
1079 veor d26,d25 @ Sigma1(e)
1080 vadd.i64 d27,d29,d17
1081 vshr.u64 d25,d18,#34
1084 vshr.u64 d26,d18,#39
1091 vbsl d30,d20,d19 @ Maj(a,b,c)
1092 veor d17,d26 @ Sigma0(a)
1096 vshr.u64 d24,d21,#14 @ 15
1098 vld1.64 {d15},[r1]! @ handles unaligned
1100 vshr.u64 d25,d21,#18
1102 vadd.i64 d17,d30 @ h+=Maj from the past
1104 vshr.u64 d26,d21,#41
1105 vld1.64 {d28},[r3,:64]! @ K[i++]
1110 #if 15<16 && defined(__ARMEL__)
1114 vbsl d29,d22,d23 @ Ch(e,f,g)
1115 vshr.u64 d24,d17,#28
1116 veor d26,d25 @ Sigma1(e)
1117 vadd.i64 d27,d29,d16
1118 vshr.u64 d25,d17,#34
1121 vshr.u64 d26,d17,#39
1128 vbsl d30,d19,d18 @ Maj(a,b,c)
1129 veor d16,d26 @ Sigma0(a)
1138 vadd.i64 d16,d30 @ h+=Maj from the past
1141 vext.8 q14,q0,q1,#8 @ X[i+1]
1145 veor q15,q13 @ sigma1(X[i+14])
1151 vext.8 q14,q4,q5,#8 @ X[i+9]
1153 vshr.u64 d24,d20,#14 @ from NEON_00_15
1155 vshr.u64 d25,d20,#18 @ from NEON_00_15
1156 veor q15,q13 @ sigma0(X[i+1])
1157 vshr.u64 d26,d20,#41 @ from NEON_00_15
1159 vld1.64 {d28},[r3,:64]! @ K[i++]
1164 #if 16<16 && defined(__ARMEL__)
1168 vbsl d29,d21,d22 @ Ch(e,f,g)
1169 vshr.u64 d24,d16,#28
1170 veor d26,d25 @ Sigma1(e)
1171 vadd.i64 d27,d29,d23
1172 vshr.u64 d25,d16,#34
1175 vshr.u64 d26,d16,#39
1182 vbsl d30,d18,d17 @ Maj(a,b,c)
1183 veor d23,d26 @ Sigma0(a)
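@
@ A hedged C sketch of the message-schedule step interleaved above: the
@ relative X[i+1], X[i+9] and X[i+14] in the comments are W[i-15], W[i-7]
@ and W[i-2] of the usual recurrence, taken modulo the 16-entry window.
@
@     #include <stdint.h>
@
@     /* rotr64, sigma0 and sigma1 as sketched earlier; W is a 16-entry
@      * circular buffer and i is the round index, i >= 16 */
@     static uint64_t next_W(const uint64_t W[16], unsigned i)
@     {
@         return sigma1(W[(i - 2) & 15]) + W[(i - 7) & 15]
@              + sigma0(W[(i - 15) & 15]) + W[(i - 16) & 15];
@     }
@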
1187 vshr.u64 d24,d19,#14 @ 17
1189 vld1.64 {d1},[r1]! @ handles unaligned
1191 vshr.u64 d25,d19,#18
1193 vadd.i64 d23,d30 @ h+=Maj from the past
1195 vshr.u64 d26,d19,#41
1196 vld1.64 {d28},[r3,:64]! @ K[i++]
1201 #if 17<16 && defined(__ARMEL__)
1205 vbsl d29,d20,d21 @ Ch(e,f,g)
1206 vshr.u64 d24,d23,#28
1207 veor d26,d25 @ Sigma1(e)
1208 vadd.i64 d27,d29,d22
1209 vshr.u64 d25,d23,#34
1212 vshr.u64 d26,d23,#39
1219 vbsl d30,d17,d16 @ Maj(a,b,c)
1220 veor d22,d26 @ Sigma0(a)
1226 vadd.i64 d22,d30 @ h+=Maj from the past
1229 vext.8 q14,q1,q2,#8 @ X[i+1]
1233 veor q15,q13 @ sigma1(X[i+14])
1239 vext.8 q14,q5,q6,#8 @ X[i+9]
1241 vshr.u64 d24,d18,#14 @ from NEON_00_15
1243 vshr.u64 d25,d18,#18 @ from NEON_00_15
1244 veor q15,q13 @ sigma0(X[i+1])
1245 vshr.u64 d26,d18,#41 @ from NEON_00_15
1247 vld1.64 {d28},[r3,:64]! @ K[i++]
1252 #if 18<16 && defined(__ARMEL__)
1256 vbsl d29,d19,d20 @ Ch(e,f,g)
1257 vshr.u64 d24,d22,#28
1258 veor d26,d25 @ Sigma1(e)
1259 vadd.i64 d27,d29,d21
1260 vshr.u64 d25,d22,#34
1263 vshr.u64 d26,d22,#39
1270 vbsl d30,d16,d23 @ Maj(a,b,c)
1271 veor d21,d26 @ Sigma0(a)
1275 vshr.u64 d24,d17,#14 @ 19
1277 vld1.64 {d3},[r1]! @ handles unaligned
1279 vshr.u64 d25,d17,#18
1281 vadd.i64 d21,d30 @ h+=Maj from the past
1283 vshr.u64 d26,d17,#41
1284 vld1.64 {d28},[r3,:64]! @ K[i++]
1289 #if 19<16 && defined(__ARMEL__)
1293 vbsl d29,d18,d19 @ Ch(e,f,g)
1294 vshr.u64 d24,d21,#28
1295 veor d26,d25 @ Sigma1(e)
1296 vadd.i64 d27,d29,d20
1297 vshr.u64 d25,d21,#34
1300 vshr.u64 d26,d21,#39
1307 vbsl d30,d23,d22 @ Maj(a,b,c)
1308 veor d20,d26 @ Sigma0(a)
1314 vadd.i64 d20,d30 @ h+=Maj from the past
1317 vext.8 q14,q2,q3,#8 @ X[i+1]
1321 veor q15,q13 @ sigma1(X[i+14])
1327 vext.8 q14,q6,q7,#8 @ X[i+9]
1329 vshr.u64 d24,d16,#14 @ from NEON_00_15
1331 vshr.u64 d25,d16,#18 @ from NEON_00_15
1332 veor q15,q13 @ sigma0(X[i+1])
1333 vshr.u64 d26,d16,#41 @ from NEON_00_15
1335 vld1.64 {d28},[r3,:64]! @ K[i++]
1340 #if 20<16 && defined(__ARMEL__)
1344 vbsl d29,d17,d18 @ Ch(e,f,g)
1345 vshr.u64 d24,d20,#28
1346 veor d26,d25 @ Sigma1(e)
1347 vadd.i64 d27,d29,d19
1348 vshr.u64 d25,d20,#34
1351 vshr.u64 d26,d20,#39
1358 vbsl d30,d22,d21 @ Maj(a,b,c)
1359 veor d19,d26 @ Sigma0(a)
1363 vshr.u64 d24,d23,#14 @ 21
1365 vld1.64 {d5},[r1]! @ handles unaligned
1367 vshr.u64 d25,d23,#18
1369 vadd.i64 d19,d30 @ h+=Maj from the past
1371 vshr.u64 d26,d23,#41
1372 vld1.64 {d28},[r3,:64]! @ K[i++]
1377 #if 21<16 && defined(__ARMEL__)
1381 vbsl d29,d16,d17 @ Ch(e,f,g)
1382 vshr.u64 d24,d19,#28
1383 veor d26,d25 @ Sigma1(e)
1384 vadd.i64 d27,d29,d18
1385 vshr.u64 d25,d19,#34
1388 vshr.u64 d26,d19,#39
1395 vbsl d30,d21,d20 @ Maj(a,b,c)
1396 veor d18,d26 @ Sigma0(a)
1402 vadd.i64 d18,d30 @ h+=Maj from the past
1405 vext.8 q14,q3,q4,#8 @ X[i+1]
1409 veor q15,q13 @ sigma1(X[i+14])
1415 vext.8 q14,q7,q0,#8 @ X[i+9]
1417 vshr.u64 d24,d22,#14 @ from NEON_00_15
1419 vshr.u64 d25,d22,#18 @ from NEON_00_15
1420 veor q15,q13 @ sigma0(X[i+1])
1421 vshr.u64 d26,d22,#41 @ from NEON_00_15
1423 vld1.64 {d28},[r3,:64]! @ K[i++]
1428 #if 22<16 && defined(__ARMEL__)
1432 vbsl d29,d23,d16 @ Ch(e,f,g)
1433 vshr.u64 d24,d18,#28
1434 veor d26,d25 @ Sigma1(e)
1435 vadd.i64 d27,d29,d17
1436 vshr.u64 d25,d18,#34
1439 vshr.u64 d26,d18,#39
1446 vbsl d30,d20,d19 @ Maj(a,b,c)
1447 veor d17,d26 @ Sigma0(a)
1451 vshr.u64 d24,d21,#14 @ 23
1453 vld1.64 {d7},[r1]! @ handles unaligned
1455 vshr.u64 d25,d21,#18
1457 vadd.i64 d17,d30 @ h+=Maj from the past
1459 vshr.u64 d26,d21,#41
1460 vld1.64 {d28},[r3,:64]! @ K[i++]
1465 #if 23<16 && defined(__ARMEL__)
1469 vbsl d29,d22,d23 @ Ch(e,f,g)
1470 vshr.u64 d24,d17,#28
1471 veor d26,d25 @ Sigma1(e)
1472 vadd.i64 d27,d29,d16
1473 vshr.u64 d25,d17,#34
1476 vshr.u64 d26,d17,#39
1483 vbsl d30,d19,d18 @ Maj(a,b,c)
1484 veor d16,d26 @ Sigma0(a)
1490 vadd.i64 d16,d30 @ h+=Maj from the past
1493 vext.8 q14,q4,q5,#8 @ X[i+1]
1497 veor q15,q13 @ sigma1(X[i+14])
1503 vext.8 q14,q0,q1,#8 @ X[i+9]
1505 vshr.u64 d24,d20,#14 @ from NEON_00_15
1507 vshr.u64 d25,d20,#18 @ from NEON_00_15
1508 veor q15,q13 @ sigma0(X[i+1])
1509 vshr.u64 d26,d20,#41 @ from NEON_00_15
1511 vld1.64 {d28},[r3,:64]! @ K[i++]
1516 #if 24<16 && defined(__ARMEL__)
1520 vbsl d29,d21,d22 @ Ch(e,f,g)
1521 vshr.u64 d24,d16,#28
1522 veor d26,d25 @ Sigma1(e)
1523 vadd.i64 d27,d29,d23
1524 vshr.u64 d25,d16,#34
1527 vshr.u64 d26,d16,#39
1534 vbsl d30,d18,d17 @ Maj(a,b,c)
1535 veor d23,d26 @ Sigma0(a)
1539 vshr.u64 d24,d19,#14 @ 25
1541 vld1.64 {d9},[r1]! @ handles unaligned
1543 vshr.u64 d25,d19,#18
1545 vadd.i64 d23,d30 @ h+=Maj from the past
1547 vshr.u64 d26,d19,#41
1548 vld1.64 {d28},[r3,:64]! @ K[i++]
1553 #if 25<16 && defined(__ARMEL__)
1557 vbsl d29,d20,d21 @ Ch(e,f,g)
1558 vshr.u64 d24,d23,#28
1559 veor d26,d25 @ Sigma1(e)
1560 vadd.i64 d27,d29,d22
1561 vshr.u64 d25,d23,#34
1564 vshr.u64 d26,d23,#39
1571 vbsl d30,d17,d16 @ Maj(a,b,c)
1572 veor d22,d26 @ Sigma0(a)
1578 vadd.i64 d22,d30 @ h+=Maj from the past
1581 vext.8 q14,q5,q6,#8 @ X[i+1]
1585 veor q15,q13 @ sigma1(X[i+14])
1591 vext.8 q14,q1,q2,#8 @ X[i+9]
1593 vshr.u64 d24,d18,#14 @ from NEON_00_15
1595 vshr.u64 d25,d18,#18 @ from NEON_00_15
1596 veor q15,q13 @ sigma0(X[i+1])
1597 vshr.u64 d26,d18,#41 @ from NEON_00_15
1599 vld1.64 {d28},[r3,:64]! @ K[i++]
1604 #if 26<16 && defined(__ARMEL__)
1608 vbsl d29,d19,d20 @ Ch(e,f,g)
1609 vshr.u64 d24,d22,#28
1610 veor d26,d25 @ Sigma1(e)
1611 vadd.i64 d27,d29,d21
1612 vshr.u64 d25,d22,#34
1615 vshr.u64 d26,d22,#39
1622 vbsl d30,d16,d23 @ Maj(a,b,c)
1623 veor d21,d26 @ Sigma0(a)
1627 vshr.u64 d24,d17,#14 @ 27
1629 vld1.64 {d11},[r1]! @ handles unaligned
1631 vshr.u64 d25,d17,#18
1633 vadd.i64 d21,d30 @ h+=Maj from the past
1635 vshr.u64 d26,d17,#41
1636 vld1.64 {d28},[r3,:64]! @ K[i++]
1641 #if 27<16 && defined(__ARMEL__)
1645 vbsl d29,d18,d19 @ Ch(e,f,g)
1646 vshr.u64 d24,d21,#28
1647 veor d26,d25 @ Sigma1(e)
1648 vadd.i64 d27,d29,d20
1649 vshr.u64 d25,d21,#34
1652 vshr.u64 d26,d21,#39
1659 vbsl d30,d23,d22 @ Maj(a,b,c)
1660 veor d20,d26 @ Sigma0(a)
1666 vadd.i64 d20,d30 @ h+=Maj from the past
1669 vext.8 q14,q6,q7,#8 @ X[i+1]
1673 veor q15,q13 @ sigma1(X[i+14])
1679 vext.8 q14,q2,q3,#8 @ X[i+9]
1681 vshr.u64 d24,d16,#14 @ from NEON_00_15
1683 vshr.u64 d25,d16,#18 @ from NEON_00_15
1684 veor q15,q13 @ sigma0(X[i+1])
1685 vshr.u64 d26,d16,#41 @ from NEON_00_15
1687 vld1.64 {d28},[r3,:64]! @ K[i++]
1692 #if 28<16 && defined(__ARMEL__)
1696 vbsl d29,d17,d18 @ Ch(e,f,g)
1697 vshr.u64 d24,d20,#28
1698 veor d26,d25 @ Sigma1(e)
1699 vadd.i64 d27,d29,d19
1700 vshr.u64 d25,d20,#34
1703 vshr.u64 d26,d20,#39
1710 vbsl d30,d22,d21 @ Maj(a,b,c)
1711 veor d19,d26 @ Sigma0(a)
1715 vshr.u64 d24,d23,#14 @ 29
1717 vld1.64 {d13},[r1]! @ handles unaligned
1719 vshr.u64 d25,d23,#18
1721 vadd.i64 d19,d30 @ h+=Maj from the past
1723 vshr.u64 d26,d23,#41
1724 vld1.64 {d28},[r3,:64]! @ K[i++]
1729 #if 29<16 && defined(__ARMEL__)
1733 vbsl d29,d16,d17 @ Ch(e,f,g)
1734 vshr.u64 d24,d19,#28
1735 veor d26,d25 @ Sigma1(e)
1736 vadd.i64 d27,d29,d18
1737 vshr.u64 d25,d19,#34
1740 vshr.u64 d26,d19,#39
1747 vbsl d30,d21,d20 @ Maj(a,b,c)
1748 veor d18,d26 @ Sigma0(a)
1754 vadd.i64 d18,d30 @ h+=Maj from the past
1757 vext.8 q14,q7,q0,#8 @ X[i+1]
1761 veor q15,q13 @ sigma1(X[i+14])
1767 vext.8 q14,q3,q4,#8 @ X[i+9]
1769 vshr.u64 d24,d22,#14 @ from NEON_00_15
1771 vshr.u64 d25,d22,#18 @ from NEON_00_15
1772 veor q15,q13 @ sigma0(X[i+1])
1773 vshr.u64 d26,d22,#41 @ from NEON_00_15
1775 vld1.64 {d28},[r3,:64]! @ K[i++]
1780 #if 30<16 && defined(__ARMEL__)
1784 vbsl d29,d23,d16 @ Ch(e,f,g)
1785 vshr.u64 d24,d18,#28
1786 veor d26,d25 @ Sigma1(e)
1787 vadd.i64 d27,d29,d17
1788 vshr.u64 d25,d18,#34
1791 vshr.u64 d26,d18,#39
1798 vbsl d30,d20,d19 @ Maj(a,b,c)
1799 veor d17,d26 @ Sigma0(a)
1803 vshr.u64 d24,d21,#14 @ 31
1805 vld1.64 {d15},[r1]! @ handles unaligned
1807 vshr.u64 d25,d21,#18
1809 vadd.i64 d17,d30 @ h+=Maj from the past
1811 vshr.u64 d26,d21,#41
1812 vld1.64 {d28},[r3,:64]! @ K[i++]
1817 #if 31<16 && defined(__ARMEL__)
1821 vbsl d29,d22,d23 @ Ch(e,f,g)
1822 vshr.u64 d24,d17,#28
1823 veor d26,d25 @ Sigma1(e)
1824 vadd.i64 d27,d29,d16
1825 vshr.u64 d25,d17,#34
1828 vshr.u64 d26,d17,#39
1835 vbsl d30,d19,d18 @ Maj(a,b,c)
1836 veor d16,d26 @ Sigma0(a)
1842 vadd.i64 d16,d30 @ h+=Maj from the past
1843 vldmia r0,{d24-d31} @ load context to temp
1844 vadd.i64 q8,q12 @ vectorized accumulate
1848 vstmia r0,{d16-d23} @ save context
1850 sub r3,#640 @ rewind K512
1854 bx lr @ .word 0xe12fff1e
1855 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
1857 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
1859 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1860 .comm OPENSSL_armcap_P,4,4