3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
12 # SHA512 block procedure for ARMv4. September 2007.
14 # This code is ~4.5 (four and a half) times faster than code generated
15 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
16 # Xscale PXA250 core].
20 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
21 # Cortex A8 core and ~40 cycles per processed byte.
25 # Profiler-assisted and platform-specific optimization resulted in 7%
26 # improvement on Cortex A8 core and ~38 cycles per byte.
30 # Add NEON implementation. On Cortex A8 it was measured to process
31 # one byte in 23.3 cycles or ~60% faster than integer-only code.
35 # Improve NEON performance by 12% on Snapdragon S4. In absolute
36 # terms it's 22.6 cycles per byte, which is a disappointing result.
37 # Technical writers asserted that 3-way S4 pipeline can sustain
38 # multiple NEON instructions per cycle, but dual NEON issue could
39 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
40 # for further details. On side note Cortex-A15 processes one byte in
43 # Byte order [in]dependence. =========================================
45 # Originally caller was expected to maintain specific *dword* order in
46 # h[0-7], namely with most significant dword at *lower* address, which
47 # was reflected in below two parameters as 0 and 4. Now caller is
48 # expected to maintain native byte order for whole 64-bit values.
51 # ====================================================================
# Scan the command line for the first argument that looks like a file name
# ("name.ext"); anything in front of it is consumed and discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file. When no file name was supplied the
# generated code keeps going to the inherited STDOUT, so only attempt (and
# check) the open when there is something to open. The three-argument form
# keeps characters in the file name from being interpreted as an open mode.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
56 $ctx="r0"; # parameter block
70 ############ r13 is stack pointer
72 ############ r15 is program counter
87 @ Sigma1
(x
) (ROTR
((x
),14) ^ ROTR
((x
),18) ^ ROTR
((x
),41))
88 @ LO lo
>>14^hi
<<18 ^ lo
>>18^hi
<<14 ^ hi
>>9^lo
<<23
89 @ HI hi
>>14^lo
<<18 ^ hi
>>18^lo
<<14 ^ lo
>>9^hi
<<23
91 str
$Tlo,[sp
,#$Xoff+0]
93 str
$Thi,[sp
,#$Xoff+4]
94 eor
$t0,$t0,$Ehi,lsl
#18
95 ldr
$t2,[sp
,#$Hoff+0] @ h.lo
96 eor
$t1,$t1,$Elo,lsl
#18
97 ldr
$t3,[sp
,#$Hoff+4] @ h.hi
98 eor
$t0,$t0,$Elo,lsr
#18
99 eor
$t1,$t1,$Ehi,lsr
#18
100 eor
$t0,$t0,$Ehi,lsl
#14
101 eor
$t1,$t1,$Elo,lsl
#14
102 eor
$t0,$t0,$Ehi,lsr
#9
103 eor
$t1,$t1,$Elo,lsr
#9
104 eor
$t0,$t0,$Elo,lsl
#23
105 eor
$t1,$t1,$Ehi,lsl
#23 @ Sigma1(e)
107 ldr
$t0,[sp
,#$Foff+0] @ f.lo
108 adc
$Thi,$Thi,$t1 @ T
+= Sigma1
(e
)
109 ldr
$t1,[sp
,#$Foff+4] @ f.hi
111 ldr
$t2,[sp
,#$Goff+0] @ g.lo
112 adc
$Thi,$Thi,$t3 @ T
+= h
113 ldr
$t3,[sp
,#$Goff+4] @ g.hi
116 str
$Elo,[sp
,#$Eoff+0]
118 str
$Ehi,[sp
,#$Eoff+4]
120 str
$Alo,[sp
,#$Aoff+0]
122 str
$Ahi,[sp
,#$Aoff+4]
124 ldr
$t2,[$Ktbl,#$lo] @ K[i].lo
125 eor
$t1,$t1,$t3 @ Ch
(e
,f
,g
)
126 ldr
$t3,[$Ktbl,#$hi] @ K[i].hi
129 ldr
$Elo,[sp
,#$Doff+0] @ d.lo
130 adc
$Thi,$Thi,$t1 @ T
+= Ch
(e
,f
,g
)
131 ldr
$Ehi,[sp
,#$Doff+4] @ d.hi
134 adc
$Thi,$Thi,$t3 @ T
+= K
[i
]
136 ldr
$t2,[sp
,#$Boff+0] @ b.lo
137 adc
$Ehi,$Ehi,$Thi @ d
+= T
140 ldr
$t3,[sp
,#$Coff+0] @ c.lo
142 it
eq @ Thumb2 thing
, sanity check
in ARM
145 @ Sigma0
(x
) (ROTR
((x
),28) ^ ROTR
((x
),34) ^ ROTR
((x
),39))
146 @ LO lo
>>28^hi
<<4 ^ hi
>>2^lo
<<30 ^ hi
>>7^lo
<<25
147 @ HI hi
>>28^lo
<<4 ^ lo
>>2^hi
<<30 ^ lo
>>7^hi
<<25
150 eor
$t0,$t0,$Ahi,lsl
#4
151 eor
$t1,$t1,$Alo,lsl
#4
152 eor
$t0,$t0,$Ahi,lsr
#2
153 eor
$t1,$t1,$Alo,lsr
#2
154 eor
$t0,$t0,$Alo,lsl
#30
155 eor
$t1,$t1,$Ahi,lsl
#30
156 eor
$t0,$t0,$Ahi,lsr
#7
157 eor
$t1,$t1,$Alo,lsr
#7
158 eor
$t0,$t0,$Alo,lsl
#25
159 eor
$t1,$t1,$Ahi,lsl
#25 @ Sigma0(a)
162 adc
$Thi,$Thi,$t1 @ T
+= Sigma0
(a
)
164 ldr
$t1,[sp
,#$Boff+4] @ b.hi
166 ldr
$t2,[sp
,#$Coff+4] @ c.hi
170 orr
$Alo,$Alo,$t0 @ Maj
(a
,b
,c
).lo
173 orr
$Ahi,$Ahi,$t3 @ Maj
(a
,b
,c
).hi
175 adc
$Ahi,$Ahi,$Thi @ h
+= T
182 # include "arm_arch.h"
183 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
184 # define VFP_ABI_POP vldmia sp!,{d8-d15}
186 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
187 # define __ARM_MAX_ARCH__ 7
188 # define VFP_ABI_PUSH
195 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
199 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
218 WORD64
(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
219 WORD64
(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
220 WORD64
(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
221 WORD64
(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
222 WORD64
(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
223 WORD64
(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
224 WORD64
(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
225 WORD64
(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
226 WORD64
(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
227 WORD64
(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
228 WORD64
(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
229 WORD64
(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
230 WORD64
(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
231 WORD64
(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
232 WORD64
(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
233 WORD64
(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
234 WORD64
(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
235 WORD64
(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
236 WORD64
(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
237 WORD64
(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
238 WORD64
(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
239 WORD64
(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
240 WORD64
(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
241 WORD64
(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
242 WORD64
(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
243 WORD64
(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
244 WORD64
(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
245 WORD64
(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
246 WORD64
(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
247 WORD64
(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
248 WORD64
(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
249 WORD64
(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
250 WORD64
(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
251 WORD64
(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
252 WORD64
(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
253 WORD64
(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
254 WORD64
(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
255 WORD64
(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
256 WORD64
(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
257 WORD64
(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
259 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
261 .word OPENSSL_armcap_P
-sha512_block_data_order
267 .global sha512_block_data_order
268 .type sha512_block_data_order
,%function
269 sha512_block_data_order
:
271 sub r3
,pc
,#8 @ sha512_block_data_order
273 adr r3
,sha512_block_data_order
275 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
276 ldr r12
,.LOPENSSL_armcap
277 ldr r12
,[r3
,r12
] @ OPENSSL_armcap_P
281 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
282 stmdb sp
!,{r4
-r12
,lr
}
283 sub $Ktbl,r3
,#672 @ K512
286 ldr
$Elo,[$ctx,#$Eoff+$lo]
287 ldr
$Ehi,[$ctx,#$Eoff+$hi]
288 ldr
$t0, [$ctx,#$Goff+$lo]
289 ldr
$t1, [$ctx,#$Goff+$hi]
290 ldr
$t2, [$ctx,#$Hoff+$lo]
291 ldr
$t3, [$ctx,#$Hoff+$hi]
293 str
$t0, [sp
,#$Goff+0]
294 str
$t1, [sp
,#$Goff+4]
295 str
$t2, [sp
,#$Hoff+0]
296 str
$t3, [sp
,#$Hoff+4]
297 ldr
$Alo,[$ctx,#$Aoff+$lo]
298 ldr
$Ahi,[$ctx,#$Aoff+$hi]
299 ldr
$Tlo,[$ctx,#$Boff+$lo]
300 ldr
$Thi,[$ctx,#$Boff+$hi]
301 ldr
$t0, [$ctx,#$Coff+$lo]
302 ldr
$t1, [$ctx,#$Coff+$hi]
303 ldr
$t2, [$ctx,#$Doff+$lo]
304 ldr
$t3, [$ctx,#$Doff+$hi]
305 str
$Tlo,[sp
,#$Boff+0]
306 str
$Thi,[sp
,#$Boff+4]
307 str
$t0, [sp
,#$Coff+0]
308 str
$t1, [sp
,#$Coff+4]
309 str
$t2, [sp
,#$Doff+0]
310 str
$t3, [sp
,#$Doff+4]
311 ldr
$Tlo,[$ctx,#$Foff+$lo]
312 ldr
$Thi,[$ctx,#$Foff+$hi]
313 str
$Tlo,[sp
,#$Foff+0]
314 str
$Thi,[sp
,#$Foff+4]
324 orr
$Tlo,$Tlo,$t0,lsl
#8
326 orr
$Tlo,$Tlo,$t1,lsl
#16
328 orr
$Tlo,$Tlo,$t2,lsl
#24
329 orr
$Thi,$Thi,$t3,lsl
#8
330 orr
$Thi,$Thi,$t0,lsl
#16
331 orr
$Thi,$Thi,$t1,lsl
#24
345 ldr
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
346 ldr
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
349 @ sigma0
(x
) (ROTR
((x
),1) ^ ROTR
((x
),8) ^ ((x
)>>7))
350 @ LO lo
>>1^hi
<<31 ^ lo
>>8^hi
<<24 ^ lo
>>7^hi
<<25
351 @ HI hi
>>1^lo
<<31 ^ hi
>>8^lo
<<24 ^ hi
>>7
353 ldr
$t2,[sp
,#`$Xoff+8*(16-14)`+0]
355 ldr
$t3,[sp
,#`$Xoff+8*(16-14)`+4]
356 eor
$Tlo,$Tlo,$t1,lsl
#31
357 eor
$Thi,$Thi,$t0,lsl
#31
358 eor
$Tlo,$Tlo,$t0,lsr
#8
359 eor
$Thi,$Thi,$t1,lsr
#8
360 eor
$Tlo,$Tlo,$t1,lsl
#24
361 eor
$Thi,$Thi,$t0,lsl
#24
362 eor
$Tlo,$Tlo,$t0,lsr
#7
363 eor
$Thi,$Thi,$t1,lsr
#7
364 eor
$Tlo,$Tlo,$t1,lsl
#25
366 @ sigma1
(x
) (ROTR
((x
),19) ^ ROTR
((x
),61) ^ ((x
)>>6))
367 @ LO lo
>>19^hi
<<13 ^ hi
>>29^lo
<<3 ^ lo
>>6^hi
<<26
368 @ HI hi
>>19^lo
<<13 ^ lo
>>29^hi
<<3 ^ hi
>>6
371 eor
$t0,$t0,$t3,lsl
#13
372 eor
$t1,$t1,$t2,lsl
#13
373 eor
$t0,$t0,$t3,lsr
#29
374 eor
$t1,$t1,$t2,lsr
#29
375 eor
$t0,$t0,$t2,lsl
#3
376 eor
$t1,$t1,$t3,lsl
#3
377 eor
$t0,$t0,$t2,lsr
#6
378 eor
$t1,$t1,$t3,lsr
#6
379 ldr
$t2,[sp
,#`$Xoff+8*(16-9)`+0]
380 eor
$t0,$t0,$t3,lsl
#26
382 ldr
$t3,[sp
,#`$Xoff+8*(16-9)`+4]
384 ldr
$t0,[sp
,#`$Xoff+8*16`+0]
387 ldr
$t1,[sp
,#`$Xoff+8*16`+4]
396 ittt
eq @ Thumb2 thing
, sanity check
in ARM
398 ldreq
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
399 ldreq
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
403 ldr
$Tlo,[sp
,#$Boff+0]
404 ldr
$Thi,[sp
,#$Boff+4]
405 ldr
$t0, [$ctx,#$Aoff+$lo]
406 ldr
$t1, [$ctx,#$Aoff+$hi]
407 ldr
$t2, [$ctx,#$Boff+$lo]
408 ldr
$t3, [$ctx,#$Boff+$hi]
410 str
$t0, [$ctx,#$Aoff+$lo]
412 str
$t1, [$ctx,#$Aoff+$hi]
414 str
$t2, [$ctx,#$Boff+$lo]
416 str
$t3, [$ctx,#$Boff+$hi]
418 ldr
$Alo,[sp
,#$Coff+0]
419 ldr
$Ahi,[sp
,#$Coff+4]
420 ldr
$Tlo,[sp
,#$Doff+0]
421 ldr
$Thi,[sp
,#$Doff+4]
422 ldr
$t0, [$ctx,#$Coff+$lo]
423 ldr
$t1, [$ctx,#$Coff+$hi]
424 ldr
$t2, [$ctx,#$Doff+$lo]
425 ldr
$t3, [$ctx,#$Doff+$hi]
427 str
$t0, [$ctx,#$Coff+$lo]
429 str
$t1, [$ctx,#$Coff+$hi]
431 str
$t2, [$ctx,#$Doff+$lo]
433 str
$t3, [$ctx,#$Doff+$hi]
435 ldr
$Tlo,[sp
,#$Foff+0]
436 ldr
$Thi,[sp
,#$Foff+4]
437 ldr
$t0, [$ctx,#$Eoff+$lo]
438 ldr
$t1, [$ctx,#$Eoff+$hi]
439 ldr
$t2, [$ctx,#$Foff+$lo]
440 ldr
$t3, [$ctx,#$Foff+$hi]
442 str
$Elo,[$ctx,#$Eoff+$lo]
444 str
$Ehi,[$ctx,#$Eoff+$hi]
446 str
$t2, [$ctx,#$Foff+$lo]
448 str
$t3, [$ctx,#$Foff+$hi]
450 ldr
$Alo,[sp
,#$Goff+0]
451 ldr
$Ahi,[sp
,#$Goff+4]
452 ldr
$Tlo,[sp
,#$Hoff+0]
453 ldr
$Thi,[sp
,#$Hoff+4]
454 ldr
$t0, [$ctx,#$Goff+$lo]
455 ldr
$t1, [$ctx,#$Goff+$hi]
456 ldr
$t2, [$ctx,#$Hoff+$lo]
457 ldr
$t3, [$ctx,#$Hoff+$hi]
459 str
$t0, [$ctx,#$Goff+$lo]
461 str
$t1, [$ctx,#$Goff+$hi]
463 str
$t2, [$ctx,#$Hoff+$lo]
465 str
$t3, [$ctx,#$Hoff+$hi]
473 add sp
,sp
,#8*9 @ destroy frame
475 ldmia sp
!,{r4
-r12
,pc
}
477 ldmia sp
!,{r4
-r12
,lr
}
479 moveq pc
,lr @ be binary compatible with V4
, yet
480 bx lr @ interoperable with Thumb ISA
:-)
482 .size sha512_block_data_order
,.-sha512_block_data_order
# Shift amounts for the four SHA-512 mixing functions, interpolated into the
# NEON vshr/vsli instructions below.
# Sigma0/Sigma1: three 64-bit right-rotations each.
486 my @Sigma0=(28,34,39);
487 my @Sigma1=(14,18,41);
# sigma0/sigma1: the first two entries are right-rotations, the third is a
# plain logical right shift (see the sigma0/sigma1 formula comments above).
488 my @sigma0=(1, 8, 7);
489 my @sigma1=(19,61,6);
492 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
# d0-d15: the sixteen 64-bit message words; round i loads into @X[i%16].
494 my @X=map("d$_",(0..15));
# d16-d23: the eight working variables a..h. @V is the list that gets rotated
# between rounds, while $A..$H keep naming the fixed registers.
495 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
499 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
500 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
502 $code.=<<___
if ($i<16 || $i&1);
503 vshr
.u64
$t0,$e,#@Sigma1[0] @ $i
505 vld1
.64
{@X[$i%16]},[$inp]! @ handles unaligned
507 vshr
.u64
$t1,$e,#@Sigma1[1]
509 vadd
.i64
$a,$Maj @ h
+=Maj from the past
511 vshr
.u64
$t2,$e,#@Sigma1[2]
514 vld1
.64
{$K},[$Ktbl,:64]! @ K
[i
++]
515 vsli
.64 $t0,$e,#`64-@Sigma1[0]`
516 vsli
.64 $t1,$e,#`64-@Sigma1[1]`
518 vsli
.64 $t2,$e,#`64-@Sigma1[2]`
519 #if $i<16 && defined(__ARMEL__)
520 vrev64
.8
@X[$i],@X[$i]
523 vbsl
$Ch,$f,$g @ Ch
(e
,f
,g
)
524 vshr
.u64
$t0,$a,#@Sigma0[0]
525 veor
$t2,$t1 @ Sigma1
(e
)
527 vshr
.u64
$t1,$a,#@Sigma0[1]
528 vsli
.64 $t0,$a,#`64-@Sigma0[0]`
530 vshr
.u64
$t2,$a,#@Sigma0[2]
531 vadd
.i64
$K,@X[$i%16]
532 vsli
.64 $t1,$a,#`64-@Sigma0[1]`
534 vsli
.64 $t2,$a,#`64-@Sigma0[2]`
537 vbsl
$Maj,$c,$b @ Maj
(a
,b
,c
)
538 veor
$h,$t2 @ Sigma0
(a
)
548 if ($i&1) { &NEON_00_15
($i,@_); return; }
550 # 2x-vectorized, therefore runs every 2nd round
551 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
552 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
553 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
554 my $e=@_[4]; # $e from NEON_00_15
557 vshr
.u64
$t0,@X[($i+7)%8],#@sigma1[0]
558 vshr
.u64
$t1,@X[($i+7)%8],#@sigma1[1]
559 vadd
.i64
@_[0],d30 @ h
+=Maj from the past
560 vshr
.u64
$s1,@X[($i+7)%8],#@sigma1[2]
561 vsli
.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
562 vext
.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
563 vsli
.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
565 vshr
.u64
$t0,$s0,#@sigma0[0]
566 veor
$s1,$t1 @ sigma1
(X
[i
+14])
567 vshr
.u64
$t1,$s0,#@sigma0[1]
568 vadd
.i64
@X[$i%8],$s1
569 vshr
.u64
$s1,$s0,#@sigma0[2]
570 vsli
.64 $t0,$s0,#`64-@sigma0[0]`
571 vsli
.64 $t1,$s0,#`64-@sigma0[1]`
572 vext
.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
574 vshr
.u64
$d0,$e,#@Sigma1[0] @ from NEON_00_15
575 vadd
.i64
@X[$i%8],$s0
576 vshr
.u64
$d1,$e,#@Sigma1[1] @ from NEON_00_15
577 veor
$s1,$t1 @ sigma0
(X
[i
+1])
578 vshr
.u64
$d2,$e,#@Sigma1[2] @ from NEON_00_15
579 vadd
.i64
@X[$i%8],$s1
581 &NEON_00_15
(2*$i,@_);
585 #if __ARM_MAX_ARCH__>=7
589 .global sha512_block_data_order_neon
590 .type sha512_block_data_order_neon
,%function
592 sha512_block_data_order_neon
:
594 dmb @ errata
#451034 on early Cortex A8
595 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
598 vldmia
$ctx,{$A-$H} @ load context
601 for($i=0;$i<16;$i++) { &NEON_00_15
($i,@V); unshift(@V,pop(@V)); }
607 for(;$i<32;$i++) { &NEON_16_79
($i,@V); unshift(@V,pop(@V)); }
611 vadd
.i64
$A,d30 @ h
+=Maj from the past
612 vldmia
$ctx,{d24
-d31
} @ load context to temp
613 vadd
.i64 q8
,q12 @ vectorized accumulate
617 vstmia
$ctx,{$A-$H} @ save context
619 sub $Ktbl,#640 @ rewind K512
624 .size sha512_block_data_order_neon
,.-sha512_block_data_order_neon
629 .asciz
"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
631 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
632 .comm OPENSSL_armcap_P
,4,4
# Post-process the accumulated assembler text in $code.
# 1. Evaluate every `...` span as a Perl expression and splice in the result
#    (this is how offsets such as `$Xoff+8*(16-1)` and `64-@Sigma1[0]` become
#    plain numbers).
636 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Replace the "bx lr" mnemonic with a raw .word holding its encoding.
637 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
# 3. Expand "ret" to "bx lr". This runs after step 2, so these occurrences
#    stay as a real "bx lr" instruction rather than the .word form.
638 $code =~ s/\bret\b/bx lr/gm;
643 last if (!s/^#/@/ and !/^$/);
# Closing STDOUT forces the buffered assembler text out; write errors such as
# a full disk or a broken pipe only surface here, so fail loudly instead of
# leaving a silently truncated output file.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush