2 # SPDX-License-Identifier: GPL-2.0
4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 # has relicensed it under the GPLv2. Therefore this program is free software;
6 # you can redistribute it and/or modify it under the terms of the GNU General
7 # Public License version 2 as published by the Free Software Foundation.
9 # The original headers, including the original license headers, are
10 # included below for completeness.
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see http://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
19 # SHA512 block procedure for ARMv4. September 2007.
21 # This code is ~4.5 (four and a half) times faster than code generated
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23 # Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
28 # Cortex A8 core and ~40 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 7%
33 # improvement on Cortex A8 core and ~38 cycles per byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
43 # terms it's 22.6 cycles per byte, which is a disappointing result.
44 # Technical writers asserted that 3-way S4 pipeline can sustain
45 # multiple NEON instructions per cycle, but dual NEON issue could
46 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. On side note Cortex-A15 processes one byte in
50 # Byte order [in]dependence. =========================================
52 # Originally caller was expected to maintain specific *dword* order in
53 # h[0-7], namely with most significant dword at *lower* address, which
54 # was reflected in below two parameters as 0 and 4. Now caller is
55 # expected to maintain native byte order for whole 64-bit values.
58 # ====================================================================
# Pick the output file name from the command line: discard arguments until
# one looks like "name.ext" (a word character, then word characters or
# dashes, a dot, and an extension), or until the arguments run out.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file.  When no file name was supplied STDOUT is
# left untouched, so the generated assembly can still be captured through
# shell redirection.  Failing to create a named file is a hard error instead
# of being silently ignored, and the three-argument form of open keeps the
# file name from being interpreted as part of the mode.
if (defined($output) && length($output)) {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
63 $ctx="r0"; # parameter block
77 ############ r13 is stack pointer
79 ############ r15 is program counter
94 @ Sigma1
(x
) (ROTR
((x
),14) ^ ROTR
((x
),18) ^ ROTR
((x
),41))
95 @ LO lo
>>14^hi
<<18 ^ lo
>>18^hi
<<14 ^ hi
>>9^lo
<<23
96 @ HI hi
>>14^lo
<<18 ^ hi
>>18^lo
<<14 ^ lo
>>9^hi
<<23
98 str
$Tlo,[sp
,#$Xoff+0]
100 str
$Thi,[sp
,#$Xoff+4]
101 eor
$t0,$t0,$Ehi,lsl
#18
102 ldr
$t2,[sp
,#$Hoff+0] @ h.lo
103 eor
$t1,$t1,$Elo,lsl
#18
104 ldr
$t3,[sp
,#$Hoff+4] @ h.hi
105 eor
$t0,$t0,$Elo,lsr
#18
106 eor
$t1,$t1,$Ehi,lsr
#18
107 eor
$t0,$t0,$Ehi,lsl
#14
108 eor
$t1,$t1,$Elo,lsl
#14
109 eor
$t0,$t0,$Ehi,lsr
#9
110 eor
$t1,$t1,$Elo,lsr
#9
111 eor
$t0,$t0,$Elo,lsl
#23
112 eor
$t1,$t1,$Ehi,lsl
#23 @ Sigma1(e)
114 ldr
$t0,[sp
,#$Foff+0] @ f.lo
115 adc
$Thi,$Thi,$t1 @ T
+= Sigma1
(e
)
116 ldr
$t1,[sp
,#$Foff+4] @ f.hi
118 ldr
$t2,[sp
,#$Goff+0] @ g.lo
119 adc
$Thi,$Thi,$t3 @ T
+= h
120 ldr
$t3,[sp
,#$Goff+4] @ g.hi
123 str
$Elo,[sp
,#$Eoff+0]
125 str
$Ehi,[sp
,#$Eoff+4]
127 str
$Alo,[sp
,#$Aoff+0]
129 str
$Ahi,[sp
,#$Aoff+4]
131 ldr
$t2,[$Ktbl,#$lo] @ K[i].lo
132 eor
$t1,$t1,$t3 @ Ch
(e
,f
,g
)
133 ldr
$t3,[$Ktbl,#$hi] @ K[i].hi
136 ldr
$Elo,[sp
,#$Doff+0] @ d.lo
137 adc
$Thi,$Thi,$t1 @ T
+= Ch
(e
,f
,g
)
138 ldr
$Ehi,[sp
,#$Doff+4] @ d.hi
141 adc
$Thi,$Thi,$t3 @ T
+= K
[i
]
143 ldr
$t2,[sp
,#$Boff+0] @ b.lo
144 adc
$Ehi,$Ehi,$Thi @ d
+= T
147 ldr
$t3,[sp
,#$Coff+0] @ c.lo
149 it
eq @ Thumb2 thing
, sanity check
in ARM
152 @ Sigma0
(x
) (ROTR
((x
),28) ^ ROTR
((x
),34) ^ ROTR
((x
),39))
153 @ LO lo
>>28^hi
<<4 ^ hi
>>2^lo
<<30 ^ hi
>>7^lo
<<25
154 @ HI hi
>>28^lo
<<4 ^ lo
>>2^hi
<<30 ^ lo
>>7^hi
<<25
157 eor
$t0,$t0,$Ahi,lsl
#4
158 eor
$t1,$t1,$Alo,lsl
#4
159 eor
$t0,$t0,$Ahi,lsr
#2
160 eor
$t1,$t1,$Alo,lsr
#2
161 eor
$t0,$t0,$Alo,lsl
#30
162 eor
$t1,$t1,$Ahi,lsl
#30
163 eor
$t0,$t0,$Ahi,lsr
#7
164 eor
$t1,$t1,$Alo,lsr
#7
165 eor
$t0,$t0,$Alo,lsl
#25
166 eor
$t1,$t1,$Ahi,lsl
#25 @ Sigma0(a)
169 adc
$Thi,$Thi,$t1 @ T
+= Sigma0
(a
)
171 ldr
$t1,[sp
,#$Boff+4] @ b.hi
173 ldr
$t2,[sp
,#$Coff+4] @ c.hi
177 orr
$Alo,$Alo,$t0 @ Maj
(a
,b
,c
).lo
180 orr
$Ahi,$Ahi,$t3 @ Maj
(a
,b
,c
).hi
182 adc
$Ahi,$Ahi,$Thi @ h
+= T
189 # include "arm_arch.h"
190 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
191 # define VFP_ABI_POP vldmia sp!,{d8-d15}
193 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
194 # define __ARM_MAX_ARCH__ 7
195 # define VFP_ABI_PUSH
202 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
206 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
225 WORD64
(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
226 WORD64
(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
227 WORD64
(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
228 WORD64
(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
229 WORD64
(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
230 WORD64
(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
231 WORD64
(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
232 WORD64
(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
233 WORD64
(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
234 WORD64
(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
235 WORD64
(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
236 WORD64
(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
237 WORD64
(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
238 WORD64
(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
239 WORD64
(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
240 WORD64
(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
241 WORD64
(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
242 WORD64
(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
243 WORD64
(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
244 WORD64
(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
245 WORD64
(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
246 WORD64
(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
247 WORD64
(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
248 WORD64
(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
249 WORD64
(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
250 WORD64
(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
251 WORD64
(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
252 WORD64
(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
253 WORD64
(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
254 WORD64
(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
255 WORD64
(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
256 WORD64
(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
257 WORD64
(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
258 WORD64
(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
259 WORD64
(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
260 WORD64
(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
261 WORD64
(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
262 WORD64
(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
263 WORD64
(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
264 WORD64
(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
266 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
268 .word OPENSSL_armcap_P
-sha512_block_data_order
274 .global sha512_block_data_order
275 .type sha512_block_data_order
,%function
276 sha512_block_data_order
:
277 .Lsha512_block_data_order
:
279 sub r3
,pc
,#8 @ sha512_block_data_order
281 adr r3
,.Lsha512_block_data_order
283 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
284 ldr r12
,.LOPENSSL_armcap
285 ldr r12
,[r3
,r12
] @ OPENSSL_armcap_P
289 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
290 stmdb sp
!,{r4
-r12
,lr
}
291 sub $Ktbl,r3
,#672 @ K512
294 ldr
$Elo,[$ctx,#$Eoff+$lo]
295 ldr
$Ehi,[$ctx,#$Eoff+$hi]
296 ldr
$t0, [$ctx,#$Goff+$lo]
297 ldr
$t1, [$ctx,#$Goff+$hi]
298 ldr
$t2, [$ctx,#$Hoff+$lo]
299 ldr
$t3, [$ctx,#$Hoff+$hi]
301 str
$t0, [sp
,#$Goff+0]
302 str
$t1, [sp
,#$Goff+4]
303 str
$t2, [sp
,#$Hoff+0]
304 str
$t3, [sp
,#$Hoff+4]
305 ldr
$Alo,[$ctx,#$Aoff+$lo]
306 ldr
$Ahi,[$ctx,#$Aoff+$hi]
307 ldr
$Tlo,[$ctx,#$Boff+$lo]
308 ldr
$Thi,[$ctx,#$Boff+$hi]
309 ldr
$t0, [$ctx,#$Coff+$lo]
310 ldr
$t1, [$ctx,#$Coff+$hi]
311 ldr
$t2, [$ctx,#$Doff+$lo]
312 ldr
$t3, [$ctx,#$Doff+$hi]
313 str
$Tlo,[sp
,#$Boff+0]
314 str
$Thi,[sp
,#$Boff+4]
315 str
$t0, [sp
,#$Coff+0]
316 str
$t1, [sp
,#$Coff+4]
317 str
$t2, [sp
,#$Doff+0]
318 str
$t3, [sp
,#$Doff+4]
319 ldr
$Tlo,[$ctx,#$Foff+$lo]
320 ldr
$Thi,[$ctx,#$Foff+$hi]
321 str
$Tlo,[sp
,#$Foff+0]
322 str
$Thi,[sp
,#$Foff+4]
332 orr
$Tlo,$Tlo,$t0,lsl
#8
334 orr
$Tlo,$Tlo,$t1,lsl
#16
336 orr
$Tlo,$Tlo,$t2,lsl
#24
337 orr
$Thi,$Thi,$t3,lsl
#8
338 orr
$Thi,$Thi,$t0,lsl
#16
339 orr
$Thi,$Thi,$t1,lsl
#24
353 ldr
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
354 ldr
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
357 @ sigma0
(x
) (ROTR
((x
),1) ^ ROTR
((x
),8) ^ ((x
)>>7))
358 @ LO lo
>>1^hi
<<31 ^ lo
>>8^hi
<<24 ^ lo
>>7^hi
<<25
359 @ HI hi
>>1^lo
<<31 ^ hi
>>8^lo
<<24 ^ hi
>>7
361 ldr
$t2,[sp
,#`$Xoff+8*(16-14)`+0]
363 ldr
$t3,[sp
,#`$Xoff+8*(16-14)`+4]
364 eor
$Tlo,$Tlo,$t1,lsl
#31
365 eor
$Thi,$Thi,$t0,lsl
#31
366 eor
$Tlo,$Tlo,$t0,lsr
#8
367 eor
$Thi,$Thi,$t1,lsr
#8
368 eor
$Tlo,$Tlo,$t1,lsl
#24
369 eor
$Thi,$Thi,$t0,lsl
#24
370 eor
$Tlo,$Tlo,$t0,lsr
#7
371 eor
$Thi,$Thi,$t1,lsr
#7
372 eor
$Tlo,$Tlo,$t1,lsl
#25
374 @ sigma1
(x
) (ROTR
((x
),19) ^ ROTR
((x
),61) ^ ((x
)>>6))
375 @ LO lo
>>19^hi
<<13 ^ hi
>>29^lo
<<3 ^ lo
>>6^hi
<<26
376 @ HI hi
>>19^lo
<<13 ^ lo
>>29^hi
<<3 ^ hi
>>6
379 eor
$t0,$t0,$t3,lsl
#13
380 eor
$t1,$t1,$t2,lsl
#13
381 eor
$t0,$t0,$t3,lsr
#29
382 eor
$t1,$t1,$t2,lsr
#29
383 eor
$t0,$t0,$t2,lsl
#3
384 eor
$t1,$t1,$t3,lsl
#3
385 eor
$t0,$t0,$t2,lsr
#6
386 eor
$t1,$t1,$t3,lsr
#6
387 ldr
$t2,[sp
,#`$Xoff+8*(16-9)`+0]
388 eor
$t0,$t0,$t3,lsl
#26
390 ldr
$t3,[sp
,#`$Xoff+8*(16-9)`+4]
392 ldr
$t0,[sp
,#`$Xoff+8*16`+0]
395 ldr
$t1,[sp
,#`$Xoff+8*16`+4]
404 ittt
eq @ Thumb2 thing
, sanity check
in ARM
406 ldreq
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
407 ldreq
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
411 ldr
$Tlo,[sp
,#$Boff+0]
412 ldr
$Thi,[sp
,#$Boff+4]
413 ldr
$t0, [$ctx,#$Aoff+$lo]
414 ldr
$t1, [$ctx,#$Aoff+$hi]
415 ldr
$t2, [$ctx,#$Boff+$lo]
416 ldr
$t3, [$ctx,#$Boff+$hi]
418 str
$t0, [$ctx,#$Aoff+$lo]
420 str
$t1, [$ctx,#$Aoff+$hi]
422 str
$t2, [$ctx,#$Boff+$lo]
424 str
$t3, [$ctx,#$Boff+$hi]
426 ldr
$Alo,[sp
,#$Coff+0]
427 ldr
$Ahi,[sp
,#$Coff+4]
428 ldr
$Tlo,[sp
,#$Doff+0]
429 ldr
$Thi,[sp
,#$Doff+4]
430 ldr
$t0, [$ctx,#$Coff+$lo]
431 ldr
$t1, [$ctx,#$Coff+$hi]
432 ldr
$t2, [$ctx,#$Doff+$lo]
433 ldr
$t3, [$ctx,#$Doff+$hi]
435 str
$t0, [$ctx,#$Coff+$lo]
437 str
$t1, [$ctx,#$Coff+$hi]
439 str
$t2, [$ctx,#$Doff+$lo]
441 str
$t3, [$ctx,#$Doff+$hi]
443 ldr
$Tlo,[sp
,#$Foff+0]
444 ldr
$Thi,[sp
,#$Foff+4]
445 ldr
$t0, [$ctx,#$Eoff+$lo]
446 ldr
$t1, [$ctx,#$Eoff+$hi]
447 ldr
$t2, [$ctx,#$Foff+$lo]
448 ldr
$t3, [$ctx,#$Foff+$hi]
450 str
$Elo,[$ctx,#$Eoff+$lo]
452 str
$Ehi,[$ctx,#$Eoff+$hi]
454 str
$t2, [$ctx,#$Foff+$lo]
456 str
$t3, [$ctx,#$Foff+$hi]
458 ldr
$Alo,[sp
,#$Goff+0]
459 ldr
$Ahi,[sp
,#$Goff+4]
460 ldr
$Tlo,[sp
,#$Hoff+0]
461 ldr
$Thi,[sp
,#$Hoff+4]
462 ldr
$t0, [$ctx,#$Goff+$lo]
463 ldr
$t1, [$ctx,#$Goff+$hi]
464 ldr
$t2, [$ctx,#$Hoff+$lo]
465 ldr
$t3, [$ctx,#$Hoff+$hi]
467 str
$t0, [$ctx,#$Goff+$lo]
469 str
$t1, [$ctx,#$Goff+$hi]
471 str
$t2, [$ctx,#$Hoff+$lo]
473 str
$t3, [$ctx,#$Hoff+$hi]
481 add sp
,sp
,#8*9 @ destroy frame
483 ldmia sp
!,{r4
-r12
,pc
}
485 ldmia sp
!,{r4
-r12
,lr
}
487 moveq pc
,lr @ be binary compatible with V4
, yet
488 bx lr @ interoperable with Thumb ISA
:-)
490 .size sha512_block_data_order
,.-sha512_block_data_order
# Rotate/shift amounts used by the NEON code path, one list per SHA-512
# mixing function.  The upper-case pair are pure rotations:
#   Sigma0(x) = ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39)
#   Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)
# For the lower-case pair the last entry is a plain right shift:
#   sigma0(x) = ROTR(x,1)  ^ ROTR(x,8)  ^ (x>>7)
#   sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x>>6)
my @Sigma0 = (28, 34, 39);
my @Sigma1 = (14, 18, 41);
my @sigma0 = ( 1,  8,  7);
my @sigma1 = (19, 61,  6);
500 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
502 my @X=map("d$_",(0..15));
503 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
507 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
508 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
510 $code.=<<___
if ($i<16 || $i&1);
511 vshr
.u64
$t0,$e,#@Sigma1[0] @ $i
513 vld1
.64
{@X[$i%16]},[$inp]! @ handles unaligned
515 vshr
.u64
$t1,$e,#@Sigma1[1]
517 vadd
.i64
$a,$Maj @ h
+=Maj from the past
519 vshr
.u64
$t2,$e,#@Sigma1[2]
522 vld1
.64
{$K},[$Ktbl,:64]! @ K
[i
++]
523 vsli
.64 $t0,$e,#`64-@Sigma1[0]`
524 vsli
.64 $t1,$e,#`64-@Sigma1[1]`
526 vsli
.64 $t2,$e,#`64-@Sigma1[2]`
527 #if $i<16 && defined(__ARMEL__)
528 vrev64
.8
@X[$i],@X[$i]
531 vbsl
$Ch,$f,$g @ Ch
(e
,f
,g
)
532 vshr
.u64
$t0,$a,#@Sigma0[0]
533 veor
$t2,$t1 @ Sigma1
(e
)
535 vshr
.u64
$t1,$a,#@Sigma0[1]
536 vsli
.64 $t0,$a,#`64-@Sigma0[0]`
538 vshr
.u64
$t2,$a,#@Sigma0[2]
539 vadd
.i64
$K,@X[$i%16]
540 vsli
.64 $t1,$a,#`64-@Sigma0[1]`
542 vsli
.64 $t2,$a,#`64-@Sigma0[2]`
545 vbsl
$Maj,$c,$b @ Maj
(a
,b
,c
)
546 veor
$h,$t2 @ Sigma0
(a
)
556 if ($i&1) { &NEON_00_15
($i,@_); return; }
558 # 2x-vectorized, therefore runs every 2nd round
559 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
560 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
561 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
562 my $e=@_[4]; # $e from NEON_00_15
565 vshr
.u64
$t0,@X[($i+7)%8],#@sigma1[0]
566 vshr
.u64
$t1,@X[($i+7)%8],#@sigma1[1]
567 vadd
.i64
@_[0],d30 @ h
+=Maj from the past
568 vshr
.u64
$s1,@X[($i+7)%8],#@sigma1[2]
569 vsli
.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
570 vext
.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
571 vsli
.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
573 vshr
.u64
$t0,$s0,#@sigma0[0]
574 veor
$s1,$t1 @ sigma1
(X
[i
+14])
575 vshr
.u64
$t1,$s0,#@sigma0[1]
576 vadd
.i64
@X[$i%8],$s1
577 vshr
.u64
$s1,$s0,#@sigma0[2]
578 vsli
.64 $t0,$s0,#`64-@sigma0[0]`
579 vsli
.64 $t1,$s0,#`64-@sigma0[1]`
580 vext
.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
582 vshr
.u64
$d0,$e,#@Sigma1[0] @ from NEON_00_15
583 vadd
.i64
@X[$i%8],$s0
584 vshr
.u64
$d1,$e,#@Sigma1[1] @ from NEON_00_15
585 veor
$s1,$t1 @ sigma0
(X
[i
+1])
586 vshr
.u64
$d2,$e,#@Sigma1[2] @ from NEON_00_15
587 vadd
.i64
@X[$i%8],$s1
589 &NEON_00_15
(2*$i,@_);
593 #if __ARM_MAX_ARCH__>=7
597 .global sha512_block_data_order_neon
598 .type sha512_block_data_order_neon
,%function
600 sha512_block_data_order_neon
:
602 dmb @ errata
#451034 on early Cortex A8
603 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
606 vldmia
$ctx,{$A-$H} @ load context
609 for($i=0;$i<16;$i++) { &NEON_00_15
($i,@V); unshift(@V,pop(@V)); }
615 for(;$i<32;$i++) { &NEON_16_79
($i,@V); unshift(@V,pop(@V)); }
619 vadd
.i64
$A,d30 @ h
+=Maj from the past
620 vldmia
$ctx,{d24
-d31
} @ load context to temp
621 vadd
.i64 q8
,q12 @ vectorized accumulate
625 vstmia
$ctx,{$A-$H} @ save context
627 sub $Ktbl,#640 @ rewind K512
632 .size sha512_block_data_order_neon
,.-sha512_block_data_order_neon
637 .asciz
"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
639 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
640 .comm OPENSSL_armcap_P
,4,4
# Final clean-up passes over the generated assembly text.  The order of the
# three substitutions matters and must be kept.
for ($code) {
    # Evaluate every `...` expression embedded in the text and splice the
    # computed value in its place.
    s/\`([^\`]*)\`/eval $1/gem;

    # Emit "bx lr" as a raw instruction word to make it possible to compile
    # with -march=armv4.
    s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;

    # Expand the "ret" shorthand.  Because this runs after the pass above,
    # the "bx lr" produced here is left as a mnemonic.
    s/\bret\b/bx lr/gm;
}
651 last if (!s/^#/@/ and !/^$/);
# Close STDOUT explicitly to enforce a flush, and treat a failure as fatal:
# buffered write errors (for example a full disk) only surface here, and
# ignoring them would leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";