2 # SPDX-License-Identifier: GPL-2.0
4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 # has relicensed it under the GPLv2. Therefore this program is free software;
6 # you can redistribute it and/or modify it under the terms of the GNU General
7 # Public License version 2 as published by the Free Software Foundation.
9 # The original headers, including the original license headers, are
10 # included below for completeness.
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see http://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
19 # SHA512 block procedure for ARMv4. September 2007.
21 # This code is ~4.5 (four and a half) times faster than code generated
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23 # Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
28 # Cortex A8 core and ~40 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 7%
33 # improvement on Cortex A8 core and ~38 cycles per byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
43 # terms it's 22.6 cycles per byte, which is a disappointing result.
44 # Technical writers asserted that 3-way S4 pipeline can sustain
45 # multiple NEON instructions per cycle, but dual NEON issue could
46 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. On side note Cortex-A15 processes one byte in
50 # Byte order [in]dependence. =========================================
52 # Originally caller was expected to maintain specific *dword* order in
53 # h[0-7], namely with most significant dword at *lower* address, which
54 # was reflected in below two parameters as 0 and 4. Now caller is
55 # expected to maintain native byte order for whole 64-bit values.
58 # ====================================================================
# Consume command-line arguments until one looks like a plain file name
# (a word character, then word characters or '-', a dot, and an extension).
# The loop ends with that argument in $output, or with a false value once
# the arguments are exhausted or a false argument is met.
60 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Point STDOUT at that file so the generated text is written there. The
# result of open is deliberately left unchecked here, so a missing or
# unopenable name is not reported at this point.
61 open STDOUT
,">$output";
63 $ctx="r0"; # parameter block
77 ############ r13 is stack pointer
79 ############ r15 is program counter
94 @ Sigma1
(x
) (ROTR
((x
),14) ^ ROTR
((x
),18) ^ ROTR
((x
),41))
95 @ LO lo
>>14^hi
<<18 ^ lo
>>18^hi
<<14 ^ hi
>>9^lo
<<23
96 @ HI hi
>>14^lo
<<18 ^ hi
>>18^lo
<<14 ^ lo
>>9^hi
<<23
98 str
$Tlo,[sp
,#$Xoff+0]
100 str
$Thi,[sp
,#$Xoff+4]
101 eor
$t0,$t0,$Ehi,lsl
#18
102 ldr
$t2,[sp
,#$Hoff+0] @ h.lo
103 eor
$t1,$t1,$Elo,lsl
#18
104 ldr
$t3,[sp
,#$Hoff+4] @ h.hi
105 eor
$t0,$t0,$Elo,lsr
#18
106 eor
$t1,$t1,$Ehi,lsr
#18
107 eor
$t0,$t0,$Ehi,lsl
#14
108 eor
$t1,$t1,$Elo,lsl
#14
109 eor
$t0,$t0,$Ehi,lsr
#9
110 eor
$t1,$t1,$Elo,lsr
#9
111 eor
$t0,$t0,$Elo,lsl
#23
112 eor
$t1,$t1,$Ehi,lsl
#23 @ Sigma1(e)
114 ldr
$t0,[sp
,#$Foff+0] @ f.lo
115 adc
$Thi,$Thi,$t1 @ T
+= Sigma1
(e
)
116 ldr
$t1,[sp
,#$Foff+4] @ f.hi
118 ldr
$t2,[sp
,#$Goff+0] @ g.lo
119 adc
$Thi,$Thi,$t3 @ T
+= h
120 ldr
$t3,[sp
,#$Goff+4] @ g.hi
123 str
$Elo,[sp
,#$Eoff+0]
125 str
$Ehi,[sp
,#$Eoff+4]
127 str
$Alo,[sp
,#$Aoff+0]
129 str
$Ahi,[sp
,#$Aoff+4]
131 ldr
$t2,[$Ktbl,#$lo] @ K[i].lo
132 eor
$t1,$t1,$t3 @ Ch
(e
,f
,g
)
133 ldr
$t3,[$Ktbl,#$hi] @ K[i].hi
136 ldr
$Elo,[sp
,#$Doff+0] @ d.lo
137 adc
$Thi,$Thi,$t1 @ T
+= Ch
(e
,f
,g
)
138 ldr
$Ehi,[sp
,#$Doff+4] @ d.hi
141 adc
$Thi,$Thi,$t3 @ T
+= K
[i
]
143 ldr
$t2,[sp
,#$Boff+0] @ b.lo
144 adc
$Ehi,$Ehi,$Thi @ d
+= T
147 ldr
$t3,[sp
,#$Coff+0] @ c.lo
149 it
eq @ Thumb2 thing
, sanity check
in ARM
152 @ Sigma0
(x
) (ROTR
((x
),28) ^ ROTR
((x
),34) ^ ROTR
((x
),39))
153 @ LO lo
>>28^hi
<<4 ^ hi
>>2^lo
<<30 ^ hi
>>7^lo
<<25
154 @ HI hi
>>28^lo
<<4 ^ lo
>>2^hi
<<30 ^ lo
>>7^hi
<<25
157 eor
$t0,$t0,$Ahi,lsl
#4
158 eor
$t1,$t1,$Alo,lsl
#4
159 eor
$t0,$t0,$Ahi,lsr
#2
160 eor
$t1,$t1,$Alo,lsr
#2
161 eor
$t0,$t0,$Alo,lsl
#30
162 eor
$t1,$t1,$Ahi,lsl
#30
163 eor
$t0,$t0,$Ahi,lsr
#7
164 eor
$t1,$t1,$Alo,lsr
#7
165 eor
$t0,$t0,$Alo,lsl
#25
166 eor
$t1,$t1,$Ahi,lsl
#25 @ Sigma0(a)
169 adc
$Thi,$Thi,$t1 @ T
+= Sigma0
(a
)
171 ldr
$t1,[sp
,#$Boff+4] @ b.hi
173 ldr
$t2,[sp
,#$Coff+4] @ c.hi
177 orr
$Alo,$Alo,$t0 @ Maj
(a
,b
,c
).lo
180 orr
$Ahi,$Ahi,$t3 @ Maj
(a
,b
,c
).hi
182 adc
$Ahi,$Ahi,$Thi @ h
+= T
189 # include "arm_arch.h"
190 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
191 # define VFP_ABI_POP vldmia sp!,{d8-d15}
193 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
194 # define __ARM_MAX_ARCH__ 7
195 # define VFP_ABI_PUSH
202 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
206 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
225 WORD64
(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
226 WORD64
(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
227 WORD64
(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
228 WORD64
(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
229 WORD64
(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
230 WORD64
(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
231 WORD64
(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
232 WORD64
(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
233 WORD64
(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
234 WORD64
(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
235 WORD64
(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
236 WORD64
(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
237 WORD64
(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
238 WORD64
(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
239 WORD64
(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
240 WORD64
(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
241 WORD64
(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
242 WORD64
(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
243 WORD64
(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
244 WORD64
(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
245 WORD64
(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
246 WORD64
(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
247 WORD64
(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
248 WORD64
(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
249 WORD64
(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
250 WORD64
(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
251 WORD64
(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
252 WORD64
(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
253 WORD64
(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
254 WORD64
(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
255 WORD64
(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
256 WORD64
(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
257 WORD64
(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
258 WORD64
(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
259 WORD64
(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
260 WORD64
(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
261 WORD64
(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
262 WORD64
(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
263 WORD64
(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
264 WORD64
(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
266 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
268 .word OPENSSL_armcap_P
-sha512_block_data_order
274 .global sha512_block_data_order
275 .type sha512_block_data_order
,%function
276 sha512_block_data_order
:
278 sub r3
,pc
,#8 @ sha512_block_data_order
280 adr r3
,sha512_block_data_order
282 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
283 ldr r12
,.LOPENSSL_armcap
284 ldr r12
,[r3
,r12
] @ OPENSSL_armcap_P
288 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
289 stmdb sp
!,{r4
-r12
,lr
}
290 sub $Ktbl,r3
,#672 @ K512
293 ldr
$Elo,[$ctx,#$Eoff+$lo]
294 ldr
$Ehi,[$ctx,#$Eoff+$hi]
295 ldr
$t0, [$ctx,#$Goff+$lo]
296 ldr
$t1, [$ctx,#$Goff+$hi]
297 ldr
$t2, [$ctx,#$Hoff+$lo]
298 ldr
$t3, [$ctx,#$Hoff+$hi]
300 str
$t0, [sp
,#$Goff+0]
301 str
$t1, [sp
,#$Goff+4]
302 str
$t2, [sp
,#$Hoff+0]
303 str
$t3, [sp
,#$Hoff+4]
304 ldr
$Alo,[$ctx,#$Aoff+$lo]
305 ldr
$Ahi,[$ctx,#$Aoff+$hi]
306 ldr
$Tlo,[$ctx,#$Boff+$lo]
307 ldr
$Thi,[$ctx,#$Boff+$hi]
308 ldr
$t0, [$ctx,#$Coff+$lo]
309 ldr
$t1, [$ctx,#$Coff+$hi]
310 ldr
$t2, [$ctx,#$Doff+$lo]
311 ldr
$t3, [$ctx,#$Doff+$hi]
312 str
$Tlo,[sp
,#$Boff+0]
313 str
$Thi,[sp
,#$Boff+4]
314 str
$t0, [sp
,#$Coff+0]
315 str
$t1, [sp
,#$Coff+4]
316 str
$t2, [sp
,#$Doff+0]
317 str
$t3, [sp
,#$Doff+4]
318 ldr
$Tlo,[$ctx,#$Foff+$lo]
319 ldr
$Thi,[$ctx,#$Foff+$hi]
320 str
$Tlo,[sp
,#$Foff+0]
321 str
$Thi,[sp
,#$Foff+4]
331 orr
$Tlo,$Tlo,$t0,lsl
#8
333 orr
$Tlo,$Tlo,$t1,lsl
#16
335 orr
$Tlo,$Tlo,$t2,lsl
#24
336 orr
$Thi,$Thi,$t3,lsl
#8
337 orr
$Thi,$Thi,$t0,lsl
#16
338 orr
$Thi,$Thi,$t1,lsl
#24
352 ldr
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
353 ldr
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
356 @ sigma0
(x
) (ROTR
((x
),1) ^ ROTR
((x
),8) ^ ((x
)>>7))
357 @ LO lo
>>1^hi
<<31 ^ lo
>>8^hi
<<24 ^ lo
>>7^hi
<<25
358 @ HI hi
>>1^lo
<<31 ^ hi
>>8^lo
<<24 ^ hi
>>7
360 ldr
$t2,[sp
,#`$Xoff+8*(16-14)`+0]
362 ldr
$t3,[sp
,#`$Xoff+8*(16-14)`+4]
363 eor
$Tlo,$Tlo,$t1,lsl
#31
364 eor
$Thi,$Thi,$t0,lsl
#31
365 eor
$Tlo,$Tlo,$t0,lsr
#8
366 eor
$Thi,$Thi,$t1,lsr
#8
367 eor
$Tlo,$Tlo,$t1,lsl
#24
368 eor
$Thi,$Thi,$t0,lsl
#24
369 eor
$Tlo,$Tlo,$t0,lsr
#7
370 eor
$Thi,$Thi,$t1,lsr
#7
371 eor
$Tlo,$Tlo,$t1,lsl
#25
373 @ sigma1
(x
) (ROTR
((x
),19) ^ ROTR
((x
),61) ^ ((x
)>>6))
374 @ LO lo
>>19^hi
<<13 ^ hi
>>29^lo
<<3 ^ lo
>>6^hi
<<26
375 @ HI hi
>>19^lo
<<13 ^ lo
>>29^hi
<<3 ^ hi
>>6
378 eor
$t0,$t0,$t3,lsl
#13
379 eor
$t1,$t1,$t2,lsl
#13
380 eor
$t0,$t0,$t3,lsr
#29
381 eor
$t1,$t1,$t2,lsr
#29
382 eor
$t0,$t0,$t2,lsl
#3
383 eor
$t1,$t1,$t3,lsl
#3
384 eor
$t0,$t0,$t2,lsr
#6
385 eor
$t1,$t1,$t3,lsr
#6
386 ldr
$t2,[sp
,#`$Xoff+8*(16-9)`+0]
387 eor
$t0,$t0,$t3,lsl
#26
389 ldr
$t3,[sp
,#`$Xoff+8*(16-9)`+4]
391 ldr
$t0,[sp
,#`$Xoff+8*16`+0]
394 ldr
$t1,[sp
,#`$Xoff+8*16`+4]
403 ittt
eq @ Thumb2 thing
, sanity check
in ARM
405 ldreq
$t0,[sp
,#`$Xoff+8*(16-1)`+0]
406 ldreq
$t1,[sp
,#`$Xoff+8*(16-1)`+4]
410 ldr
$Tlo,[sp
,#$Boff+0]
411 ldr
$Thi,[sp
,#$Boff+4]
412 ldr
$t0, [$ctx,#$Aoff+$lo]
413 ldr
$t1, [$ctx,#$Aoff+$hi]
414 ldr
$t2, [$ctx,#$Boff+$lo]
415 ldr
$t3, [$ctx,#$Boff+$hi]
417 str
$t0, [$ctx,#$Aoff+$lo]
419 str
$t1, [$ctx,#$Aoff+$hi]
421 str
$t2, [$ctx,#$Boff+$lo]
423 str
$t3, [$ctx,#$Boff+$hi]
425 ldr
$Alo,[sp
,#$Coff+0]
426 ldr
$Ahi,[sp
,#$Coff+4]
427 ldr
$Tlo,[sp
,#$Doff+0]
428 ldr
$Thi,[sp
,#$Doff+4]
429 ldr
$t0, [$ctx,#$Coff+$lo]
430 ldr
$t1, [$ctx,#$Coff+$hi]
431 ldr
$t2, [$ctx,#$Doff+$lo]
432 ldr
$t3, [$ctx,#$Doff+$hi]
434 str
$t0, [$ctx,#$Coff+$lo]
436 str
$t1, [$ctx,#$Coff+$hi]
438 str
$t2, [$ctx,#$Doff+$lo]
440 str
$t3, [$ctx,#$Doff+$hi]
442 ldr
$Tlo,[sp
,#$Foff+0]
443 ldr
$Thi,[sp
,#$Foff+4]
444 ldr
$t0, [$ctx,#$Eoff+$lo]
445 ldr
$t1, [$ctx,#$Eoff+$hi]
446 ldr
$t2, [$ctx,#$Foff+$lo]
447 ldr
$t3, [$ctx,#$Foff+$hi]
449 str
$Elo,[$ctx,#$Eoff+$lo]
451 str
$Ehi,[$ctx,#$Eoff+$hi]
453 str
$t2, [$ctx,#$Foff+$lo]
455 str
$t3, [$ctx,#$Foff+$hi]
457 ldr
$Alo,[sp
,#$Goff+0]
458 ldr
$Ahi,[sp
,#$Goff+4]
459 ldr
$Tlo,[sp
,#$Hoff+0]
460 ldr
$Thi,[sp
,#$Hoff+4]
461 ldr
$t0, [$ctx,#$Goff+$lo]
462 ldr
$t1, [$ctx,#$Goff+$hi]
463 ldr
$t2, [$ctx,#$Hoff+$lo]
464 ldr
$t3, [$ctx,#$Hoff+$hi]
466 str
$t0, [$ctx,#$Goff+$lo]
468 str
$t1, [$ctx,#$Goff+$hi]
470 str
$t2, [$ctx,#$Hoff+$lo]
472 str
$t3, [$ctx,#$Hoff+$hi]
480 add sp
,sp
,#8*9 @ destroy frame
482 ldmia sp
!,{r4
-r12
,pc
}
484 ldmia sp
!,{r4
-r12
,lr
}
486 moveq pc
,lr @ be binary compatible with V4
, yet
487 bx lr @ interoperable with Thumb ISA
:-)
489 .size sha512_block_data_order
,.-sha512_block_data_order
# Bit counts used by the SHA-512 mixing functions, matching the formulas
# quoted in the assembly comments of this file:
#   Sigma0/Sigma1 - three rotate-right amounts each;
#   sigma0/sigma1 - two rotate-right amounts followed by one plain
#                   right-shift amount (the last element).
493 my @Sigma0=(28,34,39);
494 my @Sigma1=(14,18,41);
495 my @sigma0=(1, 8, 7);
496 my @sigma1=(19,61,6);
499 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
# Register names d0..d15; the NEON round code loads input words into
# @X[$i%16].
501 my @X=map("d$_",(0..15));
# Register names d16..d23, bound both to the individual names $A..$H and to
# the list @V that is rotated between rounds; the NEON code loads the
# context into $A-$H.
502 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
506 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
507 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
509 $code.=<<___
if ($i<16 || $i&1);
510 vshr
.u64
$t0,$e,#@Sigma1[0] @ $i
512 vld1
.64
{@X[$i%16]},[$inp]! @ handles unaligned
514 vshr
.u64
$t1,$e,#@Sigma1[1]
516 vadd
.i64
$a,$Maj @ h
+=Maj from the past
518 vshr
.u64
$t2,$e,#@Sigma1[2]
521 vld1
.64
{$K},[$Ktbl,:64]! @ K
[i
++]
522 vsli
.64 $t0,$e,#`64-@Sigma1[0]`
523 vsli
.64 $t1,$e,#`64-@Sigma1[1]`
525 vsli
.64 $t2,$e,#`64-@Sigma1[2]`
526 #if $i<16 && defined(__ARMEL__)
527 vrev64
.8
@X[$i],@X[$i]
530 vbsl
$Ch,$f,$g @ Ch
(e
,f
,g
)
531 vshr
.u64
$t0,$a,#@Sigma0[0]
532 veor
$t2,$t1 @ Sigma1
(e
)
534 vshr
.u64
$t1,$a,#@Sigma0[1]
535 vsli
.64 $t0,$a,#`64-@Sigma0[0]`
537 vshr
.u64
$t2,$a,#@Sigma0[2]
538 vadd
.i64
$K,@X[$i%16]
539 vsli
.64 $t1,$a,#`64-@Sigma0[1]`
541 vsli
.64 $t2,$a,#`64-@Sigma0[2]`
544 vbsl
$Maj,$c,$b @ Maj
(a
,b
,c
)
545 veor
$h,$t2 @ Sigma0
(a
)
555 if ($i&1) { &NEON_00_15
($i,@_); return; }
557 # 2x-vectorized, therefore runs every 2nd round
558 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
559 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
560 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
561 my $e=@_[4]; # $e from NEON_00_15
564 vshr
.u64
$t0,@X[($i+7)%8],#@sigma1[0]
565 vshr
.u64
$t1,@X[($i+7)%8],#@sigma1[1]
566 vadd
.i64
@_[0],d30 @ h
+=Maj from the past
567 vshr
.u64
$s1,@X[($i+7)%8],#@sigma1[2]
568 vsli
.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
569 vext
.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
570 vsli
.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
572 vshr
.u64
$t0,$s0,#@sigma0[0]
573 veor
$s1,$t1 @ sigma1
(X
[i
+14])
574 vshr
.u64
$t1,$s0,#@sigma0[1]
575 vadd
.i64
@X[$i%8],$s1
576 vshr
.u64
$s1,$s0,#@sigma0[2]
577 vsli
.64 $t0,$s0,#`64-@sigma0[0]`
578 vsli
.64 $t1,$s0,#`64-@sigma0[1]`
579 vext
.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
581 vshr
.u64
$d0,$e,#@Sigma1[0] @ from NEON_00_15
582 vadd
.i64
@X[$i%8],$s0
583 vshr
.u64
$d1,$e,#@Sigma1[1] @ from NEON_00_15
584 veor
$s1,$t1 @ sigma0
(X
[i
+1])
585 vshr
.u64
$d2,$e,#@Sigma1[2] @ from NEON_00_15
586 vadd
.i64
@X[$i%8],$s1
588 &NEON_00_15
(2*$i,@_);
592 #if __ARM_MAX_ARCH__>=7
596 .global sha512_block_data_order_neon
597 .type sha512_block_data_order_neon
,%function
599 sha512_block_data_order_neon
:
601 dmb @ errata
#451034 on early Cortex A8
602 add
$len,$inp,$len,lsl
#7 @ len to point at the end of inp
605 vldmia
$ctx,{$A-$H} @ load context
# Call NEON_00_15 for $i = 0..15. After each call @V is rotated right by
# one: its last element is moved to the front, so the register names shift
# position for the next round.
608 for($i=0;$i<16;$i++) { &NEON_00_15
($i,@V); unshift(@V,pop(@V)); }
# Continue from the current value of $i (no re-initialisation) and call
# NEON_16_79 for $i up to 31, with the same rotation of @V after each call.
614 for(;$i<32;$i++) { &NEON_16_79
($i,@V); unshift(@V,pop(@V)); }
618 vadd
.i64
$A,d30 @ h
+=Maj from the past
619 vldmia
$ctx,{d24
-d31
} @ load context to temp
620 vadd
.i64 q8
,q12 @ vectorized accumulate
624 vstmia
$ctx,{$A-$H} @ save context
626 sub $Ktbl,#640 @ rewind K512
631 .size sha512_block_data_order_neon
,.-sha512_block_data_order_neon
636 .asciz
"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
638 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
639 .comm OPENSSL_armcap_P
,4,4
# Replace every backtick-quoted span in the generated text with the result
# of evaluating its contents as a Perl expression (used for computed
# offsets and shift counts).
643 $code =~ s/\`([^\`]*)\`/eval $1/gem;
644 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
# Expand the "ret" shorthand to "bx lr". This runs after the substitution
# above, so these occurrences remain as literal "bx lr" text rather than
# being turned into the .word form.
645 $code =~ s/\bret\b/bx lr/gm;
650 last if (!s/^#/@/ and !/^$/);
656 close STDOUT
; # enforce flush