# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
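# @V is "rotated" one position per round below (unshift(@V,pop(@V))), so a
# single round body can be emitted with the working variables a..h permuted
# instead of shuffling data between registers.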
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
	@ ldr	$t1,[$inp],#4			@ $i
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	orr	$t1,$t1,$t0,lsl#16
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
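	@ 0xf2 is the low byte of 0xc67178f2, the last K256 constant, so EQ
	@ is set only once the final round's constant has been consumed.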
	ldr	$t1,[$inp],#4			@ prefetch
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
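# BODY_16_XX expands the message schedule for rounds 16..63 in the 16-word
# rolling window X[] kept on the stack:
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# (the sp offsets (i+1), (i+14), (i+9) and (i+0) mod 16 in the code below
# address those four words), then falls through to the same Sigma/Ch/Maj
# sequence as BODY_00_15.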
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
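@ K256 holds the 64 SHA-256 round constants: the first 32 bits of the
@ fractional parts of the cube roots of the first 64 primes.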
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
	eor	$t3,$B,$C		@ magic
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
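# The 16 BODY_16_XX round bodies above are emitted once and then executed
# three times via .Lrounds_16_xx, covering the remaining 48 of 64 rounds.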
	ite	eq			@ Thumb2 thing, sanity check in ARM
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order

######################################################################

my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
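# AUTOLOAD turns any otherwise-undefined call such as &vshr_u32(...) into
# the matching "vshr.u32" instruction line, prefixing a bare numeric last
# argument with '#' so it is emitted as an immediate.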
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
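# NEON has no rotate instruction, so each 32-bit rotate in sigma0/sigma1 is
# synthesized as vshr (shift right) followed by vsli (shift left and
# insert). One call updates four schedule words, draining the scalar round
# fragments queued in @insns between the vector instructions.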
&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
&vshr_u32	($T2,$T0,$sigma0[0]);
&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
&vshr_u32	($T1,$T0,$sigma0[2]);
&vsli_32	($T2,$T0,32-$sigma0[0]);
&vshr_u32	($T3,$T0,$sigma0[1]);
&vsli_32	($T3,$T0,32-$sigma0[1]);
&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
&vld1_32	("{$T0}","[$Ktbl,:128]!");
&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
&vadd_i32	($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32	("{$T0}","[$Xfer,:128]!");

push(@X,shift(@X));		# "rotate" X[]
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

&vld1_32	("{$T0}","[$Ktbl,:128]!");
&vrev32_8	(@X[0],@X[0]);
&vadd_i32	($T0,$T0,@X[0]);
foreach (@insns) { eval; }	# remaining instructions
&vst1_32	("{$T0}","[$Xfer,:128]!");

push(@X,shift(@X));		# "rotate" X[]
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
'&eor	($t2,$a,$b)',			# a^b, b^c in next round
'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
'&ldr	($t1,"[sp,#64]")		if ($j==31)',
'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
'&add	($d,$d,$h)',			# d+=h
'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
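# Each evaluation of the fragments above retires one scalar round; $j counts
# rounds, and swapping ($t2,$t3) lets the a^b computed this round serve as
# the b^c operand of the next round's Maj.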
#if __ARM_MAX_ARCH__>=7

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	bic	$H,$H,#15		@ align for 128-bit stores
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
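	@ the first 16 rounds' K256[i]+X[i] sums are staged via $Xfer into
	@ the stack area that the scalar round fragments read back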
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
	teq	$t1,#0				@ check for K256 terminator
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0			@ accumulate
	ldreq	sp,[sp,#76]			@ restore original sp
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
######################################################################

my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif
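@ In Thumb-2 the two halfwords of the 32-bit NEON encoding are stored in
@ reverse order and the top byte becomes 0xff instead of 0xf3, hence d|0xc.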
.type	sha256_block_data_order_armv8,%function
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov	$ABCD_SAVE,$ABCD	@ offload
	vmov	$EFGH_SAVE,$EFGH
for($i=0;$i<12;$i++) {
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
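# Each of the 12 iterations above retires 4 rounds and extends the message
# schedule with sha256su0/sha256su1; the last 16 rounds below consume the
# final schedule words directly, so no further schedule updates are needed.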
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1
	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	vst1.32	{$ABCD,$EFGH},[$ctx]
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8

.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
last if (!s/^#/@/ and !/^$/);

my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

sub unsha256 {
my ($mnemonic,$arg)=@_;

if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				     |(($2&7)<<17)|(($2&8)<<4)
				     |(($3&7)<<1) |(($3&8)<<2);
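	# The doubling from a q-register number to its d-register encoding is
	# absorbed into the shifts: Vd lands in bits 15-12 with its top bit at
	# bit 22, Vn in bits 19-16 with top bit at bit 7, and Vm in bits 3-0
	# with top bit at bit 5.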
	# ARMv7 instructions are always encoded little-endian, so emit the
	# bytes in that order explicitly. The correct solution is to use the
	# .inst directive, but older assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
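	# 0xe12fff1e is the ARM encoding of "bx lr"; emitting the raw word
	# keeps the output assemblable with -march=armv4, where the mnemonic
	# would be rejected.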
	print $_,"\n";
}

close STDOUT;			# enforce flush