gro: Allow tunnel stacking in the case of FOU/GUE
[linux/fpc-iii.git] / arch / arm / crypto / sha256-armv4.pl
blobfac0533ea633e9803dea6b6ee6a6f019c48c5196
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
12 # SHA256 block procedure for ARMv4. May 2007.
14 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
15 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16 # byte [on single-issue Xscale PXA250 core].
18 # July 2010.
20 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
21 # Cortex A8 core and ~20 cycles per processed byte.
23 # February 2011.
25 # Profiler-assisted and platform-specific optimization resulted in 16%
26 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
28 # September 2013.
30 # Add NEON implementation. On Cortex A8 it was measured to process one
31 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33 # code (meaning that latter performs sub-optimally, nothing was done
34 # about it).
36 # May 2014.
38 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Pick the output file name: the first command-line argument that looks
# like "name.ext".  Scanning stops at the first match, or at the first
# missing/empty argument (in which case $output is left false).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when a name was found; without one the generated
# assembly goes to the inherited STDOUT (e.g. "perl script > file.S").
# A failure to create the named file is reported instead of being ignored.
if ($output) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}
# Register allocation for the integer-only code path.  r0-r3 each carry two
# names: the argument names ($ctx, $inp, $len) and scratch names ($t0, $t4,
# $t1, and $T1/$t3 for r3).
43 $ctx="r0"; $t0="r0";
44 $inp="r1"; $t4="r1";
45 $len="r2"; $t1="r2";
46 $T1="r3"; $t3="r3";
# Working variables a..h live in r4-r11.
47 $A="r4";
48 $B="r5";
49 $C="r6";
50 $D="r7";
51 $E="r8";
52 $F="r9";
53 $G="r10";
54 $H="r11";
# @V is the current a..h ordering; the generators rotate it after each round.
55 @V=($A,$B,$C,$D,$E,$F,$G,$H);
56 $t2="r12";
# $Ktbl walks the K256 constant table.
57 $Ktbl="r14";
# Rotate/shift amounts for the four SHA-256 functions.  In the templates the
# third entry of @sigma0/@sigma1 is applied with lsr, all others with ror.
59 @Sigma0=( 2,13,22);
60 @Sigma1=( 6,11,25);
61 @sigma0=( 7,18, 3);
62 @sigma1=(17,19,10);
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h): append the assembly for round $i
# to $code, using the given register names for the working variables.
# The h+=Maj(a,b,c) addition is not emitted in the round that computes Maj;
# it is added at the top of the following round ("from the past"), which is
# why the names $t2 and $t3 are exchanged once the round has been emitted.
64 sub BODY_00_15 {
65 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Rounds 0..15 only: finish fetching input word X[i] into $t1 (byte-swapped
# with rev on little-endian ARMv7+, assembled from single bytes otherwise)
# and start Sigma1(e).  In round 15 the input pointer is saved to the stack
# because its register is reused as $t4.
67 $code.=<<___ if ($i<16);
68 #if __ARM_ARCH__>=7
69 @ ldr $t1,[$inp],#4 @ $i
70 # if $i==15
71 str $inp,[sp,#17*4] @ make room for $t4
72 # endif
73 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
74 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
75 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
76 # ifndef __ARMEB__
77 rev $t1,$t1
78 # endif
79 #else
80 @ ldrb $t1,[$inp,#3] @ $i
81 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
82 ldrb $t2,[$inp,#2]
83 ldrb $t0,[$inp,#1]
84 orr $t1,$t1,$t2,lsl#8
85 ldrb $t2,[$inp],#4
86 orr $t1,$t1,$t0,lsl#16
87 # if $i==15
88 str $inp,[sp,#17*4] @ make room for $t4
89 # endif
90 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
91 orr $t1,$t1,$t2,lsl#24
92 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
93 #endif
94 ___
# Part shared by all rounds: h += X[i] + K256[i] + Sigma1(e) + Ch(e,f,g),
# d += h, h += Sigma0(a).  Round 31 additionally compares the low byte of the
# K256 word just loaded with 0xf2 (low byte of the last table entry) so the
# caller's loop can tell when the table is exhausted.  Rounds below 15
# prefetch the next input word; later rounds preload schedule words for
# BODY_16_XX.
95 $code.=<<___;
96 ldr $t2,[$Ktbl],#4 @ *K256++
97 add $h,$h,$t1 @ h+=X[i]
98 str $t1,[sp,#`$i%16`*4]
99 eor $t1,$f,$g
100 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
101 and $t1,$t1,$e
102 add $h,$h,$t2 @ h+=K256[i]
103 eor $t1,$t1,$g @ Ch(e,f,g)
104 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
105 add $h,$h,$t1 @ h+=Ch(e,f,g)
106 #if $i==31
107 and $t2,$t2,#0xff
108 cmp $t2,#0xf2 @ done?
109 #endif
110 #if $i<15
111 # if __ARM_ARCH__>=7
112 ldr $t1,[$inp],#4 @ prefetch
113 # else
114 ldrb $t1,[$inp,#3]
115 # endif
116 eor $t2,$a,$b @ a^b, b^c in next round
117 #else
118 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
119 eor $t2,$a,$b @ a^b, b^c in next round
120 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
121 #endif
122 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
123 and $t3,$t3,$t2 @ (b^c)&=(a^b)
124 add $d,$d,$h @ d+=h
125 eor $t3,$t3,$b @ Maj(a,b,c)
126 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
127 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
129 ($t2,$t3)=($t3,$t2);
# BODY_16_XX($i,$a,...,$h): append the assembly for a round with $i>=16.
# It first extends the message schedule in place on the stack,
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]   (indices mod 16),
# with X[i+1] and X[i+14] already sitting in $t1/$t4 (loaded by the previous
# round), starts Sigma1(e), and then delegates the rest of the round to
# BODY_00_15 with the same arguments.
132 sub BODY_16_XX {
133 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
135 $code.=<<___;
136 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
137 @ ldr $t4,[sp,#`($i+14)%16`*4]
138 mov $t0,$t1,ror#$sigma0[0]
139 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
140 mov $t2,$t4,ror#$sigma1[0]
141 eor $t0,$t0,$t1,ror#$sigma0[1]
142 eor $t2,$t2,$t4,ror#$sigma1[1]
143 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
144 ldr $t1,[sp,#`($i+0)%16`*4]
145 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
146 ldr $t4,[sp,#`($i+9)%16`*4]
148 add $t2,$t2,$t0
149 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
150 add $t1,$t1,$t2
151 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
152 add $t1,$t1,$t4 @ X[i]
154 &BODY_00_15(@_);
157 $code=<<___;
158 #ifndef __KERNEL__
159 # include "arm_arch.h"
160 #else
161 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
162 # define __ARM_MAX_ARCH__ 7
163 #endif
165 .text
166 #if __ARM_ARCH__<7
167 .code 32
168 #else
169 .syntax unified
170 # ifdef __thumb2__
171 # define adrl adr
172 .thumb
173 # else
174 .code 32
175 # endif
176 #endif
178 .type K256,%object
179 .align 5
180 K256:
181 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
182 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
183 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
184 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
185 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
186 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
187 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
190 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
191 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
192 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
193 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
194 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
197 .size K256,.-K256
198 .word 0 @ terminator
199 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
200 .LOPENSSL_armcap:
201 .word OPENSSL_armcap_P-sha256_block_data_order
202 #endif
203 .align 5
205 .global sha256_block_data_order
206 .type sha256_block_data_order,%function
207 sha256_block_data_order:
208 #if __ARM_ARCH__<7
209 sub r3,pc,#8 @ sha256_block_data_order
210 #else
211 adr r3,sha256_block_data_order
212 #endif
213 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
214 ldr r12,.LOPENSSL_armcap
215 ldr r12,[r3,r12] @ OPENSSL_armcap_P
216 tst r12,#ARMV8_SHA256
217 bne .LARMv8
218 tst r12,#ARMV7_NEON
219 bne .LNEON
220 #endif
221 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
222 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
223 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
224 sub $Ktbl,r3,#256+32 @ K256
225 sub sp,sp,#16*4 @ alloca(X[16])
226 .Loop:
227 # if __ARM_ARCH__>=7
228 ldr $t1,[$inp],#4
229 # else
230 ldrb $t1,[$inp,#3]
231 # endif
232 eor $t3,$B,$C @ magic
233 eor $t2,$t2,$t2
235 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } # rounds 0..15; a..h names rotate after each round
236 $code.=".Lrounds_16_xx:\n";
# Only 16 schedule-extending round bodies (i=16..31) are generated.  The
# emitted code branches back to .Lrounds_16_xx until the comparison emitted
# for round 31 sees the last K256 entry.
237 for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
238 $code.=<<___;
239 #if __ARM_ARCH__>=7
240 ite eq @ Thumb2 thing, sanity check in ARM
241 #endif
242 ldreq $t3,[sp,#16*4] @ pull ctx
243 bne .Lrounds_16_xx
245 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
246 ldr $t0,[$t3,#0]
247 ldr $t1,[$t3,#4]
248 ldr $t2,[$t3,#8]
249 add $A,$A,$t0
250 ldr $t0,[$t3,#12]
251 add $B,$B,$t1
252 ldr $t1,[$t3,#16]
253 add $C,$C,$t2
254 ldr $t2,[$t3,#20]
255 add $D,$D,$t0
256 ldr $t0,[$t3,#24]
257 add $E,$E,$t1
258 ldr $t1,[$t3,#28]
259 add $F,$F,$t2
260 ldr $inp,[sp,#17*4] @ pull inp
261 ldr $t2,[sp,#18*4] @ pull inp+len
262 add $G,$G,$t0
263 add $H,$H,$t1
264 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
265 cmp $inp,$t2
266 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
267 bne .Loop
269 add sp,sp,#`16+3`*4 @ destroy frame
270 #if __ARM_ARCH__>=5
271 ldmia sp!,{r4-r11,pc}
272 #else
273 ldmia sp!,{r4-r11,lr}
274 tst lr,#1
275 moveq pc,lr @ be binary compatible with V4, yet
276 bx lr @ interoperable with Thumb ISA:-)
277 #endif
278 .size sha256_block_data_order,.-sha256_block_data_order
280 ######################################################################
281 # NEON stuff
# NEON path register names.
# @X: four q registers holding the 16 message-schedule words; the array is
# rotated by Xupdate/Xpreload so that $X[0] is always the vector to process.
284 my @X=map("q$_",(0..3));
# Vector temporaries ($T4/$T5 are d registers, the others q registers).
285 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Pointer used to store the X+K vectors on the stack for the scalar rounds.
286 my $Xfer=$t4;
# Round counter advanced by the code strings returned from body_00_15.
287 my $j=0;
# Dlo($qreg): map a NEON quad register name ("qN") to the name of its low
# doubleword half ("d2N").  Returns "" when the argument contains no q
# register name.  Declared without a prototype: the sub takes one argument,
# and the former empty prototype only went unnoticed because callers use the
# &Dlo(...) form, which bypasses prototype checking.
sub Dlo {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi($qreg): map a NEON quad register name ("qN") to the name of its high
# doubleword half ("d2N+1").  Returns "" when the argument contains no q
# register name.  Declared without a prototype: the sub takes one argument,
# and the former empty prototype only went unnoticed because callers use the
# &Dhi(...) form, which bypasses prototype checking.
sub Dhi {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
# AUTOLOAD: catch-all for calls to subs that are not defined in this file,
# such as &vext_8(...) or &add(...).  The called name becomes the mnemonic
# (package prefix stripped, first "_" turned into "."), the arguments become
# the comma-separated operands, and a last argument that is a plain number
# gets a "#" prefix.  The resulting line is appended to $code.
292 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
293 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
294 my $arg = pop;
295 $arg = "#$arg" if ($arg*1 eq $arg);
296 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# Xupdate($body): emit the NEON message-schedule update for the four words
# in $X[0], interleaved with the scalar instructions of four rounds.
# $body is a code ref (body_00_15) returning one round as a list of Perl
# source strings; each eval(shift(@insns)) below emits the next scalar
# instruction between two NEON instructions.  The statement order is the
# instruction schedule, so it must not be rearranged.
299 sub Xupdate()
300 { use integer;
301 my $body = shift;
# Four rounds' worth of scalar instruction strings.
302 my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned by the eval'ed strings (they start with "(...)=@V;").
303 my ($a,$b,$c,$d,$e,$f,$g,$h);
305 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
306 eval(shift(@insns));
307 eval(shift(@insns));
308 eval(shift(@insns));
309 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
310 eval(shift(@insns));
311 eval(shift(@insns));
312 eval(shift(@insns));
313 &vshr_u32 ($T2,$T0,$sigma0[0]);
314 eval(shift(@insns));
315 eval(shift(@insns));
316 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
317 eval(shift(@insns));
318 eval(shift(@insns));
319 &vshr_u32 ($T1,$T0,$sigma0[2]);
320 eval(shift(@insns));
321 eval(shift(@insns));
# vshr followed by vsli with (32-n) forms a rotate right by n.
322 &vsli_32 ($T2,$T0,32-$sigma0[0]);
323 eval(shift(@insns));
324 eval(shift(@insns));
325 &vshr_u32 ($T3,$T0,$sigma0[1]);
326 eval(shift(@insns));
327 eval(shift(@insns));
328 &veor ($T1,$T1,$T2);
329 eval(shift(@insns));
330 eval(shift(@insns));
331 &vsli_32 ($T3,$T0,32-$sigma0[1]);
332 eval(shift(@insns));
333 eval(shift(@insns));
334 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
335 eval(shift(@insns));
336 eval(shift(@insns));
337 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
338 eval(shift(@insns));
339 eval(shift(@insns));
340 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
341 eval(shift(@insns));
342 eval(shift(@insns));
343 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
344 eval(shift(@insns));
345 eval(shift(@insns));
346 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
347 eval(shift(@insns));
348 eval(shift(@insns));
349 &veor ($T5,$T5,$T4);
350 eval(shift(@insns));
351 eval(shift(@insns));
352 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
353 eval(shift(@insns));
354 eval(shift(@insns));
355 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
356 eval(shift(@insns));
357 eval(shift(@insns));
358 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
359 eval(shift(@insns));
360 eval(shift(@insns));
361 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
362 eval(shift(@insns));
363 eval(shift(@insns));
# The upper two words depend on the two words just produced, so sigma1 is
# computed a second time on the low half of the updated vector.
364 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
365 eval(shift(@insns));
366 eval(shift(@insns));
367 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
368 eval(shift(@insns));
369 eval(shift(@insns));
370 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
371 eval(shift(@insns));
372 eval(shift(@insns));
373 &veor ($T5,$T5,$T4);
374 eval(shift(@insns));
375 eval(shift(@insns));
376 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
377 eval(shift(@insns));
378 eval(shift(@insns));
# Fetch the next four K256 constants.
379 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
380 eval(shift(@insns));
381 eval(shift(@insns));
382 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
383 eval(shift(@insns));
384 eval(shift(@insns));
385 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
386 eval(shift(@insns));
387 eval(shift(@insns));
388 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
389 eval(shift(@insns));
390 eval(shift(@insns));
# X+K for the scalar rounds, stored through $Xfer below.
391 &vadd_i32 ($T0,$T0,@X[0]);
# Emit all remaining scalar instructions except the last two.
392 while($#insns>=2) { eval(shift(@insns)); }
393 &vst1_32 ("{$T0}","[$Xfer,:128]!");
394 eval(shift(@insns));
395 eval(shift(@insns));
397 push(@X,shift(@X)); # "rotate" X[]
# Xpreload($body): emit four scalar rounds interleaved with the NEON
# preparation of one vector of the next input block: load four K256
# constants, byte-swap $X[0], add the constants and store the sum through
# $Xfer.  No schedule extension is done here.  $body is a code ref
# (body_00_15) returning one round as a list of Perl source strings.
400 sub Xpreload()
401 { use integer;
402 my $body = shift;
# Four rounds' worth of scalar instruction strings.
403 my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned by the eval'ed strings.
404 my ($a,$b,$c,$d,$e,$f,$g,$h);
406 eval(shift(@insns));
407 eval(shift(@insns));
408 eval(shift(@insns));
409 eval(shift(@insns));
410 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
411 eval(shift(@insns));
412 eval(shift(@insns));
413 eval(shift(@insns));
414 eval(shift(@insns));
415 &vrev32_8 (@X[0],@X[0]);
416 eval(shift(@insns));
417 eval(shift(@insns));
418 eval(shift(@insns));
419 eval(shift(@insns));
420 &vadd_i32 ($T0,$T0,@X[0]);
421 foreach (@insns) { eval; } # remaining instructions
422 &vst1_32 ("{$T0}","[$Xfer,:128]!");
424 push(@X,shift(@X)); # "rotate" X[]
# body_00_15(): return one scalar SHA-256 round for the NEON path as a list
# of Perl source strings.  Xupdate/Xpreload eval the strings one at a time
# so that the instructions they emit (through AUTOLOAD) are interleaved with
# NEON code.  The first string binds $a..$h to the current @V ordering; the
# last one advances the round counter $j, rotates @V and exchanges the names
# $t2/$t3.  Depending on $j, the load near the end fetches the next X+K word
# from the stack, the word at [$Ktbl], or the word at [sp,#64].
427 sub body_00_15 () {
429 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
430 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
431 '&eor ($t1,$f,$g)',
432 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
433 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
434 '&and ($t1,$t1,$e)',
435 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
436 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
437 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
438 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
439 '&eor ($t2,$a,$b)', # a^b, b^c in next round
440 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
441 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
442 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
443 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
444 '&ldr ($t1,"[sp,#64]") if ($j==31)',
445 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
446 '&add ($d,$d,$h)', # d+=h
447 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
448 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
449 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
453 $code.=<<___;
454 #if __ARM_MAX_ARCH__>=7
455 .arch armv7-a
456 .fpu neon
458 .global sha256_block_data_order_neon
459 .type sha256_block_data_order_neon,%function
460 .align 4
461 sha256_block_data_order_neon:
462 .LNEON:
463 stmdb sp!,{r4-r12,lr}
465 sub $H,sp,#16*4+16
466 adrl $Ktbl,K256
467 bic $H,$H,#15 @ align for 128-bit stores
468 mov $t2,sp
469 mov sp,$H @ alloca
470 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
472 vld1.8 {@X[0]},[$inp]!
473 vld1.8 {@X[1]},[$inp]!
474 vld1.8 {@X[2]},[$inp]!
475 vld1.8 {@X[3]},[$inp]!
476 vld1.32 {$T0},[$Ktbl,:128]!
477 vld1.32 {$T1},[$Ktbl,:128]!
478 vld1.32 {$T2},[$Ktbl,:128]!
479 vld1.32 {$T3},[$Ktbl,:128]!
480 vrev32.8 @X[0],@X[0] @ yes, even on
481 str $ctx,[sp,#64]
482 vrev32.8 @X[1],@X[1] @ big-endian
483 str $inp,[sp,#68]
484 mov $Xfer,sp
485 vrev32.8 @X[2],@X[2]
486 str $len,[sp,#72]
487 vrev32.8 @X[3],@X[3]
488 str $t2,[sp,#76] @ save original sp
489 vadd.i32 $T0,$T0,@X[0]
490 vadd.i32 $T1,$T1,@X[1]
491 vst1.32 {$T0},[$Xfer,:128]!
492 vadd.i32 $T2,$T2,@X[2]
493 vst1.32 {$T1},[$Xfer,:128]!
494 vadd.i32 $T3,$T3,@X[3]
495 vst1.32 {$T2},[$Xfer,:128]!
496 vst1.32 {$T3},[$Xfer,:128]!
498 ldmia $ctx,{$A-$H}
499 sub $Xfer,$Xfer,#64
500 ldr $t1,[sp,#0]
501 eor $t2,$t2,$t2
502 eor $t3,$B,$C
503 b .L_00_48
505 .align 4
506 .L_00_48:
508 &Xupdate(\&body_00_15);
509 &Xupdate(\&body_00_15);
510 &Xupdate(\&body_00_15);
511 &Xupdate(\&body_00_15);
512 $code.=<<___;
513 teq $t1,#0 @ check for K256 terminator
514 ldr $t1,[sp,#0]
515 sub $Xfer,$Xfer,#64
516 bne .L_00_48
518 ldr $inp,[sp,#68]
519 ldr $t0,[sp,#72]
520 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
521 teq $inp,$t0
522 it eq
523 subeq $inp,$inp,#64 @ avoid SEGV
524 vld1.8 {@X[0]},[$inp]! @ load next input block
525 vld1.8 {@X[1]},[$inp]!
526 vld1.8 {@X[2]},[$inp]!
527 vld1.8 {@X[3]},[$inp]!
528 it ne
529 strne $inp,[sp,#68]
530 mov $Xfer,sp
532 &Xpreload(\&body_00_15);
533 &Xpreload(\&body_00_15);
534 &Xpreload(\&body_00_15);
535 &Xpreload(\&body_00_15);
536 $code.=<<___;
537 ldr $t0,[$t1,#0]
538 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
539 ldr $t2,[$t1,#4]
540 ldr $t3,[$t1,#8]
541 ldr $t4,[$t1,#12]
542 add $A,$A,$t0 @ accumulate
543 ldr $t0,[$t1,#16]
544 add $B,$B,$t2
545 ldr $t2,[$t1,#20]
546 add $C,$C,$t3
547 ldr $t3,[$t1,#24]
548 add $D,$D,$t4
549 ldr $t4,[$t1,#28]
550 add $E,$E,$t0
551 str $A,[$t1],#4
552 add $F,$F,$t2
553 str $B,[$t1],#4
554 add $G,$G,$t3
555 str $C,[$t1],#4
556 add $H,$H,$t4
557 str $D,[$t1],#4
558 stmia $t1,{$E-$H}
560 ittte ne
561 movne $Xfer,sp
562 ldrne $t1,[sp,#0]
563 eorne $t2,$t2,$t2
564 ldreq sp,[sp,#76] @ restore original sp
565 itt ne
566 eorne $t3,$B,$C
567 bne .L_00_48
569 ldmia sp!,{r4-r12,pc}
570 .size sha256_block_data_order_neon,.-sha256_block_data_order_neon
571 #endif
574 ######################################################################
575 # ARMv8 stuff
# ARMv8 path register names.
# Hash state halves in q0/q1; $abcd (q2) receives a copy of $ABCD before
# each sha256h/sha256h2 pair.
578 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
# The 64-byte message block in q8..q11; the array is rotated by the
# round-generating loop.
579 my @MSG=map("q$_",(8..11));
# $W0/$W1 alternate as the K256-plus-message operand; the *_SAVE registers
# keep the state as it was at the start of the block.
580 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
# Lexical $Ktbl (r3) hides the r14 name assigned near the top of the file.
581 my $Ktbl="r3";
583 $code.=<<___;
584 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
586 # ifdef __thumb2__
587 # define INST(a,b,c,d) .byte c,d|0xc,a,b
588 # else
589 # define INST(a,b,c,d) .byte a,b,c,d
590 # endif
592 .type sha256_block_data_order_armv8,%function
593 .align 5
594 sha256_block_data_order_armv8:
595 .LARMv8:
596 vld1.32 {$ABCD,$EFGH},[$ctx]
597 # ifdef __thumb2__
598 adr $Ktbl,.LARMv8
599 sub $Ktbl,$Ktbl,#.LARMv8-K256
600 # else
601 adrl $Ktbl,K256
602 # endif
603 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
605 .Loop_v8:
606 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
607 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
608 vld1.32 {$W0},[$Ktbl]!
609 vrev32.8 @MSG[0],@MSG[0]
610 vrev32.8 @MSG[1],@MSG[1]
611 vrev32.8 @MSG[2],@MSG[2]
612 vrev32.8 @MSG[3],@MSG[3]
613 vmov $ABCD_SAVE,$ABCD @ offload
614 vmov $EFGH_SAVE,$EFGH
615 teq $inp,$len
617 for($i=0;$i<12;$i++) {
618 $code.=<<___;
619 vld1.32 {$W1},[$Ktbl]!
620 vadd.i32 $W0,$W0,@MSG[0]
621 sha256su0 @MSG[0],@MSG[1]
622 vmov $abcd,$ABCD
623 sha256h $ABCD,$EFGH,$W0
624 sha256h2 $EFGH,$abcd,$W0
625 sha256su1 @MSG[0],@MSG[2],@MSG[3]
627 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
629 $code.=<<___;
630 vld1.32 {$W1},[$Ktbl]!
631 vadd.i32 $W0,$W0,@MSG[0]
632 vmov $abcd,$ABCD
633 sha256h $ABCD,$EFGH,$W0
634 sha256h2 $EFGH,$abcd,$W0
636 vld1.32 {$W0},[$Ktbl]!
637 vadd.i32 $W1,$W1,@MSG[1]
638 vmov $abcd,$ABCD
639 sha256h $ABCD,$EFGH,$W1
640 sha256h2 $EFGH,$abcd,$W1
642 vld1.32 {$W1},[$Ktbl]
643 vadd.i32 $W0,$W0,@MSG[2]
644 sub $Ktbl,$Ktbl,#256-16 @ rewind
645 vmov $abcd,$ABCD
646 sha256h $ABCD,$EFGH,$W0
647 sha256h2 $EFGH,$abcd,$W0
649 vadd.i32 $W1,$W1,@MSG[3]
650 vmov $abcd,$ABCD
651 sha256h $ABCD,$EFGH,$W1
652 sha256h2 $EFGH,$abcd,$W1
654 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
655 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
656 it ne
657 bne .Loop_v8
659 vst1.32 {$ABCD,$EFGH},[$ctx]
661 ret @ bx lr
662 .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
663 #endif
666 $code.=<<___;
667 .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
668 .align 2
669 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
670 .comm OPENSSL_armcap_P,4,4
671 #endif
# Copy this script's leading comment block into the generated file as
# assembler comments: the "#!" line is skipped, a leading "#" becomes "@",
# blank lines are passed through, and copying stops at the first line that
# is neither.  The copy is best-effort: if the script cannot be re-read the
# header is simply omitted.  A lexical handle and three-argument open are
# used so that the script path is never interpreted as an open mode and the
# loop never reads from a handle that failed to open.
if (open(my $self,"<",$0)) {
    while(<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
}
# Hand-encoding of the ARMv8 SHA-256 instructions.
# %opcode holds the base encoding word of each mnemonic.
682 { my %opcode = (
683 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
684 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
# unsha256($mnemonic,$arg): for an operand string of the form "qA,qB,qC" or
# "qA,qC", build the instruction word and return it as an INST(...) line
# listing its four bytes, least-significant first, followed by the original
# mnemonic and operands as an assembler comment.
# Register numbers are merged into the base word as follows:
#   first  register: low 3 bits -> bits 13-15, bit 3 -> bit 22
#   middle register: low 3 bits -> bits 17-19, bit 3 -> bit 7
#   last   register: low 3 bits -> bits 1-3,   bit 3 -> bit 5
# In the two-operand form the middle capture is unset and adds no bits.
686 sub unsha256 {
687 my ($mnemonic,$arg)=@_;
689 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
690 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
691 |(($2&7)<<17)|(($2&8)<<4)
692 |(($3&7)<<1) |(($3&8)<<2);
693 # since ARMv7 instructions are always encoded little-endian.
694 # correct solution is to use .inst directive, but older
695 # assemblers don't implement it:-(
696 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
697 $word&0xff,($word>>8)&0xff,
698 ($word>>16)&0xff,($word>>24)&0xff,
699 $mnemonic,$arg;
# Post-process the accumulated template one line at a time and print it.
704 foreach (split($/,$code)) {
# Replace each `...` span by the value of the Perl expression inside it.
706 s/\`([^\`]*)\`/eval $1/geo;
# Replace sha256* instructions whose operands start with a q register by the
# INST(...) byte encoding produced by unsha256.
708 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
# "ret" becomes "bx lr".  Only on lines where no "ret" was replaced, a
# literal "bx lr" becomes the raw word 0xe12fff1e.
710 s/\bret\b/bx lr/go or
711 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
713 print $_,"\n";
# Closing STDOUT forces the buffered output to be flushed.  The result is
# checked so that a failed write (for example a full disk) aborts with a
# non-zero exit status instead of leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";