x86/oprofile: Fix bogus GCC-8 warning in nmi_setup()
[cris-mirror.git] / arch / arm / crypto / sha512-armv4.pl
bloba2b11a84435776aca76fc4f479a95468a252964d
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
12 # SHA512 block procedure for ARMv4. September 2007.
14 # This code is ~4.5 (four and a half) times faster than code generated
15 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
16 # Xscale PXA250 core].
18 # July 2010.
20 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
21 # Cortex A8 core and ~40 cycles per processed byte.
23 # February 2011.
25 # Profiler-assisted and platform-specific optimization resulted in 7%
26 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # March 2011.
30 # Add NEON implementation. On Cortex A8 it was measured to process
31 # one byte in 23.3 cycles or ~60% faster than integer-only code.
33 # August 2012.
35 # Improve NEON performance by 12% on Snapdragon S4. In absolute
36 # terms it's 22.6 cycles per byte, which is a disappointing result.
37 # Technical writers asserted that 3-way S4 pipeline can sustain
38 # multiple NEON instructions per cycle, but dual NEON issue could
39 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
40 # for further details. As a side note, Cortex-A15 processes one byte in
41 # 16 cycles.
43 # Byte order [in]dependence. =========================================
45 # Originally the caller was expected to maintain a specific *dword* order in
46 # h[0-7], namely with the most significant dword at the *lower* address, which
47 # was reflected in the two parameters below as 0 and 4. Now the caller is
48 # expected to maintain native byte order for whole 64-bit values.
# Symbolic word selectors.  They are interpolated into the generated
# assembly as the literal tokens "HI"/"LO"; the emitted preprocessor
# block defines HI and LO as 4/0 or 0/4 depending on __ARMEL__, which is
# what makes the 64-bit loads and stores byte-order independent.
49 $hi="HI";
50 $lo="LO";
51 # ====================================================================
# Consume command-line arguments until one looks like a file name
# ("name.ext") and redirect the generated code there.  When no such
# argument exists the inherited STDOUT is kept as is, and a file that
# cannot be created is reported instead of being silently ignored.
53 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
54 if ($output) { open STDOUT,">",$output or die "can't open $output: $!"; }
# Register allocation for the integer-only code path.  These strings are
# interpolated into the assembly templates below.
#   $ctx         - pointer to the parameter block holding the state words
#   $inp         - pointer to the input data, advanced as words are loaded
#   $len         - block count on entry; the generated prologue turns it
#                  into a pointer to the end of the input ($inp+$len*128)
#   $Tlo/$Thi    - 64-bit accumulator T (also carries the round input word)
#   $Alo/$Ahi    - working variable a, kept in registers
#   $Elo/$Ehi    - working variable e, kept in registers
#   $t0..$t3     - scratch registers
#   $Ktbl        - pointer into the K512 constant table
56 $ctx="r0"; # parameter block
57 $inp="r1";
58 $len="r2";
60 $Tlo="r3";
61 $Thi="r4";
62 $Alo="r5";
63 $Ahi="r6";
64 $Elo="r7";
65 $Ehi="r8";
66 $t0="r9";
67 $t1="r10";
68 $t2="r11";
69 $t3="r12";
70 ############ r13 is stack pointer
71 $Ktbl="r14";
72 ############ r15 is program counter
# Byte offsets of the eight 64-bit working variables a..h, used both
# relative to $ctx and relative to sp for the on-stack copies.  $Xoff is
# the stack slot (right after h) where each round stores its input word.
74 $Aoff=8*0;
75 $Boff=8*1;
76 $Coff=8*2;
77 $Doff=8*3;
78 $Eoff=8*4;
79 $Foff=8*5;
80 $Goff=8*6;
81 $Hoff=8*7;
82 $Xoff=8*8;
# BODY_00_15($magic) - append the integer-only assembly for one SHA-512
# round to the global $code.
#
# The emitted code expects the round's input word in $Tlo:$Thi, a and e
# in $Alo:$Ahi and $Elo:$Ehi, and b, c, d, f, g, h in the stack frame at
# sp+$Boff .. sp+$Hoff.  It
#   - saves the input word at sp+$Xoff and spills the current a and e,
#   - accumulates T = X + Sigma1(e) + h + Ch(e,f,g) + K[i],
#   - loads d into the e registers and adds T to it,
#   - computes T + Sigma0(a) + Maj(a,b,c) into the a registers,
#   - lowers sp by 8, so that the slot written at $Aoff is addressed as
#     $Boff by the next round (and so on for the other slots),
#   - advances $Ktbl by 8 to the next constant.
#
# $magic is compared with the low byte of the low word of the current
# K512 constant; on a match bit 0 of $Ktbl is set as an "end of loop"
# marker.  The closing "tst $Ktbl,#1" leaves the condition flags set for
# the code the caller emits right after this body (the following "add"
# does not update flags).
84 sub BODY_00_15() {
85 my $magic = shift;
86 $code.=<<___;
87 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
88 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
89 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
90 mov $t0,$Elo,lsr#14
91 str $Tlo,[sp,#$Xoff+0]
92 mov $t1,$Ehi,lsr#14
93 str $Thi,[sp,#$Xoff+4]
94 eor $t0,$t0,$Ehi,lsl#18
95 ldr $t2,[sp,#$Hoff+0] @ h.lo
96 eor $t1,$t1,$Elo,lsl#18
97 ldr $t3,[sp,#$Hoff+4] @ h.hi
98 eor $t0,$t0,$Elo,lsr#18
99 eor $t1,$t1,$Ehi,lsr#18
100 eor $t0,$t0,$Ehi,lsl#14
101 eor $t1,$t1,$Elo,lsl#14
102 eor $t0,$t0,$Ehi,lsr#9
103 eor $t1,$t1,$Elo,lsr#9
104 eor $t0,$t0,$Elo,lsl#23
105 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
106 adds $Tlo,$Tlo,$t0
107 ldr $t0,[sp,#$Foff+0] @ f.lo
108 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
109 ldr $t1,[sp,#$Foff+4] @ f.hi
110 adds $Tlo,$Tlo,$t2
111 ldr $t2,[sp,#$Goff+0] @ g.lo
112 adc $Thi,$Thi,$t3 @ T += h
113 ldr $t3,[sp,#$Goff+4] @ g.hi
115 eor $t0,$t0,$t2
116 str $Elo,[sp,#$Eoff+0]
117 eor $t1,$t1,$t3
118 str $Ehi,[sp,#$Eoff+4]
119 and $t0,$t0,$Elo
120 str $Alo,[sp,#$Aoff+0]
121 and $t1,$t1,$Ehi
122 str $Ahi,[sp,#$Aoff+4]
123 eor $t0,$t0,$t2
124 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
125 eor $t1,$t1,$t3 @ Ch(e,f,g)
126 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
128 adds $Tlo,$Tlo,$t0
129 ldr $Elo,[sp,#$Doff+0] @ d.lo
130 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
131 ldr $Ehi,[sp,#$Doff+4] @ d.hi
132 adds $Tlo,$Tlo,$t2
133 and $t0,$t2,#0xff
134 adc $Thi,$Thi,$t3 @ T += K[i]
135 adds $Elo,$Elo,$Tlo
136 ldr $t2,[sp,#$Boff+0] @ b.lo
137 adc $Ehi,$Ehi,$Thi @ d += T
138 teq $t0,#$magic
140 ldr $t3,[sp,#$Coff+0] @ c.lo
141 #if __ARM_ARCH__>=7
142 it eq @ Thumb2 thing, sanity check in ARM
143 #endif
144 orreq $Ktbl,$Ktbl,#1
145 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
146 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
147 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
148 mov $t0,$Alo,lsr#28
149 mov $t1,$Ahi,lsr#28
150 eor $t0,$t0,$Ahi,lsl#4
151 eor $t1,$t1,$Alo,lsl#4
152 eor $t0,$t0,$Ahi,lsr#2
153 eor $t1,$t1,$Alo,lsr#2
154 eor $t0,$t0,$Alo,lsl#30
155 eor $t1,$t1,$Ahi,lsl#30
156 eor $t0,$t0,$Ahi,lsr#7
157 eor $t1,$t1,$Alo,lsr#7
158 eor $t0,$t0,$Alo,lsl#25
159 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
160 adds $Tlo,$Tlo,$t0
161 and $t0,$Alo,$t2
162 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
164 ldr $t1,[sp,#$Boff+4] @ b.hi
165 orr $Alo,$Alo,$t2
166 ldr $t2,[sp,#$Coff+4] @ c.hi
167 and $Alo,$Alo,$t3
168 and $t3,$Ahi,$t1
169 orr $Ahi,$Ahi,$t1
170 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
171 and $Ahi,$Ahi,$t2
172 adds $Alo,$Alo,$Tlo
173 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
174 sub sp,sp,#8
175 adc $Ahi,$Ahi,$Thi @ h += T
176 tst $Ktbl,#1
177 add $Ktbl,$Ktbl,#8
180 $code=<<___;
181 #ifndef __KERNEL__
182 # include "arm_arch.h"
183 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
184 # define VFP_ABI_POP vldmia sp!,{d8-d15}
185 #else
186 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
187 # define __ARM_MAX_ARCH__ 7
188 # define VFP_ABI_PUSH
189 # define VFP_ABI_POP
190 #endif
192 #ifdef __ARMEL__
193 # define LO 0
194 # define HI 4
195 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
196 #else
197 # define HI 0
198 # define LO 4
199 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
200 #endif
202 .text
203 #if __ARM_ARCH__<7
204 .code 32
205 #else
206 .syntax unified
207 # ifdef __thumb2__
208 # define adrl adr
209 .thumb
210 # else
211 .code 32
212 # endif
213 #endif
215 .type K512,%object
216 .align 5
217 K512:
218 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
219 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
220 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
221 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
222 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
223 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
224 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
225 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
226 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
227 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
228 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
229 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
230 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
231 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
232 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
233 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
234 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
235 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
236 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
237 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
238 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
239 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
240 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
241 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
242 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
243 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
244 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
245 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
246 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
247 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
248 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
249 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
250 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
251 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
252 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
253 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
254 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
255 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
256 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
257 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
258 .size K512,.-K512
259 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
260 .LOPENSSL_armcap:
261 .word OPENSSL_armcap_P-sha512_block_data_order
262 .skip 32-4
263 #else
264 .skip 32
265 #endif
267 .global sha512_block_data_order
268 .type sha512_block_data_order,%function
269 sha512_block_data_order:
270 #if __ARM_ARCH__<7
271 sub r3,pc,#8 @ sha512_block_data_order
272 #else
273 adr r3,sha512_block_data_order
274 #endif
275 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
276 ldr r12,.LOPENSSL_armcap
277 ldr r12,[r3,r12] @ OPENSSL_armcap_P
278 tst r12,#1
279 bne .LNEON
280 #endif
281 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
282 stmdb sp!,{r4-r12,lr}
283 sub $Ktbl,r3,#672 @ K512
284 sub sp,sp,#9*8
286 ldr $Elo,[$ctx,#$Eoff+$lo]
287 ldr $Ehi,[$ctx,#$Eoff+$hi]
288 ldr $t0, [$ctx,#$Goff+$lo]
289 ldr $t1, [$ctx,#$Goff+$hi]
290 ldr $t2, [$ctx,#$Hoff+$lo]
291 ldr $t3, [$ctx,#$Hoff+$hi]
292 .Loop:
293 str $t0, [sp,#$Goff+0]
294 str $t1, [sp,#$Goff+4]
295 str $t2, [sp,#$Hoff+0]
296 str $t3, [sp,#$Hoff+4]
297 ldr $Alo,[$ctx,#$Aoff+$lo]
298 ldr $Ahi,[$ctx,#$Aoff+$hi]
299 ldr $Tlo,[$ctx,#$Boff+$lo]
300 ldr $Thi,[$ctx,#$Boff+$hi]
301 ldr $t0, [$ctx,#$Coff+$lo]
302 ldr $t1, [$ctx,#$Coff+$hi]
303 ldr $t2, [$ctx,#$Doff+$lo]
304 ldr $t3, [$ctx,#$Doff+$hi]
305 str $Tlo,[sp,#$Boff+0]
306 str $Thi,[sp,#$Boff+4]
307 str $t0, [sp,#$Coff+0]
308 str $t1, [sp,#$Coff+4]
309 str $t2, [sp,#$Doff+0]
310 str $t3, [sp,#$Doff+4]
311 ldr $Tlo,[$ctx,#$Foff+$lo]
312 ldr $Thi,[$ctx,#$Foff+$hi]
313 str $Tlo,[sp,#$Foff+0]
314 str $Thi,[sp,#$Foff+4]
316 .L00_15:
317 #if __ARM_ARCH__<7
318 ldrb $Tlo,[$inp,#7]
319 ldrb $t0, [$inp,#6]
320 ldrb $t1, [$inp,#5]
321 ldrb $t2, [$inp,#4]
322 ldrb $Thi,[$inp,#3]
323 ldrb $t3, [$inp,#2]
324 orr $Tlo,$Tlo,$t0,lsl#8
325 ldrb $t0, [$inp,#1]
326 orr $Tlo,$Tlo,$t1,lsl#16
327 ldrb $t1, [$inp],#8
328 orr $Tlo,$Tlo,$t2,lsl#24
329 orr $Thi,$Thi,$t3,lsl#8
330 orr $Thi,$Thi,$t0,lsl#16
331 orr $Thi,$Thi,$t1,lsl#24
332 #else
333 ldr $Tlo,[$inp,#4]
334 ldr $Thi,[$inp],#8
335 #ifdef __ARMEL__
336 rev $Tlo,$Tlo
337 rev $Thi,$Thi
338 #endif
339 #endif
# Emit the round body for rounds 0..15.  0x94 is the low byte of the low
# word of K512[15] (0xc19bf174cf692694), so the generated code marks
# $Ktbl after the 16th constant and falls out of the .L00_15 loop.
341 &BODY_00_15(0x94);
342 $code.=<<___;
343 tst $Ktbl,#1
344 beq .L00_15
345 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
346 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
347 bic $Ktbl,$Ktbl,#1
348 .L16_79:
349 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
350 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
351 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
352 mov $Tlo,$t0,lsr#1
353 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
354 mov $Thi,$t1,lsr#1
355 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
356 eor $Tlo,$Tlo,$t1,lsl#31
357 eor $Thi,$Thi,$t0,lsl#31
358 eor $Tlo,$Tlo,$t0,lsr#8
359 eor $Thi,$Thi,$t1,lsr#8
360 eor $Tlo,$Tlo,$t1,lsl#24
361 eor $Thi,$Thi,$t0,lsl#24
362 eor $Tlo,$Tlo,$t0,lsr#7
363 eor $Thi,$Thi,$t1,lsr#7
364 eor $Tlo,$Tlo,$t1,lsl#25
366 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
367 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
368 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
369 mov $t0,$t2,lsr#19
370 mov $t1,$t3,lsr#19
371 eor $t0,$t0,$t3,lsl#13
372 eor $t1,$t1,$t2,lsl#13
373 eor $t0,$t0,$t3,lsr#29
374 eor $t1,$t1,$t2,lsr#29
375 eor $t0,$t0,$t2,lsl#3
376 eor $t1,$t1,$t3,lsl#3
377 eor $t0,$t0,$t2,lsr#6
378 eor $t1,$t1,$t3,lsr#6
379 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
380 eor $t0,$t0,$t3,lsl#26
382 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
383 adds $Tlo,$Tlo,$t0
384 ldr $t0,[sp,#`$Xoff+8*16`+0]
385 adc $Thi,$Thi,$t1
387 ldr $t1,[sp,#`$Xoff+8*16`+4]
388 adds $Tlo,$Tlo,$t2
389 adc $Thi,$Thi,$t3
390 adds $Tlo,$Tlo,$t0
391 adc $Thi,$Thi,$t1
# Emit the round body for rounds 16..79.  0x17 is the low byte of the low
# word of the last table entry, K512[79] (0x6c44198c4a475817), so the
# generated code marks $Ktbl after the final constant and leaves .L16_79.
393 &BODY_00_15(0x17);
394 $code.=<<___;
395 #if __ARM_ARCH__>=7
396 ittt eq @ Thumb2 thing, sanity check in ARM
397 #endif
398 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
399 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
400 beq .L16_79
401 bic $Ktbl,$Ktbl,#1
403 ldr $Tlo,[sp,#$Boff+0]
404 ldr $Thi,[sp,#$Boff+4]
405 ldr $t0, [$ctx,#$Aoff+$lo]
406 ldr $t1, [$ctx,#$Aoff+$hi]
407 ldr $t2, [$ctx,#$Boff+$lo]
408 ldr $t3, [$ctx,#$Boff+$hi]
409 adds $t0,$Alo,$t0
410 str $t0, [$ctx,#$Aoff+$lo]
411 adc $t1,$Ahi,$t1
412 str $t1, [$ctx,#$Aoff+$hi]
413 adds $t2,$Tlo,$t2
414 str $t2, [$ctx,#$Boff+$lo]
415 adc $t3,$Thi,$t3
416 str $t3, [$ctx,#$Boff+$hi]
418 ldr $Alo,[sp,#$Coff+0]
419 ldr $Ahi,[sp,#$Coff+4]
420 ldr $Tlo,[sp,#$Doff+0]
421 ldr $Thi,[sp,#$Doff+4]
422 ldr $t0, [$ctx,#$Coff+$lo]
423 ldr $t1, [$ctx,#$Coff+$hi]
424 ldr $t2, [$ctx,#$Doff+$lo]
425 ldr $t3, [$ctx,#$Doff+$hi]
426 adds $t0,$Alo,$t0
427 str $t0, [$ctx,#$Coff+$lo]
428 adc $t1,$Ahi,$t1
429 str $t1, [$ctx,#$Coff+$hi]
430 adds $t2,$Tlo,$t2
431 str $t2, [$ctx,#$Doff+$lo]
432 adc $t3,$Thi,$t3
433 str $t3, [$ctx,#$Doff+$hi]
435 ldr $Tlo,[sp,#$Foff+0]
436 ldr $Thi,[sp,#$Foff+4]
437 ldr $t0, [$ctx,#$Eoff+$lo]
438 ldr $t1, [$ctx,#$Eoff+$hi]
439 ldr $t2, [$ctx,#$Foff+$lo]
440 ldr $t3, [$ctx,#$Foff+$hi]
441 adds $Elo,$Elo,$t0
442 str $Elo,[$ctx,#$Eoff+$lo]
443 adc $Ehi,$Ehi,$t1
444 str $Ehi,[$ctx,#$Eoff+$hi]
445 adds $t2,$Tlo,$t2
446 str $t2, [$ctx,#$Foff+$lo]
447 adc $t3,$Thi,$t3
448 str $t3, [$ctx,#$Foff+$hi]
450 ldr $Alo,[sp,#$Goff+0]
451 ldr $Ahi,[sp,#$Goff+4]
452 ldr $Tlo,[sp,#$Hoff+0]
453 ldr $Thi,[sp,#$Hoff+4]
454 ldr $t0, [$ctx,#$Goff+$lo]
455 ldr $t1, [$ctx,#$Goff+$hi]
456 ldr $t2, [$ctx,#$Hoff+$lo]
457 ldr $t3, [$ctx,#$Hoff+$hi]
458 adds $t0,$Alo,$t0
459 str $t0, [$ctx,#$Goff+$lo]
460 adc $t1,$Ahi,$t1
461 str $t1, [$ctx,#$Goff+$hi]
462 adds $t2,$Tlo,$t2
463 str $t2, [$ctx,#$Hoff+$lo]
464 adc $t3,$Thi,$t3
465 str $t3, [$ctx,#$Hoff+$hi]
467 add sp,sp,#640
468 sub $Ktbl,$Ktbl,#640
470 teq $inp,$len
471 bne .Loop
473 add sp,sp,#8*9 @ destroy frame
474 #if __ARM_ARCH__>=5
475 ldmia sp!,{r4-r12,pc}
476 #else
477 ldmia sp!,{r4-r12,lr}
478 tst lr,#1
479 moveq pc,lr @ be binary compatible with V4, yet
480 bx lr @ interoperable with Thumb ISA:-)
481 #endif
482 .size sha512_block_data_order,.-sha512_block_data_order
# Rotation amounts for the SHA-512 functions as used by the NEON code.
# For @Sigma0/@Sigma1 all three entries are rotate counts.  For
# @sigma0/@sigma1 the first two are rotate counts and the third is a
# plain right-shift count (the generator emits vshr without a matching
# vsli for index 2).
486 my @Sigma0=(28,34,39);
487 my @Sigma1=(14,18,41);
488 my @sigma0=(1, 8, 7);
489 my @sigma1=(19,61,6);
# Register names for the NEON path; these lexicals shadow the integer
# path's $Ktbl for the code that follows.
491 my $Ktbl="r3";
492 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
# @X: the 16-entry message schedule in d0..d15.
# @V: the working variables a..h in d16..d23; the list is rotated after
# every emitted round, while $A..$H keep the initial names.
494 my @X=map("d$_",(0..15));
495 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
# NEON_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h) - append the NEON assembly for
# round $i to the global $code.  The eight trailing arguments are the
# names of the d-registers currently playing the roles a..h.
#
# Temporaries are d24..d30 ($Maj is d30; the map yields an eighth name,
# d31, which is not assigned to anything).
#
# The first template (the three Sigma1 right shifts, the load of X[i]
# and the deferred Maj addition) is emitted only when $i<16 or $i is
# odd; for even rounds >= 16 NEON_16_79 has already emitted those
# instructions interleaved with the schedule update.  The "#if $i<16" /
# "#if $i>0" lines carry the literal round number into the output and
# are resolved later by the C preprocessor, not by Perl.
#
# The final "h += Maj" is left commented out in the output ("@ vadd...")
# on purpose: it is performed at the start of the following round
# ("h+=Maj from the past").
497 sub NEON_00_15() {
498 my $i=shift;
499 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
500 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
502 $code.=<<___ if ($i<16 || $i&1);
503 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
504 #if $i<16
505 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
506 #endif
507 vshr.u64 $t1,$e,#@Sigma1[1]
508 #if $i>0
509 vadd.i64 $a,$Maj @ h+=Maj from the past
510 #endif
511 vshr.u64 $t2,$e,#@Sigma1[2]
513 $code.=<<___;
514 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
515 vsli.64 $t0,$e,#`64-@Sigma1[0]`
516 vsli.64 $t1,$e,#`64-@Sigma1[1]`
517 vmov $Ch,$e
518 vsli.64 $t2,$e,#`64-@Sigma1[2]`
519 #if $i<16 && defined(__ARMEL__)
520 vrev64.8 @X[$i],@X[$i]
521 #endif
522 veor $t1,$t0
523 vbsl $Ch,$f,$g @ Ch(e,f,g)
524 vshr.u64 $t0,$a,#@Sigma0[0]
525 veor $t2,$t1 @ Sigma1(e)
526 vadd.i64 $T1,$Ch,$h
527 vshr.u64 $t1,$a,#@Sigma0[1]
528 vsli.64 $t0,$a,#`64-@Sigma0[0]`
529 vadd.i64 $T1,$t2
530 vshr.u64 $t2,$a,#@Sigma0[2]
531 vadd.i64 $K,@X[$i%16]
532 vsli.64 $t1,$a,#`64-@Sigma0[1]`
533 veor $Maj,$a,$b
534 vsli.64 $t2,$a,#`64-@Sigma0[2]`
535 veor $h,$t0,$t1
536 vadd.i64 $T1,$K
537 vbsl $Maj,$c,$b @ Maj(a,b,c)
538 veor $h,$t2 @ Sigma0(a)
539 vadd.i64 $d,$T1
540 vadd.i64 $Maj,$T1
541 @ vadd.i64 $h,$Maj
# NEON_16_79($i, $a..$h) - append the NEON assembly for a round with
# index >= 16 to the global $code.
#
# Odd rounds are delegated unchanged to NEON_00_15.  Even rounds first
# emit a two-lane message-schedule update: @X is viewed as q0..q7 (two
# 64-bit words per register) and the emitted code adds sigma1(X[i+14]),
# X[i+9] and sigma0(X[i+1]) into X[i], as annotated in the template.
# The three Sigma1 right shifts that NEON_00_15 would otherwise start
# with are interleaved here ("from NEON_00_15"), as is the deferred Maj
# addition (d30).  $i is halved to index the q-register view and the
# round itself is then emitted through NEON_00_15 with the original
# even index (2*$i).
545 sub NEON_16_79() {
546 my $i=shift;
548 if ($i&1) { &NEON_00_15($i,@_); return; }
550 # 2x-vectorized, therefore runs every 2nd round
551 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
552 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
553 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
554 my $e=@_[4]; # $e from NEON_00_15
555 $i /= 2;
556 $code.=<<___;
557 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
558 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
559 vadd.i64 @_[0],d30 @ h+=Maj from the past
560 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
561 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
562 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
563 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
564 veor $s1,$t0
565 vshr.u64 $t0,$s0,#@sigma0[0]
566 veor $s1,$t1 @ sigma1(X[i+14])
567 vshr.u64 $t1,$s0,#@sigma0[1]
568 vadd.i64 @X[$i%8],$s1
569 vshr.u64 $s1,$s0,#@sigma0[2]
570 vsli.64 $t0,$s0,#`64-@sigma0[0]`
571 vsli.64 $t1,$s0,#`64-@sigma0[1]`
572 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
573 veor $s1,$t0
574 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
575 vadd.i64 @X[$i%8],$s0
576 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
577 veor $s1,$t1 @ sigma0(X[i+1])
578 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
579 vadd.i64 @X[$i%8],$s1
581 &NEON_00_15(2*$i,@_);
584 $code.=<<___;
585 #if __ARM_MAX_ARCH__>=7
586 .arch armv7-a
587 .fpu neon
589 .global sha512_block_data_order_neon
590 .type sha512_block_data_order_neon,%function
591 .align 4
592 sha512_block_data_order_neon:
593 .LNEON:
594 dmb @ errata #451034 on early Cortex A8
595 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
596 VFP_ABI_PUSH
597 adrl $Ktbl,K512
598 vldmia $ctx,{$A-$H} @ load context
599 .Loop_neon:
# Unroll the NEON rounds.  The first loop emits rounds 0..15 as
# straight-line code.  The second loop emits 16 further rounds once; the
# generated code wraps them in the .L16_79_neon loop that runs $cnt=4
# times, covering rounds 16..79.  After each emitted round @V is rotated
# (last element moved to the front), so the register that served as h
# becomes a for the next round and no data has to be moved at run time.
601 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
602 $code.=<<___;
603 mov $cnt,#4
604 .L16_79_neon:
605 subs $cnt,#1
607 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
608 $code.=<<___;
609 bne .L16_79_neon
611 vadd.i64 $A,d30 @ h+=Maj from the past
612 vldmia $ctx,{d24-d31} @ load context to temp
613 vadd.i64 q8,q12 @ vectorized accumulate
614 vadd.i64 q9,q13
615 vadd.i64 q10,q14
616 vadd.i64 q11,q15
617 vstmia $ctx,{$A-$H} @ save context
618 teq $inp,$len
619 sub $Ktbl,#640 @ rewind K512
620 bne .Loop_neon
622 VFP_ABI_POP
623 ret @ bx lr
624 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
625 #endif
628 $code.=<<___;
629 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
630 .align 2
631 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
632 .comm OPENSSL_armcap_P,4,4
633 #endif
# Post-process the accumulated template:
#  1. evaluate every `...` expression embedded in the text and replace
#     it with its value (offset arithmetic, 64-N shift counts),
#  2. replace "bx lr" with its raw encoding so that the integer code
#     assembles for ARMv4,
#  3. only then turn "ret" into a real "bx lr"; the order matters, since
#     it keeps step 2 from rewriting the instruction produced here.
636 $code =~ s/\`([^\`]*)\`/eval $1/gem;
637 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
638 $code =~ s/\bret\b/bx lr/gm;
# Copy this script's leading comment block to the output, rewriting the
# leading '#' to the assembler comment character '@' and skipping the
# shebang line; stop at the first line that is neither a comment nor
# empty.  Best effort: if the script cannot be reopened the header is
# simply omitted.
640 open SELF,"<",$0;
641 while(<SELF>) {
642 next if (/^#!/);
643 last if (!s/^#/@/ and !/^$/);
644 print;
646 close SELF;
648 print $code;
# Buffered write errors (e.g. a full disk) only surface at close time;
# report them so a truncated output file does not pass for a good one.
649 close STDOUT or die "error closing STDOUT: $!"; # enforce flush