#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 6% improvement
# on the Cortex-A8 core, at ~40 cycles per processed byte.
#
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex-A8 core, at ~38 cycles per byte.
#
# March 2011.
#
# Add NEON implementation. On Cortex-A8 it was measured to process
# one byte in 23.3 cycles, or ~60% faster than the integer-only code.
#
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. As a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the *lower*
# address, which was reflected in the two parameters below as 0 and 4.
# Now the caller is expected to maintain native byte order for whole
# 64-bit values.
$hi="HI";
$lo="LO";
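# For example, with the little-endian (__ARMEL__) definitions emitted
# further below (LO=0, HI=4), the two halves of a 64-bit state word are
# fetched as:
#
#	ldr	$Alo,[$ctx,#$Aoff+$lo]	@ low 32 bits of a
#	ldr	$Ahi,[$ctx,#$Aoff+$hi]	@ high 32 bits of a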
# ====================================================================

while(($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}	# scan args for the output file name
open STDOUT,">$output";
$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
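# Resulting stack frame (sp-relative): copies of the eight working
# variables a..h at $Aoff..$Hoff, with the current expanded message word
# X[i] stored at $Xoff.  Each round's "sub sp,sp,#8" slides the frame
# down, so older X[] entries end up at higher offsets; the 80*8 bytes
# accumulated this way are released at once with "add sp,sp,#640".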
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
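	@ (Derivation, for reference: for a rotate amount n<32, ROTR64(x,n)
	@ has lo' = lo>>n ^ hi<<(32-n) and hi' = hi>>n ^ lo<<(32-n); for
	@ n>=32 the halves swap roles with n-32, e.g. ROTR(x,41)
	@ contributes hi>>9^lo<<23 to the low word, as listed above.)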
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
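@ WORD64 lays each 64-bit constant out in native dword order, e.g. the
@ first table entry K[0]=0x428a2f98d728ae22 is emitted low-word-first on
@ little-endian, so a plain two-word load reads the correct value.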

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,sha512_block_data_order
#endif
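@ In ARM state pc reads as the address of the current instruction plus 8,
@ so the pre-v7 "sub r3,pc,#8" above materializes the function's own
@ address; v7+/Thumb-2 builds (where the pc offset differs) use adr instead.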
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512 (80 8-byte entries + 32-byte pad precede us)
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
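# The "magic" argument to BODY_00_15 marks the last round of a pass
# without a separate counter: each round compares the low byte of K[i].lo
# against it and sets bit 0 of $Ktbl on a match.  0x94 is the low byte of
# K[15] (0xc19bf174cf692694), ending .L00_15; 0x17, the low byte of K[79]
# (0x6c44198c4a475817), later terminates .L16_79.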
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{{{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
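# Rotate/shift amounts per FIPS 180-4:
#   Sigma0(x) = ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39)
#   Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)
#   sigma0(x) = ROTR(x,1)  ^ ROTR(x,8)  ^ (x>>7)
#   sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x>>6)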
my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
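# NEON has no 64-bit rotate instruction, so every ROTR(x,n) below is
# synthesized as "vshr.u64 t,x,#n" followed by "vsli.64 t,x,#64-n":
# the shift-left-and-insert deposits x<<(64-n) into the bits vacated by
# the right shift, completing the rotation.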
$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
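	@ (The final h+=Maj is intentionally left commented out above: it
	@ is deferred to the "h+=Maj from the past" vadd at the top of the
	@ next round, presumably so the dependent add overlaps the next
	@ round's shifts instead of stalling here.)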
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
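# Viewing @X as q registers lets one call produce two rounds' worth of
# message schedule.  The 64-bit words X[i+1] and X[i+9], which straddle
# q-register boundaries, are assembled with vext.8, extracting 16 bytes
# at byte offset 8 from the concatenation of two neighbouring q registers.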
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adrl	$Ktbl,K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}}}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
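# 0xe12fff1e is the fixed A32 encoding of "bx lr"; emitting it as a raw
# .word lets gas accept the file at -march=armv4, where bx does not
# exist.  The "ret" markers in the NEON path are substituted afterwards,
# so they remain real "bx lr" instructions (the NEON path implies >= v7).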

# Copy this script's own leading comment block (license and history) into
# the generated file, converting "#" comments to the assembler's "@".
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush