#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.

# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].

# July 2010.

# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex-A8 core and ~40 cycles per processed byte.

# February 2011.

# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex-A8 core and ~38 cycles per byte.

# March 2011.

# Add NEON implementation. On Cortex-A8 it was measured to process
# one byte in 23.3 cycles, or ~60% faster than the integer-only code.

# August 2012.

# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the caller
# is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
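# A minimal illustrative sketch (not used by the generator): the LO/HI
# macros emitted further down resolve to the byte offsets of the two 32-bit
# halves within a native 64-bit state word. The helper name below is
# hypothetical and exists only to document that convention.
sub dword_half_offsets {
	my $little_endian = shift;
	# little-endian: low half first; big-endian: high half first
	return $little_endian ? { lo => 0, hi => 4 } : { lo => 4, hi => 0 };
}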
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
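# The first argument that looks like a filename becomes the output file,
# e.g. "perl sha512-armv4.pl sha512-core.S" (the .S name here is only an
# example; the build system supplies its own target name).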
$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
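# The integer code keeps working copies of the state on the stack: the
# offsets above place a..h at sp+0..sp+56 and the 16-dword message-schedule
# ring starting at sp+64 ($Xoff). BODY_00_15 below splits each 64-bit
# Sigma1/Sigma0 rotation into 32-bit shifts on the lo/hi halves; a reference
# sketch of that decomposition, in plain Perl and with a hypothetical name
# (it is not called anywhere in this generator), is:
sub Sigma1_ref {
	my ($ehi,$elo) = @_;	# 64-bit e as two 32-bit halves
	my $rlo = (($elo>>14|$ehi<<18) ^ ($elo>>18|$ehi<<14) ^ ($ehi>>9|$elo<<23)) & 0xffffffff;
	my $rhi = (($ehi>>14|$elo<<18) ^ ($ehi>>18|$elo<<14) ^ ($elo>>9|$ehi<<23)) & 0xffffffff;
	return ($rhi,$rlo);
}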
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
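	@ the low byte of K[i].lo equals the magic byte passed to
	@ BODY_00_15 only in the last round of a pass (0x94 for K[15],
	@ 0x17 for K[79]); bit 0 of $Ktbl then serves as the loop-exit
	@ flag tested after the macro expands.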
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax	unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
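@ .LOPENSSL_armcap holds the offset of OPENSSL_armcap_P from
@ sha512_block_data_order; the entry point below reads it to decide at
@ run time whether to branch to the NEON path. Kernel builds omit it;
@ there the NEON code remains reachable directly through the global
@ sha512_block_data_order_neon symbol.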
.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8
	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
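	@ pre-ARMv7 cores assemble each big-endian 64-bit word a byte at a
	@ time with ldrb/orr (no alignment or endianness assumptions);
	@ ARMv7+ loads two words and byte-swaps them with rev on
	@ little-endian targets.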
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1,  8, 7);
my @sigma1=(19,61, 6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
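# NEON path register allocation, as set up above: the message schedule
# X[0..15] lives in d0-d15, the working state a..h in d16-d23, and d24-d31
# serve as temporaries (K[i], Ch, Maj, T1). Maj is folded into h one round
# late ("h+=Maj from the past"), which is why rounds after the first add
# the previous round's Maj (kept in d30) before anything else.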
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
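# Even-numbered rounds view the schedule as eight 128-bit q registers and
# compute sigma0/sigma1 for two consecutive X[] entries at once, then hand
# off to NEON_00_15 for the per-round bookkeeping; odd rounds are handled
# entirely by the early return above.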
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adrl	$Ktbl,K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640		@ rewind K512
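	@ 640 = 80 table entries * 8 bytes, i.e. one full pass over K512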
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}

$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
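# Note the ordering: existing "bx lr" instructions (pre-ARMv5 epilogue) are
# rewritten to their raw instruction encoding first, so the "ret" in the
# NEON epilogue, which only ever assembles on ARMv7+, can still become a
# plain "bx lr".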
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
print $code;
close STDOUT; # enforce flush