1 #define __ARM_ARCH__ __LINUX_ARM_ARCH__
2 @ SPDX-License-Identifier: GPL-2.0
4 @ This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 @ has relicensed it under the GPLv2. Therefore this program is free software;
6 @ you can redistribute it and/or modify it under the terms of the GNU General
7 @ Public License version 2 as published by the Free Software Foundation.
9 @ The original headers, including the original license headers, are
10 @ included below for completeness.
12 @ ====================================================================
13 @ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
14 @ project. The module is, however, dual licensed under OpenSSL and
15 @ CRYPTOGAMS licenses depending on where you obtain it. For further
16 @ details see https://www.openssl.org/~appro/cryptogams/.
17 @ ====================================================================
19 @ sha1_block procedure for ARMv4.
23 @ Size/performance trade-off
24 @ ====================================================================
25 @ impl size in bytes comp cycles[*] measured performance
26 @ ====================================================================
28 @ armv4-small 392/+29% 1958/+64% 2250/+96%
29 @ armv4-compact 740/+89% 1552/+26% 1840/+22%
30 @ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
31 @ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
32 @ ====================================================================
33 @ thumb = same as 'small' but in Thumb instructions[**] and
34 @ with recurring code in two private functions;
35 @ small = detached Xload/update, loops are folded;
36 @ compact = detached Xload/update, 5x unroll;
37 @ large = interleaved Xload/update, 5x unroll;
38 @ full unroll = interleaved Xload/update, full unroll, estimated[!];
40 @ [*] Manually counted instructions in "grand" loop body. Measured
41 @ performance is affected by prologue and epilogue overhead,
42 @ i-cache availability, branch penalties, etc.
43 @ [**] While each Thumb instruction is half the size, they are not as
44 @ diverse as ARM ones: e.g., there are only two arithmetic
45 @ instructions with 3 arguments, no [fixed] rotate, and addressing
46 @ modes are limited. As a result it takes more instructions to do
47 @ the same job in Thumb, therefore the code is never half the
48 @ size and is always slower.
49 @ [***] which is also ~35% better than compiler generated code. Dual-
50 @ issue Cortex A8 core was measured to process input block in ~990 cycles.
55 @ Rescheduling for dual-issue pipeline resulted in 13% improvement on
56 @ Cortex A8 core and in absolute terms ~870 cycles per input block
57 @ [or 13.6 cycles per byte].
61 @ Profiler-assisted and platform-specific optimization resulted in 10%
62 @ improvement on Cortex A8 core and 12.2 cycles per byte.
64 #include <linux/linkage.h>
@ sha1_block_data_order: SHA-1 block transform (the file header calls it
@ the "sha1_block procedure for ARMv4").
@
@ NOTE(review): this listing is not contiguous. The original line numbers
@ embedded at the start of each line skip, and the labels targeted by the
@ branches below (.Lloop, .L_00_15, .L_20_39_or_60_79, .L_40_59, .L_done)
@ are not visible, nor are parts of the prologue, the message-schedule
@ loads/stores and the loop comparisons. The comments below describe only
@ what the visible instructions show.
@
@ NOTE(review): in the 00_15 rounds the K/F/ROR(A,27) triple appears twice
@ around the ldr; presumably these are the two arms of a preprocessor
@ conditional on __ARM_ARCH__ whose #if/#else/#endif lines are missing from
@ this view. TODO confirm against the full file.
@
@ Register usage, as far as the visible code shows:
@   r0       pointer to five 32-bit state words (loaded at entry, read
@            again and written back at the end of a block)
@   r1       input pointer, post-incremented by 4 on every word load
@   r2       scaled by 64 and added to r1 at entry to form the end-of-input
@            address; presumably the number of 64-byte blocks on entry,
@            verify against the C prototype
@   r3-r7    working variables; the A..E roles rotate through them, so each
@            unrolled round names a different register as "E"
@   r8       current round constant K (see the .LK_* literals at the end)
@   r9       message word X[i]
@   r10      scratch for the round function F(B,C,D)
@   r11,r12  scratch
@   r14      loop bound for the shared 20_39/60_79 loop (compared by teq)
@
@ The 20_39 and 60_79 rounds share one loop body. The carry flag records
@ which pass is running: "cmn sp,#0" clears it for 20_39, "cmp sp,#0" sets
@ it for 60_79, and the loop test uses teq so that carry is preserved.
69 ENTRY(sha1_block_data_order)
71 add r2,r1,r2,lsl#6 @ r2 = r1 + r2*64: address just past the end of the input at r1
72 ldmia r0,{r3,r4,r5,r6,r7} @ load the five state words from [r0] into r3-r7
@ ---- rounds 00..15: five unrolled rounds, repeated via "bne .L_00_15" ----
@ Each round adds K, ROR(A,27), X[i] and F_00_19(B,C,D) into the register
@ that currently plays the E role (r7, r6, r5, r4, r3 in turn).
79 mov r7,r7,ror#30 @ [6]; pre-rotate r7: the ror#2 applied in its round completes a full 32-bit rotation
85 add r7,r8,r7,ror#2 @ E+=K_00_19
88 eor r10,r5,r6 @ F_xx_xx
90 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
93 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
94 add r7,r8,r7,ror#2 @ E+=K_00_19
95 eor r10,r5,r6 @ F_xx_xx
96 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
102 add r7,r7,r9 @ E+=X[i]
103 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
105 add r7,r7,r10 @ E+=F_00_19(B,C,D)
110 add r6,r8,r6,ror#2 @ E+=K_00_19
113 eor r10,r4,r5 @ F_xx_xx
115 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
118 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
119 add r6,r8,r6,ror#2 @ E+=K_00_19
120 eor r10,r4,r5 @ F_xx_xx
121 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
123 rev r9,r9 @ byte swap
127 add r6,r6,r9 @ E+=X[i]
128 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
130 add r6,r6,r10 @ E+=F_00_19(B,C,D)
135 add r5,r8,r5,ror#2 @ E+=K_00_19
138 eor r10,r3,r4 @ F_xx_xx
140 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
143 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
144 add r5,r8,r5,ror#2 @ E+=K_00_19
145 eor r10,r3,r4 @ F_xx_xx
146 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
148 rev r9,r9 @ byte swap
152 add r5,r5,r9 @ E+=X[i]
153 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
155 add r5,r5,r10 @ E+=F_00_19(B,C,D)
160 add r4,r8,r4,ror#2 @ E+=K_00_19
163 eor r10,r7,r3 @ F_xx_xx
165 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
168 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
169 add r4,r8,r4,ror#2 @ E+=K_00_19
170 eor r10,r7,r3 @ F_xx_xx
171 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
173 rev r9,r9 @ byte swap
177 add r4,r4,r9 @ E+=X[i]
178 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
180 add r4,r4,r10 @ E+=F_00_19(B,C,D)
185 add r3,r8,r3,ror#2 @ E+=K_00_19
188 eor r10,r6,r7 @ F_xx_xx
190 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
193 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
194 add r3,r8,r3,ror#2 @ E+=K_00_19
195 eor r10,r6,r7 @ F_xx_xx
196 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
198 rev r9,r9 @ byte swap
202 add r3,r3,r9 @ E+=X[i]
203 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
205 add r3,r3,r10 @ E+=F_00_19(B,C,D)
@ Loop back while Z is clear; the instruction that sets the flags for this
@ test is not visible in this view.
207 bne .L_00_15 @ [((11+4)*5+2)*3]
@ ---- one more round that still loads X[i] from the input ----
213 add r7,r8,r7,ror#2 @ E+=K_00_19
216 eor r10,r5,r6 @ F_xx_xx
218 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
221 ldr r9,[r1],#4 @ X[i] = next input word, r1 += 4; handles unaligned
222 add r7,r8,r7,ror#2 @ E+=K_00_19
223 eor r10,r5,r6 @ F_xx_xx
224 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
226 rev r9,r9 @ byte swap
230 add r7,r7,r9 @ E+=X[i]
231 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
233 add r7,r7,r10 @ E+=F_00_19(B,C,D)
@ ---- four rounds with F_00_19 and no input load ----
@ X[i] is no longer read through r1 here; the loads feeding r9/r11/r12 are
@ not visible in this view. F is formed as (B & (C ^ D)) ^ D in r10.
237 add r6,r8,r6,ror#2 @ E+=K_xx_xx
240 eor r11,r11,r12 @ 1 cycle stall
241 eor r10,r4,r5 @ F_xx_xx
243 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
246 and r10,r3,r10,ror#2 @ F_xx_xx
248 add r6,r6,r9 @ E+=X[i]
249 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
250 add r6,r6,r10 @ E+=F_00_19(B,C,D)
254 add r5,r8,r5,ror#2 @ E+=K_xx_xx
257 eor r11,r11,r12 @ 1 cycle stall
258 eor r10,r3,r4 @ F_xx_xx
260 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
263 and r10,r7,r10,ror#2 @ F_xx_xx
265 add r5,r5,r9 @ E+=X[i]
266 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
267 add r5,r5,r10 @ E+=F_00_19(B,C,D)
271 add r4,r8,r4,ror#2 @ E+=K_xx_xx
274 eor r11,r11,r12 @ 1 cycle stall
275 eor r10,r7,r3 @ F_xx_xx
277 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
280 and r10,r6,r10,ror#2 @ F_xx_xx
282 add r4,r4,r9 @ E+=X[i]
283 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
284 add r4,r4,r10 @ E+=F_00_19(B,C,D)
288 add r3,r8,r3,ror#2 @ E+=K_xx_xx
291 eor r11,r11,r12 @ 1 cycle stall
292 eor r10,r6,r7 @ F_xx_xx
294 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
297 and r10,r5,r10,ror#2 @ F_xx_xx
299 add r3,r3,r9 @ E+=X[i]
300 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
301 add r3,r3,r10 @ E+=F_00_19(B,C,D)
@ ---- rounds 20..39 (and, on a second pass, 60..79) ----
@ r8 is switched to K_20_39 and carry is cleared to mark the first pass.
@ F is the three-way xor B ^ C ^ D, built in r10.
303 ldr r8,.LK_20_39 @ [+15+16*4]
304 cmn sp,#0 @ [+3], clear carry to denote 20_39
309 add r7,r8,r7,ror#2 @ E+=K_xx_xx
312 eor r11,r11,r12 @ 1 cycle stall
313 eor r10,r5,r6 @ F_xx_xx
315 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
318 eor r10,r4,r10,ror#2 @ F_xx_xx
320 add r7,r7,r9 @ E+=X[i]
321 add r7,r7,r10 @ E+=F_20_39(B,C,D)
325 add r6,r8,r6,ror#2 @ E+=K_xx_xx
328 eor r11,r11,r12 @ 1 cycle stall
329 eor r10,r4,r5 @ F_xx_xx
331 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
334 eor r10,r3,r10,ror#2 @ F_xx_xx
336 add r6,r6,r9 @ E+=X[i]
337 add r6,r6,r10 @ E+=F_20_39(B,C,D)
341 add r5,r8,r5,ror#2 @ E+=K_xx_xx
344 eor r11,r11,r12 @ 1 cycle stall
345 eor r10,r3,r4 @ F_xx_xx
347 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
350 eor r10,r7,r10,ror#2 @ F_xx_xx
352 add r5,r5,r9 @ E+=X[i]
353 add r5,r5,r10 @ E+=F_20_39(B,C,D)
357 add r4,r8,r4,ror#2 @ E+=K_xx_xx
360 eor r11,r11,r12 @ 1 cycle stall
361 eor r10,r7,r3 @ F_xx_xx
363 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
366 eor r10,r6,r10,ror#2 @ F_xx_xx
368 add r4,r4,r9 @ E+=X[i]
369 add r4,r4,r10 @ E+=F_20_39(B,C,D)
373 add r3,r8,r3,ror#2 @ E+=K_xx_xx
376 eor r11,r11,r12 @ 1 cycle stall
377 eor r10,r6,r7 @ F_xx_xx
379 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
382 eor r10,r5,r10,ror#2 @ F_xx_xx
384 add r3,r3,r9 @ E+=X[i]
385 add r3,r3,r10 @ E+=F_20_39(B,C,D)
@ Loop test: teq compares r14 with sp (ARM) or r11 (THUMB) without
@ disturbing the carry flag that marks the pass. ARM()/THUMB() presumably
@ emit their argument only for the matching instruction-set build; their
@ definitions are not in this file. When the loop ends, carry set means
@ the 60_79 pass just finished, so control leaves through .L_done.
386 ARM( teq r14,sp ) @ preserve carry
388 THUMB( teq r14,r11 ) @ preserve carry
389 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
390 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
@ ---- rounds 40..59 ----
@ sp is moved down by a further 20 words before these rounds. The visible
@ F terms are B & (C ^ D) in r10 and C & D in r11; the instruction that
@ adds the r11 term into E is not visible in this view.
393 sub sp,sp,#20*4 @ [+2]
398 add r7,r8,r7,ror#2 @ E+=K_xx_xx
401 eor r11,r11,r12 @ 1 cycle stall
402 eor r10,r5,r6 @ F_xx_xx
404 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
407 and r10,r4,r10,ror#2 @ F_xx_xx
408 and r11,r5,r6 @ F_xx_xx
409 add r7,r7,r9 @ E+=X[i]
410 add r7,r7,r10 @ E+=F_40_59(B,C,D)
415 add r6,r8,r6,ror#2 @ E+=K_xx_xx
418 eor r11,r11,r12 @ 1 cycle stall
419 eor r10,r4,r5 @ F_xx_xx
421 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
424 and r10,r3,r10,ror#2 @ F_xx_xx
425 and r11,r4,r5 @ F_xx_xx
426 add r6,r6,r9 @ E+=X[i]
427 add r6,r6,r10 @ E+=F_40_59(B,C,D)
432 add r5,r8,r5,ror#2 @ E+=K_xx_xx
435 eor r11,r11,r12 @ 1 cycle stall
436 eor r10,r3,r4 @ F_xx_xx
438 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
441 and r10,r7,r10,ror#2 @ F_xx_xx
442 and r11,r3,r4 @ F_xx_xx
443 add r5,r5,r9 @ E+=X[i]
444 add r5,r5,r10 @ E+=F_40_59(B,C,D)
449 add r4,r8,r4,ror#2 @ E+=K_xx_xx
452 eor r11,r11,r12 @ 1 cycle stall
453 eor r10,r7,r3 @ F_xx_xx
455 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
458 and r10,r6,r10,ror#2 @ F_xx_xx
459 and r11,r7,r3 @ F_xx_xx
460 add r4,r4,r9 @ E+=X[i]
461 add r4,r4,r10 @ E+=F_40_59(B,C,D)
466 add r3,r8,r3,ror#2 @ E+=K_xx_xx
469 eor r11,r11,r12 @ 1 cycle stall
470 eor r10,r6,r7 @ F_xx_xx
472 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
475 and r10,r5,r10,ror#2 @ F_xx_xx
476 and r11,r6,r7 @ F_xx_xx
477 add r3,r3,r9 @ E+=X[i]
478 add r3,r3,r10 @ E+=F_40_59(B,C,D)
481 bne .L_40_59 @ [+((12+5)*5+2)*4]
@ Re-enter the shared loop for the 60_79 pass, this time with carry set.
485 cmp sp,#0 @ set carry to denote 60_79
486 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
@ ---- end of one block ----
@ Release the 80-word stack area, fetch the previous state words and write
@ r3-r7 back to [r0]. The instructions between the ldmia and the stmia
@ that combine the two sets of values are not visible in this view.
488 add sp,sp,#80*4 @ "deallocate" stack frame
489 ldmia r0,{r8,r9,r10,r11,r12} @ previous state words from [r0] into r8-r12
495 stmia r0,{r3,r4,r5,r6,r7} @ store r3-r7 back to the state at [r0]
497 bne .Lloop @ [+18], total 1307
499 ldmia sp!,{r4-r12,pc} @ pop r4-r12 and load pc from the stack: return (matching save not visible here)
@ Round constants, one literal per group of 20 rounds as named by the labels.
501 .LK_00_19: .word 0x5a827999
502 .LK_20_39: .word 0x6ed9eba1
503 .LK_40_59: .word 0x8f1bbcdc
504 .LK_60_79: .word 0xca62c1d6
505 ENDPROC(sha1_block_data_order)
@ Identification string assembled after the function; .asciz appends a
@ terminating zero byte.
506 .asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"