#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size of an ARM one,
@	the instruction set is not as rich: e.g., there are only two
@	arithmetic instructions with 3 arguments, no [fixed] rotate,
@	and addressing modes are limited. As a result it takes more
@	instructions to do the same job in Thumb, so the code is never
@	twice as small and is always slower.
@ [***]	which is also ~35% better than compiler-generated code. Dual-
@	issue Cortex A8 core was measured to process input block in

@ Rescheduling for the dual-issue pipeline resulted in a 13% improvement
@ on a Cortex A8 core, i.e. ~870 cycles per input block in absolute terms
@ [or 13.6 cycles per byte].

@ Profiler-assisted and platform-specific optimization resulted in a 10%
@ improvement on the Cortex A8 core and 12.2 cycles per byte.
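
@ For reference, the round operation that the per-instruction comments
@ below describe, written out in C-like pseudocode (a sketch of the
@ standard FIPS 180 round, using this file's naming):
@
@	for (i = 0; i < 80; i++) {
@		F = (i < 20) ? D ^ (B & (C ^ D))		/* F_00_19 */
@		  : (i < 40) ? B ^ C ^ D			/* F_20_39 */
@		  : (i < 60) ? (B & (C ^ D)) + (C & D)		/* F_40_59, i.e. Maj */
@		  :            B ^ C ^ D;			/* F_60_79 */
@		E += ROR(A,27) + F + K_xx_xx + X[i];		/* ROR(A,27) == ROL(A,5) */
@		B  = ROR(B,2);
@		(A,B,C,D,E) = (E,A,B,C,D);
@	}
@
@ In the code below the five values are not moved around; instead the
@ register roles rotate each round (A..E cycle through r3..r7), and the
@ ROR #2 of B is folded into the ",ror#2" operand suffixes.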
#include <linux/linkage.h>

ENTRY(sha1_block_data_order)
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
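@ Register use, as reflected in the comments below: r0 points at the five
@ 32-bit chain words, r1 at the input data, and r2 now marks the end of
@ the input (64 bytes per block). The working values A..E live in r3..r7,
@ r8 holds the current round constant K_xx_xx, r9 the current X[i], and
@ r10-r12 serve as scratch for the F function and the on-stack schedule.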
	mov	r7,r7,ror#30	@ [6]
	add	r7,r8,r7,ror#2	@ E+=K_00_19
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r7,r8,r7,ror#2	@ E+=K_00_19
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	add	r7,r7,r9	@ E+=X[i]
	eor	r10,r10,r6,ror#2	@ F_00_19(B,C,D)
	add	r7,r7,r10	@ E+=F_00_19(B,C,D)
	add	r6,r8,r6,ror#2	@ E+=K_00_19
	eor	r10,r4,r5	@ F_xx_xx
	add	r6,r6,r7,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r6,r8,r6,ror#2	@ E+=K_00_19
	eor	r10,r4,r5	@ F_xx_xx
	add	r6,r6,r7,ror#27	@ E+=ROR(A,27)
	rev	r9,r9	@ byte swap
	add	r6,r6,r9	@ E+=X[i]
	eor	r10,r10,r5,ror#2	@ F_00_19(B,C,D)
	add	r6,r6,r10	@ E+=F_00_19(B,C,D)
	add	r5,r8,r5,ror#2	@ E+=K_00_19
	eor	r10,r3,r4	@ F_xx_xx
	add	r5,r5,r6,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r5,r8,r5,ror#2	@ E+=K_00_19
	eor	r10,r3,r4	@ F_xx_xx
	add	r5,r5,r6,ror#27	@ E+=ROR(A,27)
	rev	r9,r9	@ byte swap
	add	r5,r5,r9	@ E+=X[i]
	eor	r10,r10,r4,ror#2	@ F_00_19(B,C,D)
	add	r5,r5,r10	@ E+=F_00_19(B,C,D)
	add	r4,r8,r4,ror#2	@ E+=K_00_19
	eor	r10,r7,r3	@ F_xx_xx
	add	r4,r4,r5,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r4,r8,r4,ror#2	@ E+=K_00_19
	eor	r10,r7,r3	@ F_xx_xx
	add	r4,r4,r5,ror#27	@ E+=ROR(A,27)
	rev	r9,r9	@ byte swap
	add	r4,r4,r9	@ E+=X[i]
	eor	r10,r10,r3,ror#2	@ F_00_19(B,C,D)
	add	r4,r4,r10	@ E+=F_00_19(B,C,D)
	add	r3,r8,r3,ror#2	@ E+=K_00_19
	eor	r10,r6,r7	@ F_xx_xx
	add	r3,r3,r4,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r3,r8,r3,ror#2	@ E+=K_00_19
	eor	r10,r6,r7	@ F_xx_xx
	add	r3,r3,r4,ror#27	@ E+=ROR(A,27)
	rev	r9,r9	@ byte swap
	add	r3,r3,r9	@ E+=X[i]
	eor	r10,r10,r7,ror#2	@ F_00_19(B,C,D)
	add	r3,r3,r10	@ E+=F_00_19(B,C,D)
	bne	.L_00_15	@ [((11+4)*5+2)*3]
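@ The loop above covers rounds 0..14 (five rounds per pass, three passes,
@ per the cycle-count note); round 15 follows inline.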
	add	r7,r8,r7,ror#2	@ E+=K_00_19
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	ldr	r9,[r1],#4	@ handles unaligned
	add	r7,r8,r7,ror#2	@ E+=K_00_19
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	rev	r9,r9	@ byte swap
	add	r7,r7,r9	@ E+=X[i]
	eor	r10,r10,r6,ror#2	@ F_00_19(B,C,D)
	add	r7,r7,r10	@ E+=F_00_19(B,C,D)
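@ Rounds 16..19 (and everything after) take X[i] from the message
@ schedule kept on the stack instead of from the input:
@	X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
@ The schedule updates are interleaved with the round arithmetic below
@ (the eor on r11/r12 is part of them), and the "1 cycle stall" notes
@ mark the load-to-use latency on the freshly loaded schedule words.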
	add	r6,r8,r6,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r4,r5	@ F_xx_xx
	add	r6,r6,r7,ror#27	@ E+=ROR(A,27)
	and	r10,r3,r10,ror#2	@ F_xx_xx
	add	r6,r6,r9	@ E+=X[i]
	eor	r10,r10,r5,ror#2	@ F_00_19(B,C,D)
	add	r6,r6,r10	@ E+=F_00_19(B,C,D)
	add	r5,r8,r5,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r3,r4	@ F_xx_xx
	add	r5,r5,r6,ror#27	@ E+=ROR(A,27)
	and	r10,r7,r10,ror#2	@ F_xx_xx
	add	r5,r5,r9	@ E+=X[i]
	eor	r10,r10,r4,ror#2	@ F_00_19(B,C,D)
	add	r5,r5,r10	@ E+=F_00_19(B,C,D)
	add	r4,r8,r4,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r7,r3	@ F_xx_xx
	add	r4,r4,r5,ror#27	@ E+=ROR(A,27)
	and	r10,r6,r10,ror#2	@ F_xx_xx
	add	r4,r4,r9	@ E+=X[i]
	eor	r10,r10,r3,ror#2	@ F_00_19(B,C,D)
	add	r4,r4,r10	@ E+=F_00_19(B,C,D)
	add	r3,r8,r3,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r6,r7	@ F_xx_xx
	add	r3,r3,r4,ror#27	@ E+=ROR(A,27)
	and	r10,r5,r10,ror#2	@ F_xx_xx
	add	r3,r3,r9	@ E+=X[i]
	eor	r10,r10,r7,ror#2	@ F_00_19(B,C,D)
	add	r3,r3,r10	@ E+=F_00_19(B,C,D)
	ldr	r8,.LK_20_39	@ [+15+16*4]
	cmn	sp,#0	@ [+3], clear carry to denote 20_39
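@ Rounds 20..39 and 60..79 use the same F function (B^C^D) and share the
@ loop body at .L_20_39_or_60_79; only the constant in r8 differs. The
@ carry flag tells the two passes apart: "cmn sp,#0" clears it here for
@ rounds 20..39, "cmp sp,#0" sets it before re-entering for 60..79,
@ "teq" preserves it across iterations, and "bcs .L_done" exits once
@ rounds 60..79 have completed.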
	add	r7,r8,r7,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	eor	r10,r4,r10,ror#2	@ F_xx_xx
	add	r7,r7,r9	@ E+=X[i]
	add	r7,r7,r10	@ E+=F_20_39(B,C,D)
	add	r6,r8,r6,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r4,r5	@ F_xx_xx
	add	r6,r6,r7,ror#27	@ E+=ROR(A,27)
	eor	r10,r3,r10,ror#2	@ F_xx_xx
	add	r6,r6,r9	@ E+=X[i]
	add	r6,r6,r10	@ E+=F_20_39(B,C,D)
	add	r5,r8,r5,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r3,r4	@ F_xx_xx
	add	r5,r5,r6,ror#27	@ E+=ROR(A,27)
	eor	r10,r7,r10,ror#2	@ F_xx_xx
	add	r5,r5,r9	@ E+=X[i]
	add	r5,r5,r10	@ E+=F_20_39(B,C,D)
	add	r4,r8,r4,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r7,r3	@ F_xx_xx
	add	r4,r4,r5,ror#27	@ E+=ROR(A,27)
	eor	r10,r6,r10,ror#2	@ F_xx_xx
	add	r4,r4,r9	@ E+=X[i]
	add	r4,r4,r10	@ E+=F_20_39(B,C,D)
	add	r3,r8,r3,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r6,r7	@ F_xx_xx
	add	r3,r3,r4,ror#27	@ E+=ROR(A,27)
	eor	r10,r5,r10,ror#2	@ F_xx_xx
	add	r3,r3,r9	@ E+=X[i]
	add	r3,r3,r10	@ E+=F_20_39(B,C,D)
 ARM(	teq	r14,sp	)	@ preserve carry
 THUMB(	teq	r14,r11	)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done	@ [+((12+3)*5+2)*4], spare 300 bytes
	sub	sp,sp,#20*4	@ [+2]
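@ Rounds 40..59: F_40_59(B,C,D) is the majority function
@ (B&C)|(B&D)|(C&D), computed below as (B & (C^D)) + (C & D); the two
@ terms have no bits in common, so adding them into E is equivalent to
@ OR-ing them.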
	add	r7,r8,r7,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r5,r6	@ F_xx_xx
	add	r7,r7,r3,ror#27	@ E+=ROR(A,27)
	and	r10,r4,r10,ror#2	@ F_xx_xx
	and	r11,r5,r6	@ F_xx_xx
	add	r7,r7,r9	@ E+=X[i]
	add	r7,r7,r10	@ E+=F_40_59(B,C,D)
	add	r6,r8,r6,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r4,r5	@ F_xx_xx
	add	r6,r6,r7,ror#27	@ E+=ROR(A,27)
	and	r10,r3,r10,ror#2	@ F_xx_xx
	and	r11,r4,r5	@ F_xx_xx
	add	r6,r6,r9	@ E+=X[i]
	add	r6,r6,r10	@ E+=F_40_59(B,C,D)
	add	r5,r8,r5,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r3,r4	@ F_xx_xx
	add	r5,r5,r6,ror#27	@ E+=ROR(A,27)
	and	r10,r7,r10,ror#2	@ F_xx_xx
	and	r11,r3,r4	@ F_xx_xx
	add	r5,r5,r9	@ E+=X[i]
	add	r5,r5,r10	@ E+=F_40_59(B,C,D)
	add	r4,r8,r4,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r7,r3	@ F_xx_xx
	add	r4,r4,r5,ror#27	@ E+=ROR(A,27)
	and	r10,r6,r10,ror#2	@ F_xx_xx
	and	r11,r7,r3	@ F_xx_xx
	add	r4,r4,r9	@ E+=X[i]
	add	r4,r4,r10	@ E+=F_40_59(B,C,D)
	add	r3,r8,r3,ror#2	@ E+=K_xx_xx
	eor	r11,r11,r12	@ 1 cycle stall
	eor	r10,r6,r7	@ F_xx_xx
	add	r3,r3,r4,ror#27	@ E+=ROR(A,27)
	and	r10,r5,r10,ror#2	@ F_xx_xx
	and	r11,r6,r7	@ F_xx_xx
	add	r3,r3,r9	@ E+=X[i]
	add	r3,r3,r10	@ E+=F_40_59(B,C,D)
	bne	.L_40_59	@ [+((12+5)*5+2)*4]
	cmp	sp,#0	@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
	add	sp,sp,#80*4	@ "deallocate" stack frame
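@ One 64-byte block done: reload the previous chain values into r8..r12,
@ add them into the working registers r3..r7, store the result back
@ through r0, and loop while r1 has not reached the end pointer in r2.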
	ldmia	r0,{r8,r9,r10,r11,r12}
	stmia	r0,{r3,r4,r5,r6,r7}
	bne	.Lloop	@ [+18], total 1307
	ldmia	sp!,{r4-r12,pc}

.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"