1 #define __ARM_ARCH__ __LINUX_ARM_ARCH__
2 @ ====================================================================
3 @ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 @ project. The module is, however, dual licensed under OpenSSL and
5 @ CRYPTOGAMS licenses depending on where you obtain it. For further
6 @ details see http://www.openssl.org/~appro/cryptogams/.
7 @ ====================================================================
9 @ sha1_block procedure for ARMv4.
13 @ Size/performance trade-off
14 @ ====================================================================
15 @ impl size in bytes comp cycles[*] measured performance
16 @ ====================================================================
18 @ armv4-small 392/+29% 1958/+64% 2250/+96%
19 @ armv4-compact 740/+89% 1552/+26% 1840/+22%
20 @ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
21 @ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
22 @ ====================================================================
23 @ thumb = same as 'small' but in Thumb instructions[**] and
24 @ with recurring code in two private functions;
25 @ small = detached Xload/update, loops are folded;
26 @ compact = detached Xload/update, 5x unroll;
27 @ large = interleaved Xload/update, 5x unroll;
28 @ full unroll = interleaved Xload/update, full unroll, estimated[!];
30 @ [*] Manually counted instructions in "grand" loop body. Measured
31 @ performance is affected by prologue and epilogue overhead,
32 @ i-cache availability, branch penalties, etc.
33 @ [**] While each Thumb instruction is half the size, they are not as
34 @ diverse as ARM ones: e.g., there are only two arithmetic
35 @ instructions with 3 arguments, no [fixed] rotate, addressing
36 @ modes are limited. As a result it takes more instructions to do
37 @ the same job in Thumb, therefore the code is never twice as
38 @ small and always slower.
39 @ [***] which is also ~35% better than compiler generated code. Dual-
40 @ issue Cortex A8 core was measured to process input block in
45 @ Rescheduling for dual-issue pipeline resulted in 13% improvement on
46 @ Cortex A8 core and in absolute terms ~870 cycles per input block
47 @ [or 13.6 cycles per byte].
51 @ Profiler-assisted and platform-specific optimization resulted in 10%
52 @ improvement on Cortex A8 core and 12.2 cycles per byte.
@-----------------------------------------------------------------------
@ void sha1_block_data_order(u32 *state, const void *input, size_t num)
@ In:   r0 = pointer to the 5-word SHA-1 chaining state (A..E)
@       r1 = pointer to input data; loaded word-at-a-time via
@            ldr r9,[r1],#4, which per its own comment "handles unaligned"
@       r2 = number of 64-byte blocks (converted below to an end pointer)
@ Work: r3-r7 hold the SHA-1 state words A,B,C,D,E (roles rotate each
@       round); r8 = round constant K; r9 = message word X[i];
@       r10/r11/r12 = scratch for the round function F — per the
@       generator's own end-of-line comments.
@ NOTE(review): this listing appears to be a sampled/truncated excerpt of
@ a CRYPTOGAMS-generated sha1-armv4 file: labels branched to below
@ (.Lloop, .L_00_15, .L_20_39_or_60_79, .L_40_59, .L_done), the
@ register-save prologue (stmdb sp!,{...}) and the per-block state
@ accumulation are not visible here. Confirm against the full generated
@ file before assembling this excerpt on its own.
56 .global sha1_block_data_order
57 .type sha1_block_data_order,%function
60 sha1_block_data_order:
62 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
63 ldmia r0,{r3,r4,r5,r6,r7}
70 mov r7,r7,ror#30 @ [6]
@ --- rounds 0..15: F_00_19, K loaded from .LK_00_19 (0x5a827999) ---
@ Every round below has the same shape: E += K; compute F(B,C,D) into
@ r10; E += ROR(A,27); load + byte-swap X[i]; E += X[i]; E += F.
@ The destination register cycles r7,r6,r5,r4,r3 between rounds instead
@ of moving data (register renaming by unrolling).
76 add r7,r8,r7,ror#2 @ E+=K_00_19
79 eor r10,r5,r6 @ F_xx_xx
81 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
84 ldr r9,[r1],#4 @ handles unaligned
85 add r7,r8,r7,ror#2 @ E+=K_00_19
86 eor r10,r5,r6 @ F_xx_xx
87 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
93 add r7,r7,r9 @ E+=X[i]
94 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
96 add r7,r7,r10 @ E+=F_00_19(B,C,D)
101 add r6,r8,r6,ror#2 @ E+=K_00_19
104 eor r10,r4,r5 @ F_xx_xx
106 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
109 ldr r9,[r1],#4 @ handles unaligned
110 add r6,r8,r6,ror#2 @ E+=K_00_19
111 eor r10,r4,r5 @ F_xx_xx
112 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
@ NOTE(review): rev requires ARMv6+; presumably the full file guards the
@ rev-based byte swap behind an __ARM_ARCH__ #if — confirm, the guard is
@ not visible in this excerpt.
114 rev r9,r9 @ byte swap
118 add r6,r6,r9 @ E+=X[i]
119 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
121 add r6,r6,r10 @ E+=F_00_19(B,C,D)
126 add r5,r8,r5,ror#2 @ E+=K_00_19
129 eor r10,r3,r4 @ F_xx_xx
131 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
134 ldr r9,[r1],#4 @ handles unaligned
135 add r5,r8,r5,ror#2 @ E+=K_00_19
136 eor r10,r3,r4 @ F_xx_xx
137 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
139 rev r9,r9 @ byte swap
143 add r5,r5,r9 @ E+=X[i]
144 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
146 add r5,r5,r10 @ E+=F_00_19(B,C,D)
151 add r4,r8,r4,ror#2 @ E+=K_00_19
154 eor r10,r7,r3 @ F_xx_xx
156 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
159 ldr r9,[r1],#4 @ handles unaligned
160 add r4,r8,r4,ror#2 @ E+=K_00_19
161 eor r10,r7,r3 @ F_xx_xx
162 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
164 rev r9,r9 @ byte swap
168 add r4,r4,r9 @ E+=X[i]
169 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
171 add r4,r4,r10 @ E+=F_00_19(B,C,D)
176 add r3,r8,r3,ror#2 @ E+=K_00_19
179 eor r10,r6,r7 @ F_xx_xx
181 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
184 ldr r9,[r1],#4 @ handles unaligned
185 add r3,r8,r3,ror#2 @ E+=K_00_19
186 eor r10,r6,r7 @ F_xx_xx
187 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
189 rev r9,r9 @ byte swap
193 add r3,r3,r9 @ E+=X[i]
194 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
196 add r3,r3,r10 @ E+=F_00_19(B,C,D)
@ Close of the 5x-unrolled 0..15 load loop (the .L_00_15 label itself is
@ not visible in this excerpt).
198 bne .L_00_15 @ [((11+4)*5+2)*3]
@ --- final 00_19-style round after the load loop ---
203 add r7,r8,r7,ror#2 @ E+=K_00_19
206 eor r10,r5,r6 @ F_xx_xx
208 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
211 ldr r9,[r1],#4 @ handles unaligned
212 add r7,r8,r7,ror#2 @ E+=K_00_19
213 eor r10,r5,r6 @ F_xx_xx
214 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
216 rev r9,r9 @ byte swap
220 add r7,r7,r9 @ E+=X[i]
221 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
223 add r7,r7,r10 @ E+=F_00_19(B,C,D)
@ --- rounds 16..19: still F_00_19, but X[i] now comes from the message
@ schedule recurrence (the "eor r11,r11,r12" Xupdate lines); the schedule
@ loads/stores around them are not visible in this excerpt.
227 add r6,r8,r6,ror#2 @ E+=K_xx_xx
230 eor r11,r11,r12 @ 1 cycle stall
231 eor r10,r4,r5 @ F_xx_xx
233 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
236 and r10,r3,r10,ror#2 @ F_xx_xx
238 add r6,r6,r9 @ E+=X[i]
239 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
240 add r6,r6,r10 @ E+=F_00_19(B,C,D)
244 add r5,r8,r5,ror#2 @ E+=K_xx_xx
247 eor r11,r11,r12 @ 1 cycle stall
248 eor r10,r3,r4 @ F_xx_xx
250 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
253 and r10,r7,r10,ror#2 @ F_xx_xx
255 add r5,r5,r9 @ E+=X[i]
256 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
257 add r5,r5,r10 @ E+=F_00_19(B,C,D)
261 add r4,r8,r4,ror#2 @ E+=K_xx_xx
264 eor r11,r11,r12 @ 1 cycle stall
265 eor r10,r7,r3 @ F_xx_xx
267 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
270 and r10,r6,r10,ror#2 @ F_xx_xx
272 add r4,r4,r9 @ E+=X[i]
273 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
274 add r4,r4,r10 @ E+=F_00_19(B,C,D)
278 add r3,r8,r3,ror#2 @ E+=K_xx_xx
281 eor r11,r11,r12 @ 1 cycle stall
282 eor r10,r6,r7 @ F_xx_xx
284 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
287 and r10,r5,r10,ror#2 @ F_xx_xx
289 add r3,r3,r9 @ E+=X[i]
290 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
291 add r3,r3,r10 @ E+=F_00_19(B,C,D)
@ --- switch to the rounds 20..39 constant; the carry flag is used below
@ to tell the 20..39 pass (carry clear) from the 60..79 pass (carry set),
@ so the same unrolled body serves both ranges.
293 ldr r8,.LK_20_39 @ [+15+16*4]
295 cmn sp,#0 @ [+3], clear carry to denote 20_39
@ --- shared body for rounds 20..39 and 60..79: F_20_39 = B^C^D ---
300 add r7,r8,r7,ror#2 @ E+=K_xx_xx
303 eor r11,r11,r12 @ 1 cycle stall
304 eor r10,r5,r6 @ F_xx_xx
306 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
309 eor r10,r4,r10,ror#2 @ F_xx_xx
311 add r7,r7,r9 @ E+=X[i]
312 add r7,r7,r10 @ E+=F_20_39(B,C,D)
316 add r6,r8,r6,ror#2 @ E+=K_xx_xx
319 eor r11,r11,r12 @ 1 cycle stall
320 eor r10,r4,r5 @ F_xx_xx
322 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
325 eor r10,r3,r10,ror#2 @ F_xx_xx
327 add r6,r6,r9 @ E+=X[i]
328 add r6,r6,r10 @ E+=F_20_39(B,C,D)
332 add r5,r8,r5,ror#2 @ E+=K_xx_xx
335 eor r11,r11,r12 @ 1 cycle stall
336 eor r10,r3,r4 @ F_xx_xx
338 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
341 eor r10,r7,r10,ror#2 @ F_xx_xx
343 add r5,r5,r9 @ E+=X[i]
344 add r5,r5,r10 @ E+=F_20_39(B,C,D)
348 add r4,r8,r4,ror#2 @ E+=K_xx_xx
351 eor r11,r11,r12 @ 1 cycle stall
352 eor r10,r7,r3 @ F_xx_xx
354 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
357 eor r10,r6,r10,ror#2 @ F_xx_xx
359 add r4,r4,r9 @ E+=X[i]
360 add r4,r4,r10 @ E+=F_20_39(B,C,D)
364 add r3,r8,r3,ror#2 @ E+=K_xx_xx
367 eor r11,r11,r12 @ 1 cycle stall
368 eor r10,r6,r7 @ F_xx_xx
370 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
373 eor r10,r5,r10,ror#2 @ F_xx_xx
375 add r3,r3,r9 @ E+=X[i]
376 add r3,r3,r10 @ E+=F_20_39(B,C,D)
@ teq updates Z without touching C, so the 20_39-vs-60_79 carry marker
@ survives the loop-exit test; bcs leaves for .L_done only on the 60..79
@ (carry-set) pass.
377 teq r14,sp @ preserve carry
378 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
379 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
382 sub sp,sp,#20*4 @ [+2]
@ --- rounds 40..59: F_40_59 uses the and/and (r10,r11) majority-style
@ pattern; K presumably reloaded from .LK_40_59 in lines not visible in
@ this excerpt — confirm against the full file.
387 add r7,r8,r7,ror#2 @ E+=K_xx_xx
390 eor r11,r11,r12 @ 1 cycle stall
391 eor r10,r5,r6 @ F_xx_xx
393 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
396 and r10,r4,r10,ror#2 @ F_xx_xx
397 and r11,r5,r6 @ F_xx_xx
398 add r7,r7,r9 @ E+=X[i]
399 add r7,r7,r10 @ E+=F_40_59(B,C,D)
404 add r6,r8,r6,ror#2 @ E+=K_xx_xx
407 eor r11,r11,r12 @ 1 cycle stall
408 eor r10,r4,r5 @ F_xx_xx
410 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
413 and r10,r3,r10,ror#2 @ F_xx_xx
414 and r11,r4,r5 @ F_xx_xx
415 add r6,r6,r9 @ E+=X[i]
416 add r6,r6,r10 @ E+=F_40_59(B,C,D)
421 add r5,r8,r5,ror#2 @ E+=K_xx_xx
424 eor r11,r11,r12 @ 1 cycle stall
425 eor r10,r3,r4 @ F_xx_xx
427 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
430 and r10,r7,r10,ror#2 @ F_xx_xx
431 and r11,r3,r4 @ F_xx_xx
432 add r5,r5,r9 @ E+=X[i]
433 add r5,r5,r10 @ E+=F_40_59(B,C,D)
438 add r4,r8,r4,ror#2 @ E+=K_xx_xx
441 eor r11,r11,r12 @ 1 cycle stall
442 eor r10,r7,r3 @ F_xx_xx
444 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
447 and r10,r6,r10,ror#2 @ F_xx_xx
448 and r11,r7,r3 @ F_xx_xx
449 add r4,r4,r9 @ E+=X[i]
450 add r4,r4,r10 @ E+=F_40_59(B,C,D)
455 add r3,r8,r3,ror#2 @ E+=K_xx_xx
458 eor r11,r11,r12 @ 1 cycle stall
459 eor r10,r6,r7 @ F_xx_xx
461 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
464 and r10,r5,r10,ror#2 @ F_xx_xx
465 and r11,r6,r7 @ F_xx_xx
466 add r3,r3,r9 @ E+=X[i]
467 add r3,r3,r10 @ E+=F_40_59(B,C,D)
470 bne .L_40_59 @ [+((12+5)*5+2)*4]
@ --- re-enter the shared 20_39/60_79 body for rounds 60..79, this time
@ with carry set (K presumably switched to .LK_60_79 in unseen lines).
474 cmp sp,#0 @ set carry to denote 60_79
475 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
@ --- end of 80 rounds: drop the X[] frame, reload the previous chaining
@ state and store the updated one. NOTE(review): the add instructions
@ folding r8-r12 into r3-r7 (between the ldmia and stmia in the full
@ file) are not visible in this excerpt.
477 add sp,sp,#80*4 @ "deallocate" stack frame
478 ldmia r0,{r8,r9,r10,r11,r12}
484 stmia r0,{r3,r4,r5,r6,r7}
486 bne .Lloop @ [+18], total 1307
@ --- epilogue: two variants (presumably selected by an __ARM_ARCH__ #if
@ in the full file): pop straight into pc, or pop into lr and return via
@ bx lr — the .word 0xe12fff1e below is the bx lr encoding, emitted as
@ data so pre-Thumb assemblers still accept it.
489 ldmia sp!,{r4-r12,pc}
491 ldmia sp!,{r4-r12,lr}
493 moveq pc,lr @ be binary compatible with V4, yet
494 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
@ Literal pool: the four SHA-1 round constants (FIPS 180-4 K values).
497 .LK_00_19: .word 0x5a827999
498 .LK_20_39: .word 0x6ed9eba1
499 .LK_40_59: .word 0x8f1bbcdc
500 .LK_60_79: .word 0xca62c1d6
501 .size sha1_block_data_order,.-sha1_block_data_order
502 .asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"