1 C -*- mode: asm; asm-comment-char: ?C; -*-
2 C nettle
, low-level cryptographics library
4 C Copyright
(C
) 2013, Niels Möller
6 C The nettle library is free software
; you can redistribute it and/or modify
7 C it under the terms of the GNU Lesser General
Public License as published by
8 C the Free Software Foundation
; either version 2.1 of the License, or (at your
9 C option
) any later version.
11 C The nettle library is distributed
in the hope that it will be useful
, but
12 C WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
13 C
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General
Public
14 C License for more details.
16 C You should have received a copy of the GNU Lesser General
Public License
17 C along with the nettle library
; see the file COPYING.LIB. If not, write to
18 C the Free Software Foundation
, Inc.
, 51 Franklin Street
, Fifth Floor
, Boston
,
23 C The ldm instruction can load two registers per cycle,
24 C if the address is two-word aligned, or three registers in two
25 C cycles, regardless of alignment.
42 C memxor(uint8_t *dst, const uint8_t *src, size_t n)
72 bne .Lmemxor_align_loop
74 C We have at least
4 bytes left to do here.
80 C Different alignment case.
88 C With little-endian, we need to do
89 C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
108 eor r3
, r3
, r4
, lsr CNT
109 eor r3
, r3
, r5
, lsl TNC
114 eor r3
, r3
, r5
, lsr CNT
115 eor r3
, r3
, r4
, lsl TNC
118 bcs .Lmemxor_word_loop
120 beq .Lmemxor_odd_done
122 C We have TNC/8 left-over bytes in r4, high end
129 C Store bytes
, one by one.
136 bne .Lmemxor_leftover
143 push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
146 bcc .Lmemxor_same_end
148 ldmia SRC
!, {r3, r4, r5}
149 C Keep address for loads
in r14
151 ldmia r14
!, {r6, r7, r8}
156 bcc .Lmemxor_same_final_store
158 ldmia r14
!, {r6, r7, r8}
159 bcc .Lmemxor_same_wind_down
161 C 6 cycles per iteration, 0.50 cycles/byte. For this speed, the
162 C loop starts at offset 0x11c in the object file.
165 C r10-r12 contain values to be stored at DST
166 C r6-r8 contain values read from r14, in advance
167 ldmia SRC
!, {r3, r4, r5}
169 stmia DST
!, {r10, r11, r12}
173 ldmia r14
!, {r6, r7, r8}
174 bcs .Lmemxor_same_loop
176 .
Lmemxor_same_wind_down:
178 ldmia SRC
!, {r3, r4, r5}
179 stmia DST
!, {r10, r11, r12}
183 .
Lmemxor_same_final_store:
184 stmia DST
!, {r10, r11, r12}
187 C We have 0-11 bytes left to do, and N holds the number of bytes minus 12.
189 bcc .Lmemxor_same_lt_8
190 C Do
8 bytes more
, leftover is
in N
196 pop {r4,r5,r6,r7,r8,r10,r11,r14}
201 pop {r4,r5,r6,r7,r8,r10,r11,r14}
203 bcc .Lmemxor_same_lt_4
C Register aliases: the define() macros below make ATNC, BCNT and BTNC
C names for r10, r11 and r12. They appear further down as the
C register-specified shift amounts (lsl/lsr) of the eor instructions in
C the memxor3 code. ACNT is used the same way, but its define is not
C visible in this excerpt.
C NOTE(review): each xTNC presumably holds 32 minus the matching xCNT -- confirm.
228 define
(<ATNC
>, <r10
>)
229 define
(<BCNT
>, <r11
>)
230 define
(<BTNC
>, <r12
>)
232 C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
238 push {r4,r5,r6,r7,r8,r10,r11}
257 pop {r4,r5,r6,r7,r8,r10,r11}
261 .
Lmemxor3_align_loop:
270 bne .Lmemxor3_align_loop
272 C We have at least
4 bytes left to do here.
276 beq .Lmemxor3_a_aligned
288 C NOTE: We have the relevant shift count in ACNT, not BCNT
290 C AP is aligned, BP is not
298 C With little-endian, we need to do
299 C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
314 eor r6
, r6
, r4
, lsl ATNC
315 eor r6
, r6
, r5
, lsr ACNT
320 eor r6
, r6
, r5
, lsl ATNC
321 eor r6
, r6
, r4
, lsr ACNT
324 bcs .Lmemxor3_au_loop
328 C Leftover bytes in r4, low end
330 eor r4
, r5
, r4
, lsl ATNC
332 .
Lmemxor3_au_leftover:
333 C Store a
byte at a time
340 bne .Lmemxor3_au_leftover
348 C a
, b
and dst all have the same alignment.
350 bcc .Lmemxor3_aligned_word_end
352 C This loop runs at 8 cycles per iteration. It has been
353 C observed running at only 7 cycles; for this speed, the loop
354 C started at offset 0x2ac in the object file.
356 C FIXME: consider software pipelining, similarly to the memxor
359 .
Lmemxor3_aligned_word_loop:
360 ldmdb AP
!, {r4,r5,r6}
361 ldmdb
BP!, {r7,r8,r10}
366 stmdb DST
!, {r4, r5,r6}
367 bcs .Lmemxor3_aligned_word_loop
369 .
Lmemxor3_aligned_word_end:
370 C We have 0-11 bytes left to do, and N holds the number of bytes minus 12.
372 bcc .Lmemxor3_aligned_lt_8
373 C Do
8 bytes more
, leftover is
in N
382 .
Lmemxor3_aligned_lt_8:
384 bcc .Lmemxor3_aligned_lt_4
393 .
Lmemxor3_aligned_lt_4:
407 C AP
and BP are unaligned
in the same way
424 eor r4
, r4
, r5
, lsr ACNT
431 eor r5
, r5
, r4
, lsr ACNT
434 bcs .Lmemxor3_uu_loop
438 C Leftover bytes in r4, low end
440 .
Lmemxor3_uu_leftover:
446 bne .Lmemxor3_uu_leftover
450 C Both AP
and BP unaligned
, and in different ways
461 beq .Lmemxor3_uud_odd
467 eor r4
, r4
, r6
, lsl BTNC
468 eor r4
, r4
, r5
, lsr ACNT
469 eor r4
, r4
, r7
, lsr BCNT
475 eor r5
, r5
, r7
, lsl BTNC
476 eor r5
, r5
, r4
, lsr ACNT
477 eor r5
, r5
, r6
, lsr BCNT
480 bcs .Lmemxor3_uud_loop
484 C FIXME: More clever left-over handling? For now, just adjust pointers.
485 add AP
, AP
, ACNT
, lsr #
3
486 add BP, BP, BCNT
, lsr #
3