Correct PPTP server firewall rules chain.
[tomato/davidwu.git] / release / src / router / nettle / arm / memxor.asm
blob 33f672c6d641f8213a0c22f146dbf8af87174162
1 C -*- mode: asm; asm-comment-char: ?C; -*-
2 C nettle, low-level cryptographics library
4 C Copyright (C) 2013, Niels Möller
6 C The nettle library is free software; you can redistribute it and/or modify
7 C it under the terms of the GNU Lesser General Public License as published by
8 C the Free Software Foundation; either version 2.1 of the License, or (at your
9 C option) any later version.
11 C The nettle library is distributed in the hope that it will be useful, but
12 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 C License for more details.
16 C You should have received a copy of the GNU Lesser General Public License
17 C along with the nettle library; see the file COPYING.LIB. If not, write to
18 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 C MA 02111-1301, USA.
21 C Possible speedups:
23 C The ldm instruction can do load two registers per cycle,
24 C if the address is two-word aligned. Or three registers in two
25 C cycles, regardless of alignment.
27 C Register usage:
29 define(<DST>, <r0>)
30 define(<SRC>, <r1>)
31 define(<N>, <r2>)
32 define(<CNT>, <r6>)
33 define(<TNC>, <r12>)
35 .syntax unified
37 .file "memxor.asm"
39 .text
40 .arm
42 C memxor(uint8_t *dst, const uint8_t *src, size_t n)
C
C In:  DST = r0 (dst), SRC = r1 (src), N = r2 (byte count n).
C Computes dst[i] ^= src[i] for 0 <= i < n, front to back.
C Strategy: plain byte loop for n < 7; otherwise align DST to a word
C boundary, then either a 3-word software-pipelined ldm/stm loop (SRC and
C DST equally aligned) or a shift-and-combine word loop (SRC misaligned
C relative to DST).  Temporaries: r3, r12 always; r4-r6 (saved/restored)
C on the misaligned path; r4-r8, r10-r12, r14 (saved/restored) on the
C aligned path.
C NOTE(review): DST (r0) is advanced as we go, so r0 at return does not
C hold the original dst pointer -- confirm against memxor's C prototype.
43 .align 4
44 PROLOGUE(memxor)
45 cmp N, #0
46 beq .Lmemxor_done
C Unsigned compare: carry set means N >= 7, large enough for word code.
48 cmp N, #7
49 bcs .Lmemxor_large
51 C Simple byte loop
C Used both for small n and for sub-word leftovers from the word paths
C (N holds the remaining byte count whenever we branch here).
52 .Lmemxor_bytes:
53 ldrb r3, [SRC], #+1
54 ldrb r12, [DST]
55 eor r3, r12
56 strb r3, [DST], #+1
57 subs N, #1
58 bne .Lmemxor_bytes
60 .Lmemxor_done:
61 bx lr
C One byte per iteration until DST is word aligned; entered from the
C alignment test at .Lmemxor_large below, which loops back here while
C DST & 3 != 0.  N >= 7 on entry, so running at most 3 byte steps is safe.
63 .Lmemxor_align_loop:
64 ldrb r3, [SRC], #+1
65 ldrb r12, [DST]
66 eor r3, r12
67 strb r3, [DST], #+1
68 sub N, #1
70 .Lmemxor_large:
71 tst DST, #3
72 bne .Lmemxor_align_loop
74 C We have at least 4 bytes left to do here.
C From here on N is kept biased by -4 so that the carry from later
C subs/adds directly tells whether a full word (or pair) remains.
75 sub N, #4
C r3 = SRC & 3: zero means SRC now has the same word alignment as DST.
77 ands r3, SRC, #3
78 beq .Lmemxor_same
80 C Different alignment case.
81 C v original SRC
82 C +-------+------+
83 C |SRC |SRC+4 |
84 C +---+---+------+
85 C |DST |
86 C +-------+
88 C With little-endian, we need to do
89 C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
91 push {r4,r5,r6}
C CNT = 8 * (SRC & 3) = bit shift for the low part; TNC = 32 - CNT for
C the high part; then round SRC down to a word boundary.
93 lsl CNT, r3, #3
94 bic SRC, #3
95 rsb TNC, CNT, #32
97 ldr r4, [SRC], #+4
C The loop below is unrolled 2x, alternating the roles of r4 and r5.
C Enter at the half matching the parity of the word count (IT block is
C fine in .arm unified syntax; conditions apply to the next two insns).
99 tst N, #4
100 itet eq
101 moveq r5, r4
102 subne N, #4
103 beq .Lmemxor_odd
105 .Lmemxor_word_loop:
106 ldr r5, [SRC], #+4
107 ldr r3, [DST]
108 eor r3, r3, r4, lsr CNT
109 eor r3, r3, r5, lsl TNC
110 str r3, [DST], #+4
111 .Lmemxor_odd:
112 ldr r4, [SRC], #+4
113 ldr r3, [DST]
114 eor r3, r3, r5, lsr CNT
115 eor r3, r3, r4, lsl TNC
116 str r3, [DST], #+4
C Carry set while at least two more words (8 bytes, biased) remain.
117 subs N, #8
118 bcs .Lmemxor_word_loop
119 adds N, #8
120 beq .Lmemxor_odd_done
122 C We have TNC/8 left-over bytes in r4, high end
123 lsr r4, CNT
124 ldr r3, [DST]
125 eor r3, r4
127 pop {r4,r5,r6}
129 C Store bytes, one by one.
C Shift r3 down a byte at a time; when the TNC bytes run out, any
C remaining count is handled by the generic byte loop.
130 .Lmemxor_leftover:
131 strb r3, [DST], #+1
132 subs N, #1
133 beq .Lmemxor_done
134 subs TNC, #8
135 lsr r3, #8
136 bne .Lmemxor_leftover
137 b .Lmemxor_bytes
138 .Lmemxor_odd_done:
139 pop {r4,r5,r6}
140 bx lr
C Same-alignment case: both pointers word aligned.  Software-pipelined
C three-words-per-iteration loop; r14 trails DST as a second read
C pointer so loads for the next iteration overlap this iteration's xors.
142 .Lmemxor_same:
143 push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
C N is biased by -4 on entry; after this subs it is bytes-remaining - 12.
145 subs N, #8
146 bcc .Lmemxor_same_end
148 ldmia SRC!, {r3, r4, r5}
149 C Keep address for loads in r14
150 mov r14, DST
151 ldmia r14!, {r6, r7, r8}
152 subs N, #12
153 eor r10, r3, r6
154 eor r11, r4, r7
155 eor r12, r5, r8
156 bcc .Lmemxor_same_final_store
157 subs N, #12
158 ldmia r14!, {r6, r7, r8}
159 bcc .Lmemxor_same_wind_down
161 C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
162 C loop starts at offset 0x11c in the object file.
164 .Lmemxor_same_loop:
165 C r10-r12 contains values to be stored at DST
166 C r6-r8 contains values read from r14, in advance
167 ldmia SRC!, {r3, r4, r5}
168 subs N, #12
169 stmia DST!, {r10, r11, r12}
170 eor r10, r3, r6
171 eor r11, r4, r7
172 eor r12, r5, r8
173 ldmia r14!, {r6, r7, r8}
174 bcs .Lmemxor_same_loop
176 .Lmemxor_same_wind_down:
177 C Wind down code
C Drain the pipeline: store the in-flight results, compute the last set.
178 ldmia SRC!, {r3, r4, r5}
179 stmia DST!, {r10, r11, r12}
180 eor r10, r3, r6
181 eor r11, r4, r7
182 eor r12, r5, r8
183 .Lmemxor_same_final_store:
184 stmia DST!, {r10, r11, r12}
186 .Lmemxor_same_end:
187 C We have 0-11 bytes left to do, and N holds number of bytes -12.
188 adds N, #4
189 bcc .Lmemxor_same_lt_8
190 C Do 8 bytes more, leftover is in N
C beq below still tests the adds flags: pop/ldm/eor (no s) leave them.
191 ldmia SRC!, {r3, r4}
192 ldmia DST, {r6, r7}
193 eor r3, r6
194 eor r4, r7
195 stmia DST!, {r3, r4}
196 pop {r4,r5,r6,r7,r8,r10,r11,r14}
197 beq .Lmemxor_done
198 b .Lmemxor_bytes
200 .Lmemxor_same_lt_8:
201 pop {r4,r5,r6,r7,r8,r10,r11,r14}
202 adds N, #4
203 bcc .Lmemxor_same_lt_4
C Exactly one more whole word, then 0-3 bytes for the byte loop.
205 ldr r3, [SRC], #+4
206 ldr r12, [DST]
207 eor r3, r12
208 str r3, [DST], #+4
209 beq .Lmemxor_done
210 b .Lmemxor_bytes
212 .Lmemxor_same_lt_4:
C Undo the remaining bias; 0 means done, else 1-3 bytes remain.
213 adds N, #4
214 beq .Lmemxor_done
215 b .Lmemxor_bytes
217 EPILOGUE(memxor)
C Register assignments for memxor3 (replacing the memxor set above).
219 define(<DST>, <r0>)
220 define(<AP>, <r1>)
221 define(<BP>, <r2>)
222 define(<N>, <r3>)
223 undefine(<CNT>)
224 undefine(<TNC>)
226 C Temporaries r4-r7
C ACNT/BCNT: bit-shift counts (8 * pointer misalignment) for AP and BP;
C ATNC/BTNC: their 32-complements, for combining adjacent words.
227 define(<ACNT>, <r8>)
228 define(<ATNC>, <r10>)
229 define(<BCNT>, <r11>)
230 define(<BTNC>, <r12>)
232 C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
C
C Computes dst[i] = a[i] ^ b[i] for 0 <= i < n.  All three pointers are
C first advanced past the end and the buffers are processed high-to-low
C with pre-decrement addressing (presumably so dst may overlap the low
C end of a source -- confirm against memxor3's documented contract).
C Paths: byte loop (n < 7); DST aligned with a/b equally aligned
C (ldmdb/stmdb word loop); one source aligned (.Lmemxor3_au); both
C sources unaligned the same way (.Lmemxor3_uu) or differently
C (.Lmemxor3_uud).
233 .align 2
234 PROLOGUE(memxor3)
C N == 0 returns via .Lmemxor3_ret, skipping the pop (nothing pushed yet).
235 cmp N, #0
236 beq .Lmemxor3_ret
237 push {r4,r5,r6,r7,r8,r10,r11}
C Compare before the adds (which would clobber the flags): carry set
C means N >= 7, large enough for the word-oriented paths.
238 cmp N, #7
C Point all three registers one past the end; processing runs downward.
241 add AP, N
242 add BP, N
243 add DST, N
245 bcs .Lmemxor3_large
247 C Simple byte loop
C Also the final catch-all for sub-word leftovers from the word paths.
248 .Lmemxor3_bytes:
249 ldrb r4, [AP, #-1]!
250 ldrb r5, [BP, #-1]!
251 eor r4, r5
252 strb r4, [DST, #-1]!
253 subs N, #1
254 bne .Lmemxor3_bytes
256 .Lmemxor3_done:
257 pop {r4,r5,r6,r7,r8,r10,r11}
258 .Lmemxor3_ret:
259 bx lr
C One byte at a time from the high end until DST is word aligned;
C entered via the test at .Lmemxor3_large below.
261 .Lmemxor3_align_loop:
262 ldrb r4, [AP, #-1]!
263 ldrb r5, [BP, #-1]!
264 eor r5, r4
265 strb r5, [DST, #-1]!
266 sub N, #1
268 .Lmemxor3_large:
269 tst DST, #3
270 bne .Lmemxor3_align_loop
272 C We have at least 4 bytes left to do here.
C N stays biased by -4 from here on, so carry from later subs/adds
C reflects whether a full word (or pair) remains.
273 sub N, #4
C ACNT = 8 * (AP & 3); zero means AP shares DST's word alignment.
274 ands ACNT, AP, #3
275 lsl ACNT, #3
276 beq .Lmemxor3_a_aligned
277 C BCNT likewise for BP; both nonzero -> both sources unaligned.
278 ands BCNT, BP, #3
279 lsl BCNT, #3
280 bne .Lmemxor3_uu
282 C Swap
C Here AP is unaligned but BP is aligned: swap them so the aligned
C operand is AP for the .Lmemxor3_au path.  ACNT already holds the
C shift count of the (new) unaligned BP, as the NOTE below says.
283 mov r4, AP
284 mov AP, BP
285 mov BP, r4
287 .Lmemxor3_au:
288 C NOTE: We have the relevant shift count in ACNT, not BCNT
290 C AP is aligned, BP is not
291 C v original SRC
292 C +-------+------+
293 C |SRC-4 |SRC |
294 C +---+---+------+
295 C |DST-4 |
296 C +-------+
298 C With little-endian, we need to do
299 C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
300 rsb ATNC, ACNT, #32
301 bic BP, #3
303 ldr r4, [BP]
C 2x-unrolled loop alternating r4/r5; enter at the half matching the
C word-count parity (IT block valid in .arm unified syntax).
305 tst N, #4
306 itet eq
307 moveq r5, r4
308 subne N, #4
309 beq .Lmemxor3_au_odd
311 .Lmemxor3_au_loop:
312 ldr r5, [BP, #-4]!
313 ldr r6, [AP, #-4]!
314 eor r6, r6, r4, lsl ATNC
315 eor r6, r6, r5, lsr ACNT
316 str r6, [DST, #-4]!
317 .Lmemxor3_au_odd:
318 ldr r4, [BP, #-4]!
319 ldr r6, [AP, #-4]!
320 eor r6, r6, r5, lsl ATNC
321 eor r6, r6, r4, lsr ACNT
322 str r6, [DST, #-4]!
323 subs N, #8
324 bcs .Lmemxor3_au_loop
325 adds N, #8
326 beq .Lmemxor3_done
328 C Leftover bytes in r4, low end
329 ldr r5, [AP, #-4]
330 eor r4, r5, r4, lsl ATNC
332 .Lmemxor3_au_leftover:
333 C Store a byte at a time
C ror #24 brings the next byte (bits 31:24 after rotation) into the low
C byte for strb; AP is stepped back so the byte loop resumes correctly.
334 ror r4, #24
335 strb r4, [DST, #-1]!
336 subs N, #1
337 beq .Lmemxor3_done
338 subs ACNT, #8
339 sub AP, #1
340 bne .Lmemxor3_au_leftover
341 b .Lmemxor3_bytes
343 .Lmemxor3_a_aligned:
C AP aligned; if BP is not, reuse the au path with ACNT = BP's shift.
344 ands ACNT, BP, #3
345 lsl ACNT, #3
346 bne .Lmemxor3_au ;
348 C a, b and dst all have the same alignment.
349 subs N, #8
350 bcc .Lmemxor3_aligned_word_end
352 C This loop runs at 8 cycles per iteration. It has been
353 C observed running at only 7 cycles, for this speed, the loop
354 C started at offset 0x2ac in the object file.
356 C FIXME: consider software pipelining, similarly to the memxor
357 C loop.
C Three words per iteration, descending (ldmdb/stmdb).
359 .Lmemxor3_aligned_word_loop:
360 ldmdb AP!, {r4,r5,r6}
361 ldmdb BP!, {r7,r8,r10}
362 subs N, #12
363 eor r4, r7
364 eor r5, r8
365 eor r6, r10
366 stmdb DST!, {r4, r5,r6}
367 bcs .Lmemxor3_aligned_word_loop
369 .Lmemxor3_aligned_word_end:
370 C We have 0-11 bytes left to do, and N holds number of bytes -12.
371 adds N, #4
372 bcc .Lmemxor3_aligned_lt_8
373 C Do 8 bytes more, leftover is in N
C beq below still sees the adds flags (ldm/eor without s, stm preserve).
374 ldmdb AP!, {r4, r5}
375 ldmdb BP!, {r6, r7}
376 eor r4, r6
377 eor r5, r7
378 stmdb DST!, {r4,r5}
379 beq .Lmemxor3_done
380 b .Lmemxor3_bytes
382 .Lmemxor3_aligned_lt_8:
383 adds N, #4
384 bcc .Lmemxor3_aligned_lt_4
C One more whole word, then 0-3 bytes via the byte loop.
386 ldr r4, [AP,#-4]!
387 ldr r5, [BP,#-4]!
388 eor r4, r5
389 str r4, [DST,#-4]!
390 beq .Lmemxor3_done
391 b .Lmemxor3_bytes
393 .Lmemxor3_aligned_lt_4:
C Undo the remaining bias; 0 means done, else 1-3 bytes remain.
394 adds N, #4
395 beq .Lmemxor3_done
396 b .Lmemxor3_bytes
398 .Lmemxor3_uu:
C Both sources unaligned.  Compare the shift counts before rounding the
C pointers down; equal counts use the cheaper same-misalignment loop.
400 cmp ACNT, BCNT
401 bic AP, #3
402 bic BP, #3
403 rsb ATNC, ACNT, #32
405 bne .Lmemxor3_uud
407 C AP and BP are unaligned in the same way
C XOR whole (rounded-down) words first, then shift the combined value:
C one shift pair per output word instead of two.
409 ldr r4, [AP]
410 ldr r6, [BP]
411 eor r4, r6
413 tst N, #4
414 itet eq
415 moveq r5, r4
416 subne N, #4
417 beq .Lmemxor3_uu_odd
419 .Lmemxor3_uu_loop:
420 ldr r5, [AP, #-4]!
421 ldr r6, [BP, #-4]!
422 eor r5, r6
423 lsl r4, ATNC
424 eor r4, r4, r5, lsr ACNT
425 str r4, [DST, #-4]!
426 .Lmemxor3_uu_odd:
427 ldr r4, [AP, #-4]!
428 ldr r6, [BP, #-4]!
429 eor r4, r6
430 lsl r5, ATNC
431 eor r5, r5, r4, lsr ACNT
432 str r5, [DST, #-4]!
433 subs N, #8
434 bcs .Lmemxor3_uu_loop
435 adds N, #8
436 beq .Lmemxor3_done
438 C Leftover bytes in a4, low end
C Rotate so the leftover bytes sit at the top of r4, then emit them one
C byte per iteration like the au leftover loop.
439 ror r4, ACNT
440 .Lmemxor3_uu_leftover:
441 ror r4, #24
442 strb r4, [DST, #-1]!
443 subs N, #1
444 beq .Lmemxor3_done
445 subs ACNT, #8
446 bne .Lmemxor3_uu_leftover
447 b .Lmemxor3_bytes
449 .Lmemxor3_uud:
450 C Both AP and BP unaligned, and in different ways
C Each output word combines two words from each source: four shifted
C xors per store.  r4/r6 (and r5/r7) carry the previous words across
C iterations of the 2x-unrolled loop.
451 rsb BTNC, BCNT, #32
453 ldr r4, [AP]
454 ldr r6, [BP]
456 tst N, #4
457 ittet eq
458 moveq r5, r4
459 moveq r7, r6
460 subne N, #4
461 beq .Lmemxor3_uud_odd
463 .Lmemxor3_uud_loop:
464 ldr r5, [AP, #-4]!
465 ldr r7, [BP, #-4]!
466 lsl r4, ATNC
467 eor r4, r4, r6, lsl BTNC
468 eor r4, r4, r5, lsr ACNT
469 eor r4, r4, r7, lsr BCNT
470 str r4, [DST, #-4]!
471 .Lmemxor3_uud_odd:
472 ldr r4, [AP, #-4]!
473 ldr r6, [BP, #-4]!
474 lsl r5, ATNC
475 eor r5, r5, r7, lsl BTNC
476 eor r5, r5, r4, lsr ACNT
477 eor r5, r5, r6, lsr BCNT
478 str r5, [DST, #-4]!
479 subs N, #8
480 bcs .Lmemxor3_uud_loop
481 adds N, #8
482 beq .Lmemxor3_done
484 C FIXME: More clever left-over handling? For now, just adjust pointers.
C Step AP/BP back up by their misalignment (ACNT/8, BCNT/8 bytes) so the
C generic byte loop reads from the correct addresses.
485 add AP, AP, ACNT, lsr #3
486 add BP, BP, BCNT, lsr #3
487 b .Lmemxor3_bytes
488 EPILOGUE(memxor3)