C nettle, low-level cryptographic library
C
C Copyright (C) 2010, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

C Register usage:
define(<DST>, <%rax>) C Originally in %rdi
define(<AP>, <%rsi>)
define(<BP>, <%rdx>)
define(<N>, <%r10>)
define(<TMP>, <%r8>)
define(<TMP2>, <%r9>)
define(<CNT>, <%rdi>)
define(<S0>, <%r11>)
define(<S1>, <%rdi>) C Overlaps with CNT
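C S1 overlapping CNT is safe: CNT is live only until the tail is
C 8-byte aligned, and S1 is used only in the shift loops after that.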

define(<USE_SSE2>, <no>)

        .file "memxor.asm"

        .text

        C memxor(uint8_t *dst, const uint8_t *src, size_t n)
        C                 %rdi               %rsi      %rdx
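
        C Rough C sketch of the semantics; like the code below, it
        C processes the buffers from the end towards the start
        C (memxor returns dst, and is implemented by jumping into
        C memxor3 with b = dst):
        C
        C   uint8_t *
        C   memxor(uint8_t *dst, const uint8_t *src, size_t n)
        C   {
        C     while (n-- > 0)
        C       dst[n] ^= src[n];
        C     return dst;
        C   }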
        ALIGN(16)

PROLOGUE(memxor)
        W64_ENTRY(3, 0)
        mov     %rdx, %r10
        mov     %rdi, %rdx
        jmp     .Lmemxor3_entry
EPILOGUE(memxor)

        C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
        C                  %rdi              %rsi              %rdx      %rcx
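
        C Rough C sketch of the semantics; like the code below, it
        C processes the buffers from the end towards the start:
        C
        C   uint8_t *
        C   memxor3(uint8_t *dst, const uint8_t *a,
        C           const uint8_t *b, size_t n)
        C   {
        C     while (n-- > 0)
        C       dst[n] = a[n] ^ b[n];
        C     return dst;
        C   }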
        ALIGN(16)

PROLOGUE(memxor3)
        W64_ENTRY(4, 0)
        C %cl needed for shift count, so move away N
        mov     %rcx, N
.Lmemxor3_entry:
        test    N, N
        C Get number of unaligned bytes at the end
        C %rdi is used as CNT, %rax as DST and as return value
        mov     %rdi, %rax
        jz      .Ldone
        add     N, CNT
        and     $7, CNT

        jz      .Laligned
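
        C CNT = (DST + N) & 7 counts the bytes above the last
        C 8-byte boundary; e.g. DST = 0x1003, N = 22 gives
        C DST + N = 0x1019, so CNT = 1 byte is XORed bytewise in
        C .Lalign_loop before the word loops take over.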

        cmp     $8, N
        jc      .Lfinal_next

        C FIXME: Instead of this loop, could try cmov with memory
        C destination, as a sequence of one 8-bit, one 16-bit and one
        C 32-bit operations. (Except that cmov can't do 8-bit ops, so
        C that step has to use a conditional).
.Lalign_loop:
        sub     $1, N
        movb    (AP, N), LREG(TMP)
        xorb    (BP, N), LREG(TMP)
        movb    LREG(TMP), (DST, N)
        sub     $1, CNT
        jnz     .Lalign_loop

.Laligned:
ifelse(USE_SSE2, yes, <
        cmp     $16, N
        jnc     .Lsse2_case
>)
        C Check for the case that AP and BP have the same alignment,
        C but different from DST.
        mov     AP, TMP
        sub     BP, TMP
        test    $7, TMP
        jnz     .Lno_shift_case
        mov     AP, %rcx
        sub     DST, %rcx
        and     $7, %rcx
        jz      .Lno_shift_case
        sub     %rcx, AP
        sub     %rcx, BP
        shl     $3, %rcx
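
        C AP and BP are now aligned down to 8-byte boundaries and
        C %rcx holds the misalignment converted to bits. Each
        C destination word is then assembled from two adjacent
        C aligned source words, roughly:
        C
        C   dst_word = (w_lo >> cnt) | (w_hi << (64 - cnt));
        C
        C where w_lo and w_hi are the XOR of the corresponding
        C a and b words.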

        C Unrolling, with aligned values alternating in S0 and S1
        test    $8, N
        jnz     .Lshift_odd
        mov     (AP, N), S1
        xor     (BP, N), S1
        jmp     .Lshift_next

.Lshift_odd:
        mov     -8(AP, N), S1
        mov     (AP, N), S0
        xor     -8(BP, N), S1
        xor     (BP, N), S0
        mov     S1, TMP
        shr     %cl, TMP
        neg     %cl
        shl     %cl, S0
        neg     %cl

        or      S0, TMP
        mov     TMP, -8(DST, N)
        sub     $8, N
        jz      .Ldone
        jmp     .Lshift_next
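
        C The neg %cl trick used above and in .Lshift_loop: x86
        C shift counts are masked to 6 bits, so shifting by -cnt
        C acts as shifting by 64 - cnt; the second neg restores
        C the original count in %cl.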

        ALIGN(16)

.Lshift_loop:
        mov     8(AP, N), S0
        xor     8(BP, N), S0
        mov     S0, TMP
        shr     %cl, TMP
        neg     %cl
        shl     %cl, S1
        neg     %cl
        or      S1, TMP
        mov     TMP, 8(DST, N)

        mov     (AP, N), S1
        xor     (BP, N), S1
        mov     S1, TMP
        shr     %cl, TMP
        neg     %cl
        shl     %cl, S0
        neg     %cl
        or      S0, TMP
        mov     TMP, (DST, N)
.Lshift_next:
        sub     $16, N
        C FIXME: Handle the case N == 16 specially,
        C like in the non-shifted case?
        C       ja      .Lshift_loop
        C       jz      .Ldone
        jnc     .Lshift_loop

        add     $15, N
        jnc     .Ldone
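
        C Convert %rcx from a bit count back to bytes and move AP
        C and BP back up, so .Lfinal_loop reads the original
        C unaligned source addresses for the remaining 1-15 bytes.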

        shr     $3, %rcx
        add     %rcx, AP
        add     %rcx, BP
        jmp     .Lfinal_loop

.Lno_shift_case:
        C Next destination word is -8(DST, N)
        C Setup for unrolling
        test    $8, N
        jz      .Lword_next

        sub     $8, N
        jz      .Lone_word

        mov     (AP, N), TMP
        xor     (BP, N), TMP
        mov     TMP, (DST, N)

        jmp     .Lword_next

        ALIGN(16)

.Lword_loop:
        mov     8(AP, N), TMP
        mov     (AP, N), TMP2
        xor     8(BP, N), TMP
        xor     (BP, N), TMP2
        mov     TMP, 8(DST, N)
        mov     TMP2, (DST, N)

.Lword_next:
        sub     $16, N
        ja      .Lword_loop     C Not zero and no carry
        jnz     .Lfinal
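
        C ZF after the sub means exactly 16 bytes remain; they are
        C handled as two aligned words below. Otherwise CF is set
        C and the 1-15 lowest-addressed bytes remain, done bytewise
        C at .Lfinal.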

        C Final operation is word aligned
        mov     8(AP, N), TMP
        xor     8(BP, N), TMP
        mov     TMP, 8(DST, N)

.Lone_word:
        mov     (AP, N), TMP
        xor     (BP, N), TMP
        mov     TMP, (DST, N)

        C ENTRY might have been 3 args, too, but it doesn't matter for the exit
        W64_EXIT(4, 0)
        ret

.Lfinal:
        add     $15, N

.Lfinal_loop:
        movb    (AP, N), LREG(TMP)
        xorb    (BP, N), LREG(TMP)
        movb    LREG(TMP), (DST, N)
.Lfinal_next:
        sub     $1, N
        jnc     .Lfinal_loop

.Ldone:
        C ENTRY might have been 3 args, too, but it doesn't matter for the exit
        W64_EXIT(4, 0)
        ret

ifelse(USE_SSE2, yes, <

.Lsse2_case:
        lea     (DST, N), TMP
        test    $8, TMP
        jz      .Lsse2_next
        sub     $8, N
        mov     (AP, N), TMP
        xor     (BP, N), TMP
        mov     TMP, (DST, N)
        jmp     .Lsse2_next
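
        C DST + N is 8-byte aligned on entry here; the single word
        C above makes it 16-byte aligned when needed, so the movdqa
        C stores below always hit 16-byte boundaries.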

        ALIGN(16)
.Lsse2_loop:
        movdqu  (AP, N), %xmm0
        movdqu  (BP, N), %xmm1
        pxor    %xmm0, %xmm1
        movdqa  %xmm1, (DST, N)
.Lsse2_next:
        sub     $16, N
        ja      .Lsse2_loop

        C FIXME: See if we can do a full word first, before the
        C byte-wise final loop.
        jnz     .Lfinal

        C Final operation is aligned
        movdqu  (AP), %xmm0
        movdqu  (BP), %xmm1
        pxor    %xmm0, %xmm1
        movdqa  %xmm1, (DST)
        C ENTRY might have been 3 args, too, but it doesn't matter for the exit
        W64_EXIT(4, 0)
        ret
>)

EPILOGUE(memxor3)