C nettle, low-level cryptographic library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02110-1301, USA.
define(<CTX>, <%rdi>)
define(<LENGTH>, <%rsi>)
define(<DST>, <%rdx>)
define(<SRC>, <%rcx>)
define(<T64>, <%r8>)
define(<POS>, <%r9>)
define(<X0>, <%xmm0>)
define(<X1>, <%xmm1>)
define(<X2>, <%xmm2>)
define(<X3>, <%xmm3>)
define(<T0>, <%xmm4>)
define(<T1>, <%xmm5>)
define(<M0101>, <%xmm6>)
define(<M0110>, <%xmm7>)
define(<M0011>, <%xmm8>)
define(<COUNT>, <%rax>)

include_src(<x86_64/salsa20.m4>)

C Possible improvements:
C
C Do two blocks (or more) at a time in parallel, to avoid limitations
C due to data dependencies.
C
C Avoid redoing the permutation of the input for each block (all but
C the two counter words are constant). Could also keep the input in
C registers.
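
C For orientation, the routine as a whole is roughly equivalent to the
C following C sketch (illustration only; salsa20_core and the counter
C field are stand-in names, not nettle's actual ones):
C
C   while (length > 0) {
C     uint8_t block[64];
C     salsa20_core(block, ctx);     /* the 20-round block function */
C     ctx->counter++;               /* 64-bit counter, see below */
C     size_t n = length < 64 ? length : 64;
C     for (size_t i = 0; i < n; i++)
C       dst[i] = src[i] ^ block[i];
C     src += n; dst += n; length -= n;
C   }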

        .file "salsa20-crypt.asm"

        C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
        C               uint8_t *dst, const uint8_t *src)
        .text
        ALIGN(16)
PROLOGUE(nettle_salsa20_crypt)
        W64_ENTRY(4, 9)
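        C W64_ENTRY/W64_EXIT (defined in nettle's machine.m4) bridge the
        C Windows x64 calling convention to the unix one; the arguments
        C indicate 4 integer parameters and 9 xmm registers in use.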

        test    LENGTH, LENGTH
        jz      .Lend

        C Load mask registers
        mov     $-1, XREG(COUNT)
        movd    XREG(COUNT), M0101
        pshufd  $0x09, M0101, M0011     C 01 01 00 00
        pshufd  $0x41, M0101, M0110     C 01 00 00 01
        pshufd  $0x22, M0101, M0101     C 01 00 01 00
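
        C After these shuffles the masks select the following 32-bit
        C words (word 0 being the least significant): M0101 selects
        C words 1 and 3, M0110 words 1 and 2, and M0011 words 2 and 3.
        C SWAP (from salsa20.m4) exchanges the masked words of its two
        C xmm arguments.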

.Lblock_loop:
        movups  (CTX), X0
        movups  16(CTX), X1
        movups  32(CTX), X2
        movups  48(CTX), X3

        C On input, each xmm register is one row. We start with
        C
        C        0  1  2  3     C K K K
        C        4  5  6  7     K C I I
        C        8  9 10 11     B B C K
        C       12 13 14 15     K K K C
        C
        C Diagrams are in little-endian order, with least significant
        C word to the left. We rotate the columns, to get instead
        C
        C        0  5 10 15     C C C C
        C        4  9 14  3     K B K K
        C        8 13  2  7     B K K I
        C       12  1  6 11     K K I K
        C
        C The original rows are now diagonals.

        SWAP(X0, X1, M0101)
        SWAP(X2, X3, M0101)
        SWAP(X1, X3, M0110)
        SWAP(X0, X2, M0011)

        movl    $10, XREG(COUNT)
        ALIGN(16)
.Loop:
        QROUND(X0, X1, X2, X3)

        C For the row operations, we first rotate the rows, to get
        C
        C        0  5 10 15
        C        3  4  9 14
        C        2  7  8 13
        C        1  6 11 12
        C
        C Now the original rows are turned into columns. (This is the
        C SIMD hack described in djb's papers.)
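        C
        C For reference, QROUND(a, b, c, d) applies the standard salsa20
        C quarter round to the corresponding 32-bit lanes of its four
        C arguments (<<< denoting left rotation; the exact operand
        C scheduling lives in x86_64/salsa20.m4):
        C
        C   b ^= (a + d) <<< 7
        C   c ^= (b + a) <<< 9
        C   d ^= (c + b) <<< 13
        C   a ^= (d + c) <<< 18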

        pshufd  $0x93, X1, X1   C 11 00 01 10 (least sign. left)
        pshufd  $0x4e, X2, X2   C 10 11 00 01
        pshufd  $0x39, X3, X3   C 01 10 11 00

        QROUND(X0, X3, X2, X1)

        C Inverse rotation of the rows
        pshufd  $0x39, X1, X1   C 01 10 11 00
        pshufd  $0x4e, X2, X2   C 10 11 00 01
        pshufd  $0x93, X3, X3   C 11 00 01 10

        decl    XREG(COUNT)
        jnz     .Loop

        SWAP(X0, X2, M0011)
        SWAP(X1, X3, M0110)
        SWAP(X0, X1, M0101)
        SWAP(X2, X3, M0101)

        movups  (CTX), T0
        movups  16(CTX), T1
        paddd   T0, X0
        paddd   T1, X1
        movups  32(CTX), T0
        movups  48(CTX), T1
        paddd   T0, X2
        paddd   T1, X3

        C Increment block counter
        incq    32(CTX)
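        C (The block counter is the 64-bit little-endian value occupying
        C words 8 and 9 of the salsa20 state, hence byte offset 32 into
        C the context; one increment per 64-byte block.)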

        cmp     $64, LENGTH
        jc      .Lfinal_xor

        movups  48(SRC), T1
        pxor    T1, X3
        movups  X3, 48(DST)
.Lxor3:
        movups  32(SRC), T0
        pxor    T0, X2
        movups  X2, 32(DST)
.Lxor2:
        movups  16(SRC), T1
        pxor    T1, X1
        movups  X1, 16(DST)
.Lxor1:
        movups  (SRC), T0
        pxor    T0, X0
        movups  X0, (DST)

        lea     64(SRC), SRC
        lea     64(DST), DST
        sub     $64, LENGTH
        ja      .Lblock_loop
.Lend:
        W64_EXIT(4, 9)
        ret
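
C Final, partial block: 0 < LENGTH < 64 here. Whole 16-byte pieces of
C keystream are XORed in by jumping into the .Lxor3/.Lxor2/.Lxor1 chain
C above; the trailing 1-15 bytes, if any, are handled first by the
C .Lpartial subroutine below, with the corresponding keystream word
C left in T0.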
.Lfinal_xor:
        cmp     $32, LENGTH
        jz      .Lxor2
        jc      .Llt32
        cmp     $48, LENGTH
        jz      .Lxor3
        jc      .Llt48
        movaps  X3, T0
        call    .Lpartial
        jmp     .Lxor3
.Llt48:
        movaps  X2, T0
        call    .Lpartial
        jmp     .Lxor2
.Llt32:
        cmp     $16, LENGTH
        jz      .Lxor1
        jc      .Llt16
        movaps  X1, T0
        call    .Lpartial
        jmp     .Lxor1
.Llt16:
        movaps  X0, T0
        call    .Lpartial
        jmp     .Lend
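
C .Lpartial: subroutine XORing the final LENGTH mod 16 bytes, at byte
C offset LENGTH & -16, using the keystream bytes passed in T0.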
.Lpartial:
        mov     LENGTH, POS
        and     $-16, POS
        test    $8, LENGTH
        jz      .Llt8
        C This "movd" instruction should assemble to
        C 66 49 0f 7e e0        movq %xmm4,%r8
        C Apparently, assemblers treat movd and movq (with the
        C arguments we use) in the same way, except for osx, which
        C barfs at movq.
        movd    T0, T64
        xor     (SRC, POS), T64
        mov     T64, (DST, POS)
        lea     8(POS), POS
        pshufd  $0xee, T0, T0   C 10 11 10 11
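        C The pshufd above copies the high 64 bits of T0 into the low
        C half, so the next movd/movq picks up keystream bytes 8-15.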
.Llt8:
        C And this is also really a movq.
        movd    T0, T64
        test    $4, LENGTH
        jz      .Llt4
        mov     XREG(T64), XREG(COUNT)
        xor     (SRC, POS), XREG(COUNT)
        mov     XREG(COUNT), (DST, POS)
        lea     4(POS), POS
        shr     $32, T64
.Llt4:
        test    $2, LENGTH
        jz      .Llt2
        mov     WREG(T64), WREG(COUNT)
        xor     (SRC, POS), WREG(COUNT)
        mov     WREG(COUNT), (DST, POS)
        lea     2(POS), POS
        shr     $16, XREG(T64)
.Llt2:
        test    $1, LENGTH
        jz      .Lret
        xor     (SRC, POS), LREG(T64)
        mov     LREG(T64), (DST, POS)

.Lret:
        ret
EPILOGUE(nettle_salsa20_crypt)