C x86_64/sha3-permute.asm, from the nettle library
C (vendored copy under release/src/router/nettle/ in the Tomato router tree).
C nettle, low-level cryptographic library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
20 define(<CTX>, <%rdi>) C 25 64-bit values, 200 bytes.
21 define(<COUNT>, <%r8>) C Avoid clobbering %rsi, for W64.
23 define(<A00>, <%rax>)
24 define(<A0102>, <%xmm0>)
25 define(<A0304>, <%xmm1>)
27 define(<A05>, <%rcx>)
28 define(<A0607>, <%xmm2>)
29 define(<A0809>, <%xmm3>)
31 define(<A10>, <%rdx>)
32 define(<A1112>, <%xmm4>)
33 define(<A1314>, <%xmm5>)
35 define(<A15>, <%rbp>)
36 define(<A1617>, <%xmm6>)
37 define(<A1819>, <%xmm7>)
39 define(<A20>, <%r9>)
40 define(<A2122>, <%xmm8>)
41 define(<A2324>, <%xmm9>)
43 define(<C0>, <%r10>)
44 define(<C12>, <%xmm10>)
45 define(<C34>, <%xmm11>)
47 define(<D0>, <%r11>)
48 define(<D12>, <%xmm12>)
49 define(<D34>, <%xmm13>)
51 C Wide temporaries
52 define(<W0>, <%xmm14>)
53 define(<W1>, <%xmm15>)
54 define(<W2>, <%xmm12>) C Overlap D12
55 define(<W3>, <%xmm13>) C Overlap D34
57 define(<T0>, <%r12>)
58 define(<T1>, <%r13>)
59 define(<T2>, <%r11>) C Overlap D0
60 define(<T3>, <%r10>) C Overlap C0
62 define(<RC>, <%r14>)
64 define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
65 define(<STATE>, <OFFSET($1)(CTX)>)
67 define(<SWAP64>, <pshufd <$>0x4e,>)
69 define(<DIRECT_MOVQ>, <no>)
71 C MOVQ(src, dst), for moves between a general register and an xmm
72 C register.
74 ifelse(DIRECT_MOVQ, yes, <
75 C movq calls that are equal to the corresponding movd,
76 C where the Apple assembler requires them to be written as movd.
77 define(<MOVQ>, <movd $1, $2>)
78 >, <
79 C Moving via (cached) memory is generally faster.
80 define(<MOVQ>, <
81 movq $1, (CTX)
82 movq (CTX), $2
83 >)>)
85 C ROTL64(rot, register, temp)
86 C Caller needs to or together the result.
87 define(<ROTL64>, <
88 movdqa $2, $3
89 psllq <$>$1, $2
90 psrlq <$>eval(64-$1), $3
93 .file "sha3-permute.asm"
95 C sha3_permute(struct sha3_state *ctx)
96 .text
97 ALIGN(16)
98 PROLOGUE(nettle_sha3_permute)
99 W64_ENTRY(1, 16)
100 push %rbp
101 push %r12
102 push %r13
103 push %r14
105 movl $24, XREG(COUNT)
106 lea .rc-8(%rip), RC
107 movq STATE(0), A00
108 movups STATE(1), A0102
109 movups STATE(3), A0304
110 movq A00, C0
112 movq STATE(5), A05
113 movdqa A0102, C12
114 movups STATE(6), A0607
115 movdqa A0304, C34
116 movups STATE(8), A0809
117 xorq A05, C0
119 movq STATE(10), A10
120 pxor A0607, C12
121 movups STATE(11), A1112
122 pxor A0809, C34
123 movups STATE(13), A1314
124 xorq A10, C0
126 movq STATE(15), A15
127 pxor A1112, C12
128 movups STATE(16), A1617
129 pxor A1314, C34
130 movups STATE(18), A1819
131 xorq A15, C0
133 movq STATE(20), A20
134 pxor A1617, C12
135 movups STATE(21), A2122
136 pxor A1819, C34
137 movups STATE(23), A2324
138 xorq A20, C0
139 pxor A2122, C12
140 pxor A2324, C34
142 ALIGN(16)
143 .Loop:
144 C The theta step. Combine parity bits, then xor to state.
145 C D0 = C4 ^ (C1 <<< 1)
146 C D1 = C0 ^ (C2 <<< 1)
147 C D2 = C1 ^ (C3 <<< 1)
148 C D3 = C2 ^ (C4 <<< 1)
149 C D4 = C3 ^ (C0 <<< 1)
151 C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
152 C D34, and (C4, C0) in C34.
154 C Notes on "unpack" instructions:
155 C punpckhqdq 01, 23 gives 31
156 C punpcklqdq 01, 23 gives 20
158 SWAP64 C34, C34 C Holds C4, C3
159 movdqa C12, D34
160 MOVQ(C0, D12)
161 punpcklqdq C12, D12 C Holds C0, C1
162 punpckhqdq C34, D34 C Holds C2, C3
163 punpcklqdq D12, C34 C Holds C4, C0
164 MOVQ(C34, D0)
165 MOVQ(C12, T0)
166 rolq $1, T0
167 xorq T0, D0
169 C Can use C12 as temporary
170 movdqa D34, W0
171 movdqa D34, W1
172 psllq $1, W0
173 psrlq $63, W1
174 pxor W0, D12
175 pxor W1, D12 C Done D12
177 movdqa C34, C12
178 psrlq $63, C34
179 psllq $1, C12
180 pxor C34, D34
181 pxor C12, D34 C Done D34
183 xorq D0, A00
184 xorq D0, A05
185 xorq D0, A10
186 xorq D0, A15
187 xorq D0, A20
188 pxor D12, A0102
189 pxor D12, A0607
190 pxor D12, A1112
191 pxor D12, A1617
192 pxor D12, A2122
193 pxor D34, A0304
194 pxor D34, A0809
195 pxor D34, A1314
196 pxor D34, A1819
197 pxor D34, A2324
199 C theta step done, no C, D or W temporaries alive.
201 C rho and pi steps. When doing the permutations, also
202 C transpose the matrix.
204 C The combined permutation + transpose gives the following
205 C cycles (rotation counts in parenthesis)
206 C 0 <- 0(0)
207 C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
208 C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
209 C 7 <- 7(6)
210 C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
211 C 14 <- 14(39)
212 C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
213 C 16 <- 16(45)
214 C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
215 C 23 <- 23(56)
217 C Do the 1,2,3,4 row. First rotate, then permute.
218 movdqa A0102, W0
219 movdqa A0102, W1
220 movdqa A0102, W2
221 psllq $1, A0102
222 psrlq $63, W0
223 psllq $62, W1
224 por A0102, W0 C rotl 1 (A01)
225 psrlq $2, W2
226 por W1, W2 C rotl 62 (A02)
228 movdqa A0304, A0102
229 movdqa A0304, W1
230 psllq $28, A0102
231 psrlq $36, W1
232 por W1, A0102 C rotl 28 (A03)
233 movdqa A0304, W1
234 psllq $27, A0304
235 psrlq $37, W1
236 por W1, A0304 C rotl 27 (A04)
238 punpcklqdq W0, A0102
239 punpckhqdq W2, A0304
241 C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
242 C 7 <- 7(6)
243 C __ _______
244 C _ L' ` L_ __`
245 C |5| |6|7| |8|9|
246 C `-_________-^`-^
248 rolq $36, A05
249 MOVQ(A05, W0)
250 MOVQ(A0607, A05)
251 rolq $44, A05 C Done A05
252 ROTL64(6, A0607, W1)
253 por A0607, W1
254 movdqa A0809, A0607
255 ROTL64(20, A0607, W2)
256 por W2, A0607
257 punpckhqdq W1, A0607 C Done A0607
258 ROTL64(55, A0809, W1)
259 por A0809, W1
260 movdqa W0, A0809
261 punpcklqdq W1, A0809 C Done 0809
263 C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
264 C 14 <- 14(39)
265 C _____ ___
266 C __L' __`_L_ `_____
267 C |10| |11|12| |13|14|
268 C `-___-^`-______-^
271 rolq $42, A10 C 42 + 25 = 3 (mod 64)
272 SWAP64 A1112, W0
273 MOVQ(A10, A1112)
274 MOVQ(W0, A10)
275 rolq $43, A10 C Done A10
277 punpcklqdq A1314, A1112
278 ROTL64(25, A1112, W1)
279 por W1, A1112 C Done A1112
280 ROTL64(39, A1314, W2)
281 por A1314, W2
282 ROTL64(10, W0, A1314)
283 por W0, A1314
284 punpckhqdq W2, A1314 C Done A1314
287 C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
288 C 16 <- 16(45)
289 C _____________
290 C / _______
291 C _L' ____L' | `_
292 C |15| |16|17| |18|19|
293 C \ `_____-^ ^
294 C \_________________/
296 SWAP64 A1819, W0
297 rolq $41, A15
298 MOVQ(A15, W1)
299 MOVQ(A1819, A15)
300 rolq $21, A15 C Done A15
301 SWAP64 A1617, A1819
302 ROTL64(45, A1617, W2)
303 por W2, A1617
304 ROTL64(8, W0, W3)
305 por W3, W0
306 punpcklqdq W0, A1617 C Done A1617
307 ROTL64(15, A1819, W2)
308 por W2, A1819
309 punpcklqdq W1, A1819 C Done A1819
311 C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
312 C 23 <- 23(56)
313 C _______________
314 C / \
315 C _L' _L'\_ ___`_
316 C |20| |21|22| |23|24|
317 C \ `__ ^________-^
318 C \_______/
320 rolq $18, A20
321 MOVQ(A20, W0)
322 SWAP64 A2324, W1
323 movd W1, A20
324 rolq $14, A20 C Done A20
325 ROTL64(56, A2324, W1)
326 por W1, A2324
328 movdqa A2122, W2
329 ROTL64(2, W2, W1)
330 por W1, W2
331 punpcklqdq W2, A2324 C Done A2324
333 ROTL64(61, A2122, W1)
334 por W1, A2122
335 psrldq $8, A2122
336 punpcklqdq W0, A2122 C Done A2122
338 C chi step. With the transposed matrix, applied independently
339 C to each column.
340 movq A05, T0
341 notq T0
342 andq A10, T0
343 movq A10, T1
344 notq T1
345 andq A15, T1
346 movq A15, T2
347 notq T2
348 andq A20, T2
349 xorq T2, A10
350 movq A20, T3
351 notq T3
352 andq A00, T3
353 xorq T3, A15
354 movq A00, T2
355 notq T2
356 andq A05, T2
357 xorq T2, A20
358 xorq T0, A00
359 xorq T1, A05
361 movdqa A0607, W0
362 pandn A1112, W0
363 movdqa A1112, W1
364 pandn A1617, W1
365 movdqa A1617, W2
366 pandn A2122, W2
367 pxor W2, A1112
368 movdqa A2122, W3
369 pandn A0102, W3
370 pxor W3, A1617
371 movdqa A0102, W2
372 pandn A0607, W2
373 pxor W2, A2122
374 pxor W0, A0102
375 pxor W1, A0607
377 movdqa A0809, W0
378 pandn A1314, W0
379 movdqa A1314, W1
380 pandn A1819, W1
381 movdqa A1819, W2
382 pandn A2324, W2
383 pxor W2, A1314
384 movdqa A2324, W3
385 pandn A0304, W3
386 pxor W3, A1819
387 movdqa A0304, W2
388 pandn A0809, W2
389 pxor W2, A2324
390 pxor W0, A0304
391 pxor W1, A0809
393 xorq (RC, COUNT, 8), A00
395 C Transpose.
396 C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
397 C and also copy to C12 and C34 while at it.
399 MOVQ(A05, C12)
400 MOVQ(A15, C34)
401 MOVQ(A10, W0)
402 MOVQ(A20, W1)
403 movq A00, C0
404 punpcklqdq W0, C12
405 punpcklqdq W1, C34
406 MOVQ(A0102, A05)
407 MOVQ(A0304, A15)
408 psrldq $8, A0102
409 psrldq $8, A0304
410 xorq A05, C0
411 xorq A15, C0
412 MOVQ(A0102, A10)
413 MOVQ(A0304, A20)
415 movdqa C12, A0102
416 movdqa C34, A0304
418 C Transpose (A0607, A1112)
419 movdqa A0607, W0
420 punpcklqdq A1112, A0607
421 xorq A10, C0
422 xorq A20, C0
423 punpckhqdq W0, A1112
424 SWAP64 A1112, A1112
426 C Transpose (A1819, A2324)
427 movdqa A1819, W0
428 punpcklqdq A2324, A1819
429 pxor A0607, C12
430 pxor A1112, C12
431 punpckhqdq W0, A2324
432 SWAP64 A2324, A2324
434 C Transpose (A0809, A1314) and (A1617, A2122), and swap
435 movdqa A0809, W0
436 movdqa A1314, W1
437 movdqa A1617, A0809
438 movdqa A2122, A1314
439 pxor A1819, C34
440 pxor A2324, C34
441 punpcklqdq A2122, A0809
442 punpckhqdq A1617, A1314
443 SWAP64 A1314, A1314
444 movdqa W0, A1617
445 movdqa W1, A2122
446 pxor A0809, C34
447 pxor A1314, C34
448 punpcklqdq W1, A1617
449 punpckhqdq W0, A2122
450 SWAP64 A2122, A2122
452 decl XREG(COUNT)
453 pxor A1617, C12
454 pxor A2122, C12
455 jnz .Loop
457 movq A00, STATE(0)
458 movups A0102, STATE(1)
459 movups A0304, STATE(3)
461 movq A05, STATE(5)
462 movups A0607, STATE(6)
463 movups A0809, STATE(8)
465 movq A10, STATE(10)
466 movups A1112, STATE(11)
467 movups A1314, STATE(13)
469 movq A15, STATE(15)
470 movups A1617, STATE(16)
471 movups A1819, STATE(18)
473 movq A20, STATE(20)
474 movups A2122, STATE(21)
475 movups A2324, STATE(23)
477 pop %r14
478 pop %r13
479 pop %r12
480 pop %rbp
481 W64_EXIT(1, 16)
484 EPILOGUE(nettle_sha3_permute)
486 ALIGN(16)
487 .rc: C In reverse order
488 .quad 0x8000000080008008
489 .quad 0x0000000080000001
490 .quad 0x8000000000008080
491 .quad 0x8000000080008081
492 .quad 0x800000008000000A
493 .quad 0x000000000000800A
494 .quad 0x8000000000000080
495 .quad 0x8000000000008002
496 .quad 0x8000000000008003
497 .quad 0x8000000000008089
498 .quad 0x800000000000008B
499 .quad 0x000000008000808B
500 .quad 0x000000008000000A
501 .quad 0x0000000080008009
502 .quad 0x0000000000000088
503 .quad 0x000000000000008A
504 .quad 0x8000000000008009
505 .quad 0x8000000080008081
506 .quad 0x0000000080000001
507 .quad 0x000000000000808B
508 .quad 0x8000000080008000
509 .quad 0x800000000000808A
510 .quad 0x0000000000008082
511 .quad 0x0000000000000001