Correct PPTP server firewall rules chain.
[tomato/davidwu.git] / release / src / router / nettle / x86_64 / serpent-decrypt.asm
blobd6bacb5d8e4f3977578c7304a29acc5046314a77
1 C nettle, low-level cryptographics library
2 C
3 C Copyright (C) 2011 Niels Möller
4 C
5 C The nettle library is free software; you can redistribute it and/or modify
6 C it under the terms of the GNU Lesser General Public License as published by
7 C the Free Software Foundation; either version 2.1 of the License, or (at your
8 C option) any later version.
9 C
10 C The nettle library is distributed in the hope that it will be useful, but
11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13 C License for more details.
15 C You should have received a copy of the GNU Lesser General Public License
16 C along with the nettle library; see the file COPYING.LIB. If not, write to
17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 C MA 02111-1301, USA.
20 include_src(<x86_64/serpent.m4>)
22 C Register usage:
24 C Single block serpent state, two copies
C x0 - x3 and y0 - y3 name the 32-bit registers handed to the scalar
C SBOX/LTI macros by the single-block loop; the two groups swap the
C input and output roles from one round to the next.
C %rbx, %rbp and %r12 (behind x1, x2, y3) are pushed on entry to
C nettle_serpent_decrypt and popped again at .Lend.
25 define(<x0>, <%eax>)
26 define(<x1>, <%ebx>)
27 define(<x2>, <%ebp>)
28 define(<x3>, <%r8d>)
30 define(<y0>, <%r9d>)
31 define(<y1>, <%r10d>)
32 define(<y2>, <%r11d>)
33 define(<y3>, <%r12d>)
35 C Quadruple block serpent state, two copies
C X0 - X3 and Y0 - Y3 play the same alternating roles for the WSBOX/WLTI
C macros in the four-block loop.
36 define(<X0>, <%xmm0>)
37 define(<X1>, <%xmm1>)
38 define(<X2>, <%xmm2>)
39 define(<X3>, <%xmm3>)
41 define(<Y0>, <%xmm4>)
42 define(<Y1>, <%xmm5>)
43 define(<Y2>, <%xmm6>)
44 define(<Y3>, <%xmm7>)
C MINUS1 is filled with all-one bits (pcmpeqd of the register with itself)
C before the four-block loop and is the mask PNOT xors with.
C T0 is the scratch register written by WLTI. T1 - T3 are not referenced
C directly in this file; presumably they are scratch for the macros pulled
C in from x86_64/serpent.m4 - TODO confirm.
46 define(<MINUS1>, <%xmm8>)
47 define(<T0>, <%xmm9>)
48 define(<T1>, <%xmm10>)
49 define(<T2>, <%xmm11>)
50 define(<T3>, <%xmm12>)
52 C Arguments
C Same order as the C prototype given above PROLOGUE: ctx, length, dst, src.
53 define(<CTX>, <%rdi>)
54 define(<N>, <%rsi>)
55 define(<DST>, <%rdx>)
56 define(<SRC>, <%rcx>)
C CNT is the byte offset into the context used to index the round subkeys;
C it starts at 384 and steps down by 128. TMP32 is the scratch register
C clobbered by LTI. Both %r13 and %r14 are saved and restored by the
C function.
58 define(<CNT>, <%r13>)
59 define(<TMP32>, <%r14d>)
61 C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
C SBOX0I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: combines the input registers $1 - $4
C with mov/xor/or/and/not and leaves the four result words in $5 - $8.
C $1 - $4 are overwritten as scratch. Each of $5 - $8 is first written by
C a mov, so the caller need not initialise the outputs.
C Presumably the inverse of Serpent S-box 0 (suggested by the I suffix and
C its place in the decrypt rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
63 define(<SBOX0I>, <
64 mov $1, $5
65 xor $3, $5
66 mov $1, $7
67 or $2, $7
68 mov $3, $6
69 xor $4, $6
70 xor $6, $7
71 and $3, $6
72 or $2, $3
73 xor $4, $2
74 or $1, $6
75 and $3, $2
76 xor $2, $6
77 or $7, $1
78 xor $6, $1
79 mov $7, $2
80 and $1, $2
C Complement; $7 is only read, never written, after this point.
81 not $7
82 or $7, $4
83 xor $3, $4
84 mov $1, $8
85 xor $4, $8
86 or $4, $2
87 xor $2, $5
C SBOX1I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov, so its previous contents are ignored.
C Presumably the inverse of Serpent S-box 1 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
90 define(<SBOX1I>, <
91 mov $2, $6
92 or $4, $6
93 xor $3, $6
94 mov $1, $8
95 xor $2, $8
96 mov $1, $5
97 or $6, $5
98 and $8, $5
99 xor $5, $2
100 xor $6, $8
101 and $4, $2
102 mov $1, $7
103 and $3, $7
104 or $7, $6
105 or $4, $7
106 xor $5, $7
C Complement; $7 is only read, never written, after this point.
107 not $7
108 xor $2, $6
109 xor $6, $5
110 xor $3, $5
111 or $7, $1
112 xor $1, $5
C SBOX2I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov; $6 and $7 are reloaded by a second mov part-way
C through, so they double as temporaries before holding their results.
C Presumably the inverse of Serpent S-box 2 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
115 define(<SBOX2I>, <
116 mov $1, $5
117 xor $4, $5
118 mov $3, $7
119 xor $4, $7
120 mov $2, $6
121 or $7, $6
122 xor $6, $5
123 mov $4, $6
124 or $5, $6
125 and $2, $6
C Complement of input $4; it is read once more, by the or into $1 below.
126 not $4
127 mov $1, $8
128 or $3, $8
129 and $8, $7
130 xor $7, $6
131 and $2, $8
132 and $3, $1
133 or $4, $1
134 xor $1, $8
135 and $8, $3
136 xor $1, $3
137 mov $5, $7
138 xor $6, $7
139 xor $3, $7
C SBOX3I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov. Unlike the other scalar S-box macros here, this
C one contains no not instruction.
C Presumably the inverse of Serpent S-box 3 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
142 define(<SBOX3I>, <
143 mov $3, $8
144 or $4, $8
145 mov $2, $5
146 and $8, $5
147 mov $1, $7
148 or $4, $7
149 mov $3, $6
150 xor $7, $6
151 xor $6, $5
152 xor $1, $4
153 xor $4, $8
154 xor $2, $7
155 and $6, $7
156 xor $4, $7
157 xor $1, $6
158 or $5, $4
159 and $4, $6
160 xor $2, $6
161 and $7, $1
162 or $2, $1
163 xor $1, $8
C SBOX4I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov, so its previous contents are ignored.
C Presumably the inverse of Serpent S-box 4 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
166 define(<SBOX4I>, <
167 mov $3, $6
168 xor $4, $6
169 mov $3, $7
170 or $4, $7
171 xor $2, $7
172 or $4, $2
173 mov $1, $5
174 xor $7, $5
175 xor $7, $4
176 and $1, $7
177 xor $7, $6
178 xor $1, $7
179 or $3, $7
180 and $2, $1
181 mov $1, $8
182 xor $4, $8
C $1 has just been copied into $8; from here on it is a temporary that is
C folded into $5 and $7.
183 not $1
184 or $6, $1
185 xor $1, $5
186 xor $2, $1
187 xor $1, $7
C SBOX5I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov; $7 is reloaded by a second mov part-way through,
C so it serves as a temporary before holding its result.
C Presumably the inverse of Serpent S-box 5 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
190 define(<SBOX5I>, <
191 mov $1, $6
192 and $4, $6
193 mov $3, $8
194 xor $6, $8
195 mov $2, $5
196 and $8, $5
197 mov $1, $7
198 xor $4, $7
199 xor $2, $4
200 xor $7, $5
201 and $1, $3
202 and $5, $1
203 or $2, $3
204 xor $5, $6
205 xor $3, $6
206 mov $5, $7
207 or $6, $7
208 xor $8, $7
209 xor $4, $7
C Input $2 becomes a temporary here; it feeds the final xor into $8.
210 not $2
211 or $1, $2
212 xor $2, $8
C SBOX6I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov; $5 is reloaded by a second mov part-way through,
C so it serves as a temporary before holding its result.
C Presumably the inverse of Serpent S-box 6 (I suffix, used by the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
215 define(<SBOX6I>, <
216 mov $1, $7
217 xor $3, $7
C Input $3 is complemented right after its original value went into $7.
218 not $3
219 mov $2, $5
220 xor $4, $5
221 mov $1, $6
222 or $3, $6
223 xor $5, $6
224 mov $2, $8
225 and $7, $8
226 or $4, $8
227 or $3, $4
228 or $2, $3
229 and $1, $3
230 mov $3, $5
231 xor $8, $5
C Complement; $5 is only read, never written, after this point.
232 not $5
233 and $7, $8
234 xor $3, $8
235 xor $6, $1
236 xor $1, $8
237 and $5, $2
238 xor $2, $7
239 xor $4, $7
C SBOX7I(i0, i1, i2, i3, o0, o1, o2, o3)
C Scalar S-box step for one block: reads $1 - $4, using them as scratch,
C and produces the four result words in $5 - $8. Every output register is
C first written by a mov; $6 is reloaded by a second mov part-way through,
C so it serves as a temporary before holding its result.
C This is the first S-box applied in each pass of the single-block round
C loop. Presumably the inverse of Serpent S-box 7 (I suffix, used by the
C decrypt rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
242 define(<SBOX7I>, <
243 mov $1, $8
244 and $2, $8
245 mov $2, $7
246 xor $4, $7
247 or $8, $7
248 mov $1, $6
249 or $4, $6
250 and $3, $6
251 xor $6, $7
252 or $3, $8
253 mov $1, $5
254 or $2, $5
255 and $4, $5
256 xor $5, $8
257 xor $2, $5
258 mov $4, $6
259 xor $8, $6
C Complement of the temporary in $6; two more updates follow before $6
C holds its result.
260 not $6
261 or $5, $6
262 xor $3, $5
263 xor $1, $6
264 or $6, $4
265 xor $4, $5
C LTI(w0, w1, w2, w3)
C Scalar linear mixing step applied between S-box rounds: updates the
C four registers $1 - $4 in place using rotates, left shifts and xors.
C Clobbers TMP32, which holds the shifted copies. Presumably the inverse
C of the Serpent linear transformation (by name and use in the decrypt
C rounds) - confirm against the cipher spec.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
268 define(<LTI>, <
269 rol <$>10, $3
270 rol <$>27, $1
C $3 ^= $4 ^ ($2 shifted left by 7)
271 mov $2, TMP32
272 shl <$>7, TMP32
273 xor $4, $3
274 xor TMP32, $3
C $1 ^= $2 ^ $4
275 xor $2, $1
276 xor $4, $1
277 rol <$>25, $4
278 rol <$>31, $2
C $4 ^= $3 ^ ($1 shifted left by 3)
279 mov $1, TMP32
280 shl <$>3, TMP32
281 xor $3, $4
282 xor TMP32, $4
C $2 ^= $1 ^ $3
283 xor $1, $2
284 xor $3, $2
285 rol <$>29, $3
286 rol <$>19, $1
C PNOT(reg)
C Bitwise complement of $1, done as an xor with MINUS1. This relies on
C MINUS1 holding all-one bits, which the function sets up with pcmpeqd
C before entering the four-block loop. The WSBOX macros use it where the
C scalar SBOX macros use not.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
289 define(<PNOT>, <
290 pxor MINUS1, $1
C WSBOX0I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX0I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
293 define(<WSBOX0I>, <
294 movdqa $1, $5
295 pxor $3, $5
296 movdqa $1, $7
297 por $2, $7
298 movdqa $3, $6
299 pxor $4, $6
300 pxor $6, $7
301 pand $3, $6
302 por $2, $3
303 pxor $4, $2
304 por $1, $6
305 pand $3, $2
306 pxor $2, $6
307 por $7, $1
308 pxor $6, $1
309 movdqa $7, $2
310 pand $1, $2
C Complement; $7 is only read, never written, after this point.
311 PNOT($7)
312 por $7, $4
313 pxor $3, $4
314 movdqa $1, $8
315 pxor $4, $8
316 por $4, $2
317 pxor $2, $5
C WSBOX1I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX1I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
320 define(<WSBOX1I>, <
321 movdqa $2, $6
322 por $4, $6
323 pxor $3, $6
324 movdqa $1, $8
325 pxor $2, $8
326 movdqa $1, $5
327 por $6, $5
328 pand $8, $5
329 pxor $5, $2
330 pxor $6, $8
331 pand $4, $2
332 movdqa $1, $7
333 pand $3, $7
334 por $7, $6
335 por $4, $7
336 pxor $5, $7
C Complement; $7 is only read, never written, after this point.
337 PNOT($7)
338 pxor $2, $6
339 pxor $6, $5
340 pxor $3, $5
341 por $7, $1
342 pxor $1, $5
C WSBOX2I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX2I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
345 define(<WSBOX2I>, <
346 movdqa $1, $5
347 pxor $4, $5
348 movdqa $3, $7
349 pxor $4, $7
350 movdqa $2, $6
351 por $7, $6
352 pxor $6, $5
353 movdqa $4, $6
354 por $5, $6
355 pand $2, $6
C Complement of input $4; it is read once more, by the por into $1 below.
356 PNOT($4)
357 movdqa $1, $8
358 por $3, $8
359 pand $8, $7
360 pxor $7, $6
361 pand $2, $8
362 pand $3, $1
363 por $4, $1
364 pxor $1, $8
365 pand $8, $3
366 pxor $1, $3
367 movdqa $5, $7
368 pxor $6, $7
369 pxor $3, $7
C WSBOX3I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX3I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and. Reads
C $1 - $4 (overwritten as scratch) and leaves the results in $5 - $8.
C This one has no complement step, so it does not touch MINUS1.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
372 define(<WSBOX3I>, <
373 movdqa $3, $8
374 por $4, $8
375 movdqa $2, $5
376 pand $8, $5
377 movdqa $1, $7
378 por $4, $7
379 movdqa $3, $6
380 pxor $7, $6
381 pxor $6, $5
382 pxor $1, $4
383 pxor $4, $8
384 pxor $2, $7
385 pand $6, $7
386 pxor $4, $7
387 pxor $1, $6
388 por $5, $4
389 pand $4, $6
390 pxor $2, $6
391 pand $7, $1
392 por $2, $1
393 pxor $1, $8
C WSBOX4I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX4I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
396 define(<WSBOX4I>, <
397 movdqa $3, $6
398 pxor $4, $6
399 movdqa $3, $7
400 por $4, $7
401 pxor $2, $7
402 por $4, $2
403 movdqa $1, $5
404 pxor $7, $5
405 pxor $7, $4
406 pand $1, $7
407 pxor $7, $6
408 pxor $1, $7
409 por $3, $7
410 pand $2, $1
411 movdqa $1, $8
412 pxor $4, $8
C $1 has just been copied into $8; from here on it is a temporary that is
C folded into $5 and $7.
413 PNOT($1)
414 por $6, $1
415 pxor $1, $5
416 pxor $2, $1
417 pxor $1, $7
C WSBOX5I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX5I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
420 define(<WSBOX5I>, <
421 movdqa $1, $6
422 pand $4, $6
423 movdqa $3, $8
424 pxor $6, $8
425 movdqa $2, $5
426 pand $8, $5
427 movdqa $1, $7
428 pxor $4, $7
429 pxor $2, $4
430 pxor $7, $5
431 pand $1, $3
432 pand $5, $1
433 por $2, $3
434 pxor $5, $6
435 pxor $3, $6
436 movdqa $5, $7
437 por $6, $7
438 pxor $8, $7
439 pxor $4, $7
C Input $2 becomes a temporary here; it feeds the final pxor into $8.
440 PNOT($2)
441 por $1, $2
442 pxor $2, $8
C WSBOX6I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX6I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
445 define(<WSBOX6I>, <
446 movdqa $1, $7
447 pxor $3, $7
C Input $3 is complemented right after its original value went into $7.
448 PNOT($3)
449 movdqa $2, $5
450 pxor $4, $5
451 movdqa $1, $6
452 por $3, $6
453 pxor $5, $6
454 movdqa $2, $8
455 pand $7, $8
456 por $4, $8
457 por $3, $4
458 por $2, $3
459 pand $1, $3
460 movdqa $3, $5
461 pxor $8, $5
C Complement; $5 is only read, never written, after this point.
462 PNOT($5)
463 pand $7, $8
464 pxor $3, $8
465 pxor $6, $1
466 pxor $1, $8
467 pand $5, $2
468 pxor $2, $7
469 pxor $4, $7
C WSBOX7I(i0, i1, i2, i3, o0, o1, o2, o3)
C SSE2 form of SBOX7I for the four-block loop: the same instruction
C sequence with movdqa/pxor/por/pand in place of mov/xor/or/and and PNOT
C in place of not. Reads $1 - $4 (overwritten as scratch) and leaves the
C results in $5 - $8. Requires MINUS1 to hold all-one bits (see PNOT).
C This is the first S-box applied in each pass of the four-block round
C loop.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
472 define(<WSBOX7I>, <
473 movdqa $1, $8
474 pand $2, $8
475 movdqa $2, $7
476 pxor $4, $7
477 por $8, $7
478 movdqa $1, $6
479 por $4, $6
480 pand $3, $6
481 pxor $6, $7
482 por $3, $8
483 movdqa $1, $5
484 por $2, $5
485 pand $4, $5
486 pxor $5, $8
487 pxor $2, $5
488 movdqa $4, $6
489 pxor $8, $6
C Complement of the temporary in $6; two more updates follow before $6
C holds its result.
490 PNOT($6)
491 por $5, $6
492 pxor $3, $5
493 pxor $1, $6
494 por $6, $4
495 pxor $4, $5
C WLTI(w0, w1, w2, w3)
C SSE2 form of LTI for the four-block loop: the same sequence of rotates,
C left shifts and xors, applied in place to the xmm registers $1 - $4.
C Rotates go through WROL, which is not defined in this file (presumably
C it comes from x86_64/serpent.m4 and may use further T registers - TODO
C confirm). T0 is overwritten with the shifted copies.
C NOTE(review): the terminating quote and parenthesis of this define are
C not visible in this copy of the file.
498 define(<WLTI>, <
499 WROL(10, $3)
500 WROL(27, $1)
C $3 ^= $4 ^ ($2 shifted left by 7 in each 32-bit lane)
501 movdqa $2, T0
502 pslld <$>7, T0
503 pxor $4, $3
504 pxor T0, $3
C $1 ^= $2 ^ $4
505 pxor $2, $1
506 pxor $4, $1
507 WROL(25, $4)
508 WROL(31, $2)
C $4 ^= $3 ^ ($1 shifted left by 3 in each 32-bit lane)
509 movdqa $1, T0
510 pslld <$>3, T0
511 pxor $3, $4
512 pxor T0, $4
C $2 ^= $1 ^ $3
513 pxor $1, $2
514 pxor $3, $2
515 WROL(29, $3)
516 WROL(19, $1)
519 .file "serpent-decrypt.asm"
521 C serpent_decrypt(struct serpent_context *ctx,
522 C unsigned length, uint8_t *dst,
523 C const uint8_t *src)
524 .text
525 ALIGN(16)
526 PROLOGUE(nettle_serpent_decrypt)
527 C save all registers that need to be saved
528 W64_ENTRY(4, 13)
529 push %rbx
530 push %rbp
531 push %r12
532 push %r13
533 push %r14
535 lea (SRC, N), SRC
536 lea (DST, N), DST
537 neg N
538 jz .Lend
540 cmp $-64, N
541 ja .Lblock_loop
543 pcmpeqd MINUS1, MINUS1
545 .Lwblock_loop:
546 movups (SRC, N), X0
547 movups 16(SRC, N), X1
548 movups 32(SRC, N), X2
549 movups 48(SRC, N), X3
551 WTRANSPOSE(X0,X1,X2,X3)
553 mov $384, CNT
555 C FIXME: CNT known, no index register needed
556 WKEYXOR(128, X0,X1,X2,X3)
558 jmp .Lwround_start
560 ALIGN(16)
562 .Lwround_loop:
563 WLTI(X0,X1,X2,X3)
564 .Lwround_start:
565 WSBOX7I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
566 WKEYXOR(112, Y0,Y1,Y2,Y3)
568 WLTI(Y0,Y1,Y2,Y3)
569 WSBOX6I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
570 WKEYXOR(96, X0,X1,X2,X3)
572 WLTI(X0,X1,X2,X3)
573 WSBOX5I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
574 WKEYXOR(80, Y0,Y1,Y2,Y3)
576 WLTI(Y0,Y1,Y2,Y3)
577 WSBOX4I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
578 WKEYXOR(64, X0,X1,X2,X3)
580 WLTI(X0,X1,X2,X3)
581 WSBOX3I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
582 WKEYXOR(48, Y0,Y1,Y2,Y3)
584 WLTI(Y0,Y1,Y2,Y3)
585 WSBOX2I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
586 WKEYXOR(32, X0,X1,X2,X3)
588 WLTI(X0,X1,X2,X3)
589 WSBOX1I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
590 WKEYXOR(16, Y0,Y1,Y2,Y3)
592 WLTI(Y0,Y1,Y2,Y3)
593 WSBOX0I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
594 WKEYXOR(, X0,X1,X2,X3)
596 sub $128, CNT
597 jnc .Lwround_loop
599 WTRANSPOSE(X0,X1,X2,X3)
601 movups X0, (DST, N)
602 movups X1, 16(DST, N)
603 movups X2, 32(DST, N)
604 movups X3, 48(DST, N)
606 C FIXME: Adjust N, so we can use just jnc without an extra cmp.
607 add $64, N
608 jz .Lend
610 cmp $-64, N
611 jbe .Lwblock_loop
613 .Lblock_loop:
614 movl (SRC, N), x0
615 movl 4(SRC, N), x1
616 movl 8(SRC, N), x2
617 movl 12(SRC, N), x3
619 xor 512(CTX), x0
620 xor 516(CTX), x1
621 xor 520(CTX), x2
622 xor 524(CTX), x3
624 mov $384, CNT
625 jmp .Lround_start
627 ALIGN(16)
628 .Lround_loop:
629 LTI(x0,x1,x2,x3)
630 .Lround_start:
631 SBOX7I(x0,x1,x2,x3, y0,y1,y2,y3)
632 xor 112(CTX, CNT), y0
633 xor 116(CTX, CNT), y1
634 xor 120(CTX, CNT), y2
635 xor 124(CTX, CNT), y3
637 LTI(y0,y1,y2,y3)
638 SBOX6I(y0,y1,y2,y3, x0,x1,x2,x3)
639 xor 96(CTX, CNT), x0
640 xor 100(CTX, CNT), x1
641 xor 104(CTX, CNT), x2
642 xor 108(CTX, CNT), x3
644 LTI(x0,x1,x2,x3)
645 SBOX5I(x0,x1,x2,x3, y0,y1,y2,y3)
646 xor 80(CTX, CNT), y0
647 xor 84(CTX, CNT), y1
648 xor 88(CTX, CNT), y2
649 xor 92(CTX, CNT), y3
651 LTI(y0,y1,y2,y3)
652 SBOX4I(y0,y1,y2,y3, x0,x1,x2,x3)
653 xor 64(CTX, CNT), x0
654 xor 68(CTX, CNT), x1
655 xor 72(CTX, CNT), x2
656 xor 76(CTX, CNT), x3
658 LTI(x0,x1,x2,x3)
659 SBOX3I(x0,x1,x2,x3, y0,y1,y2,y3)
660 xor 48(CTX, CNT), y0
661 xor 52(CTX, CNT), y1
662 xor 56(CTX, CNT), y2
663 xor 60(CTX, CNT), y3
665 LTI(y0,y1,y2,y3)
666 SBOX2I(y0,y1,y2,y3, x0,x1,x2,x3)
667 xor 32(CTX, CNT), x0
668 xor 36(CTX, CNT), x1
669 xor 40(CTX, CNT), x2
670 xor 44(CTX, CNT), x3
672 LTI(x0,x1,x2,x3)
673 SBOX1I(x0,x1,x2,x3, y0,y1,y2,y3)
674 xor 16(CTX, CNT), y0
675 xor 20(CTX, CNT), y1
676 xor 24(CTX, CNT), y2
677 xor 28(CTX, CNT), y3
679 LTI(y0,y1,y2,y3)
680 SBOX0I(y0,y1,y2,y3, x0,x1,x2,x3)
681 xor (CTX, CNT), x0
682 xor 4(CTX, CNT), x1
683 xor 8(CTX, CNT), x2
684 xor 12(CTX, CNT), x3
685 sub $128, CNT
686 jnc .Lround_loop
688 movl x0, (DST, N)
689 movl x1, 4(DST, N)
690 movl x2, 8(DST, N)
691 movl x3, 12(DST, N)
692 add $16, N
693 jnc .Lblock_loop
695 .Lend:
696 pop %r14
697 pop %r13
698 pop %r12
699 pop %rbp
700 pop %rbx
701 W64_EXIT(4, 13)