C Source: x86_64/serpent-encrypt.asm from the nettle library,
C as carried in the tomato/davidwu router tree.
C blob 613ef41e003b1612a215b824859a7d533c7b354d
C nettle, low-level cryptographics library
C
C Copyright (C) 2011 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
20 include_src(<x86_64/serpent.m4>)
22 C Register usage:
24 C Single block serpent state, two copies
25 define(<x0>, <%eax>)
26 define(<x1>, <%ebx>)
27 define(<x2>, <%ebp>)
28 define(<x3>, <%r8d>)
30 define(<y0>, <%r9d>)
31 define(<y1>, <%r10d>)
32 define(<y2>, <%r11d>)
33 define(<y3>, <%r12d>)
35 C Quadruple block serpent state, two copies
36 define(<X0>, <%xmm0>)
37 define(<X1>, <%xmm1>)
38 define(<X2>, <%xmm2>)
39 define(<X3>, <%xmm3>)
41 define(<Y0>, <%xmm4>)
42 define(<Y1>, <%xmm5>)
43 define(<Y2>, <%xmm6>)
44 define(<Y3>, <%xmm7>)
46 define(<MINUS1>, <%xmm8>)
47 define(<T0>, <%xmm9>)
48 define(<T1>, <%xmm10>)
49 define(<T2>, <%xmm11>)
50 define(<T3>, <%xmm12>)
52 C Arguments
53 define(<CTX>, <%rdi>)
54 define(<N>, <%rsi>)
55 define(<DST>, <%rdx>)
56 define(<SRC>, <%rcx>)
58 define(<CNT>, <%r13>)
59 define(<TMP32>, <%r14d>)
61 C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
63 define(<SBOX0>, <
64 mov $2, $8 C y3 = x1 ^ x2
65 xor $3, $8
66 mov $1, $5 C y0 = x0 | x3
67 or $4, $5
68 mov $1, $6 C y1 = x0 ^ x1
69 xor $2, $6
70 xor $5, $8 C y3 ^= y0
71 mov $3, $7 C y2 = x2 | y3
72 or $8, $7
73 xor $4, $1 C x0 ^= x3
74 and $4, $7 C y2 &= x3
75 xor $3, $4 C x3 ^= x2
76 or $2, $3 C x2 |= x1
77 mov $6, $5 C y0 = y1 & x2
78 and $3, $5
79 xor $5, $7 C y2 ^= y0
80 and $7, $5 C y0 &= y2
81 xor $3, $5 C y0 ^= x2
82 and $1, $2 C x1 &= x0
83 xor $1, $5 C y0 ^= x0
84 not $5 C y0 = ~y0
85 mov $5, $6 C y1 = y0 ^ x1
86 xor $2, $6
87 xor $4, $6 C y1 ^= x3
90 define(<SBOX1>, <
91 mov $1, $6 C y1 = x0 | x3
92 or $4, $6
93 mov $3, $7 C y2 = x2 ^ x3
94 xor $4, $7
95 mov $2, $5 C y0 = ~x1
96 not $5
97 mov $1, $8 C y3 = x0 ^ x2
98 xor $3, $8
99 or $1, $5 C y0 |= x0
100 and $4, $8 C y3 &= x3
101 mov $6, $1 C x0 = y1 & y2
102 and $7, $1
103 or $2, $8 C y3 |= x1
104 xor $5, $7 C y2 ^= y0
105 xor $1, $8 C y3 ^= x0
106 mov $6, $1 C x0 = y1 ^ y3
107 xor $8, $1
108 xor $7, $1 C x0 ^= y2
109 mov $2, $6 C y1 = x1 & x3
110 and $4, $6
111 xor $1, $6 C y1 ^= x0
112 mov $6, $4 C x3 = y1 | y3
113 or $8, $4
114 not $8 C y3 = ~y3
115 and $4, $5 C y0 &= x3
116 xor $3, $5 C y0 ^= x2
119 define(<SBOX2>, <
120 mov $1, $7 C y2 = x1 | x2
121 or $3, $7
122 mov $1, $6
123 xor $2, $6
124 mov $4, $8
125 xor $7, $8
126 mov $6, $5
127 xor $8, $5
128 or $1, $4
129 xor $5, $3
130 mov $2, $1
131 xor $3, $1
132 or $2, $3
133 and $7, $1
134 xor $3, $8
135 or $8, $6
136 xor $1, $6
137 mov $8, $7
138 xor $6, $7
139 xor $2, $7
140 not $8
141 xor $4, $7
144 define(<SBOX3>, <
145 mov $1, $6
146 xor $3, $6
147 mov $1, $5
148 or $4, $5
149 mov $1, $8
150 and $4, $8
151 and $5, $6
152 or $2, $8
153 mov $1, $7
154 and $2, $7
155 or $3, $7
156 mov $4, $3
157 xor $6, $3
158 xor $8, $6
159 or $3, $1
160 xor $2, $3
161 and $4, $8
162 xor $8, $5
163 mov $7, $8
164 xor $3, $8
165 xor $5, $7
166 or $8, $4
167 and $4, $2
168 mov $1, $5
169 xor $2, $5
172 define(<SBOX4>, <
173 mov $1, $8
174 or $2, $8
175 mov $2, $7
176 or $3, $7
177 xor $1, $7
178 and $4, $8
179 mov $2, $5
180 xor $4, $5
181 or $7, $4
182 and $4, $1
183 and $3, $2
184 xor $8, $3
185 xor $7, $8
186 or $2, $7
187 mov $8, $6
188 and $5, $6
189 xor $6, $7
190 xor $5, $6
191 or $2, $6
192 xor $1, $6
193 and $4, $5
194 xor $3, $5
195 not $5
198 define(<SBOX5>, <
199 mov $2, $5
200 or $4, $5
201 xor $3, $5
202 mov $2, $3
203 xor $4, $3
204 mov $1, $7
205 xor $3, $7
206 and $3, $1
207 xor $1, $5
208 mov $2, $8
209 or $7, $8
210 or $5, $2
211 not $5
212 or $5, $1
213 xor $3, $8
214 xor $1, $8
215 mov $4, $6
216 or $5, $6
217 xor $6, $4
218 xor $7, $6
219 or $4, $7
220 xor $2, $7
223 define(<SBOX6>, <
224 mov $1, $5
225 xor $4, $5
226 mov $1, $6
227 and $4, $6
228 mov $1, $7
229 or $3, $7
230 or $2, $4
231 xor $3, $4
232 xor $2, $1
233 mov $2, $8
234 or $3, $8
235 xor $2, $3
236 and $5, $8
237 xor $3, $6
238 not $6
239 and $6, $5
240 and $6, $2
241 xor $8, $2
242 xor $4, $8
243 xor $2, $7
244 not $7
245 xor $7, $5
246 xor $1, $5
249 define(<SBOX7>, <
250 mov $1, $5
251 and $3, $5
252 mov $2, $8
253 or $5, $8 C t04
254 xor $3, $8
255 mov $4, $6
256 not $6 C t02
257 and $1, $6
258 xor $6, $8
259 mov $3, $6
260 or $8, $6
261 xor $1, $6
262 mov $1, $7
263 and $2, $7
264 xor $7, $3
265 or $4, $7
266 xor $7, $6
267 mov $2, $7
268 or $5, $7 C t04
269 and $8, $7
270 xor $6, $2
271 or $2, $7
272 xor $1, $7
273 xor $6, $5
274 not $4 C t02
275 or $4, $5
276 xor $3, $5
279 define(<LT>, <
280 rol <$>13, $1
281 rol <$>3, $3
282 xor $1, $2
283 xor $3, $2
284 mov $1, TMP32
285 shl <$>3, TMP32
286 xor $3, $4
287 xor TMP32, $4
288 rol $2
289 rol <$>7, $4
290 xor $2, $1
291 xor $4, $1
292 mov $2, TMP32
293 shl <$>7, TMP32
294 xor $4, $3
295 xor TMP32, $3
296 rol <$>5, $1
297 rol <$>22, $3
300 C Parallel operation on four blocks at a time.
302 C pnot instruction is missing. For lack of a spare register, XOR with
303 C constant in memory.
305 define(<PNOT>, <
306 pxor MINUS1, $1
309 define(<WSBOX0>, <
310 movdqa $2, $8 C y3 = x1 ^ x2
311 pxor $3, $8
312 movdqa $1, $5 C y0 = x0 | x3
313 por $4, $5
314 movdqa $1, $6 C y1 = x0 ^ x1
315 pxor $2, $6
316 pxor $5, $8 C y3 ^= y0
317 movdqa $3, $7 C y2 = x2 | y3
318 por $8, $7
319 pxor $4, $1 C x0 ^= x3
320 pand $4, $7 C y2 &= x3
321 pxor $3, $4 C x3 ^= x2
322 por $2, $3 C x2 |= x1
323 movdqa $6, $5 C y0 = y1 & x2
324 pand $3, $5
325 pxor $5, $7 C y2 ^= y0
326 pand $7, $5 C y0 &= y2
327 pxor $3, $5 C y0 ^= x2
328 pand $1, $2 C x1 &= x0
329 pxor $1, $5 C y0 ^= x0
330 PNOT($5) C y0 = ~y0
331 movdqa $5, $6 C y1 = y0 ^ x1
332 pxor $2, $6
333 pxor $4, $6 C y1 ^= x3
336 define(<WSBOX1>, <
337 movdqa $1, $6 C y1 = x0 | x3
338 por $4, $6
339 movdqa $3, $7 C y2 = x2 ^ x3
340 pxor $4, $7
341 movdqa $2, $5 C y0 = ~x1
342 PNOT($5)
343 movdqa $1, $8 C y3 = x0 ^ x2
344 pxor $3, $8
345 por $1, $5 C y0 |= x0
346 pand $4, $8 C y3 &= x3
347 movdqa $6, $1 C x0 = y1 & y2
348 pand $7, $1
349 por $2, $8 C y3 |= x1
350 pxor $5, $7 C y2 ^= y0
351 pxor $1, $8 C y3 ^= x0
352 movdqa $6, $1 C x0 = y1 ^ y3
353 pxor $8, $1
354 pxor $7, $1 C x0 ^= y2
355 movdqa $2, $6 C y1 = x1 & x3
356 pand $4, $6
357 pxor $1, $6 C y1 ^= x0
358 movdqa $6, $4 C x3 = y1 | y3
359 por $8, $4
360 PNOT($8) C y3 = ~y3
361 pand $4, $5 C y0 &= x3
362 pxor $3, $5 C y0 ^= x2
365 define(<WSBOX2>, <
366 movdqa $1, $7 C y2 = x1 | x2
367 por $3, $7
368 movdqa $1, $6
369 pxor $2, $6
370 movdqa $4, $8
371 pxor $7, $8
372 movdqa $6, $5
373 pxor $8, $5
374 por $1, $4
375 pxor $5, $3
376 movdqa $2, $1
377 pxor $3, $1
378 por $2, $3
379 pand $7, $1
380 pxor $3, $8
381 por $8, $6
382 pxor $1, $6
383 movdqa $8, $7
384 pxor $6, $7
385 pxor $2, $7
386 PNOT($8)
387 pxor $4, $7
390 define(<WSBOX3>, <
391 movdqa $1, $6
392 pxor $3, $6
393 movdqa $1, $5
394 por $4, $5
395 movdqa $1, $8
396 pand $4, $8
397 pand $5, $6
398 por $2, $8
399 movdqa $1, $7
400 pand $2, $7
401 por $3, $7
402 movdqa $4, $3
403 pxor $6, $3
404 pxor $8, $6
405 por $3, $1
406 pxor $2, $3
407 pand $4, $8
408 pxor $8, $5
409 movdqa $7, $8
410 pxor $3, $8
411 pxor $5, $7
412 por $8, $4
413 pand $4, $2
414 movdqa $1, $5
415 pxor $2, $5
418 define(<WSBOX4>, <
419 movdqa $1, $8
420 por $2, $8
421 movdqa $2, $7
422 por $3, $7
423 pxor $1, $7
424 pand $4, $8
425 movdqa $2, $5
426 pxor $4, $5
427 por $7, $4
428 pand $4, $1
429 pand $3, $2
430 pxor $8, $3
431 pxor $7, $8
432 por $2, $7
433 movdqa $8, $6
434 pand $5, $6
435 pxor $6, $7
436 pxor $5, $6
437 por $2, $6
438 pxor $1, $6
439 pand $4, $5
440 pxor $3, $5
441 PNOT($5)
444 define(<WSBOX5>, <
445 movdqa $2, $5
446 por $4, $5
447 pxor $3, $5
448 movdqa $2, $3
449 pxor $4, $3
450 movdqa $1, $7
451 pxor $3, $7
452 pand $3, $1
453 pxor $1, $5
454 movdqa $2, $8
455 por $7, $8
456 por $5, $2
457 PNOT($5)
458 por $5, $1
459 pxor $3, $8
460 pxor $1, $8
461 movdqa $4, $6
462 por $5, $6
463 pxor $6, $4
464 pxor $7, $6
465 por $4, $7
466 pxor $2, $7
469 define(<WSBOX6>, <
470 movdqa $1, $5
471 pxor $4, $5
472 movdqa $1, $6
473 pand $4, $6
474 movdqa $1, $7
475 por $3, $7
476 por $2, $4
477 pxor $3, $4
478 pxor $2, $1
479 movdqa $2, $8
480 por $3, $8
481 pxor $2, $3
482 pand $5, $8
483 pxor $3, $6
484 PNOT($6)
485 pand $6, $5
486 pand $6, $2
487 pxor $8, $2
488 pxor $4, $8
489 pxor $2, $7
490 PNOT($7)
491 pxor $7, $5
492 pxor $1, $5
495 define(<WSBOX7>, <
496 movdqa $1, $5
497 pand $3, $5
498 movdqa $2, $8
499 por $5, $8 C t04
500 pxor $3, $8
501 movdqa $4, $6
502 pandn $1, $6 C t02 implicit
503 pxor $6, $8
504 movdqa $3, $6
505 por $8, $6
506 pxor $1, $6
507 movdqa $1, $7
508 pand $2, $7
509 pxor $7, $3
510 por $4, $7
511 pxor $7, $6
512 movdqa $2, $7
513 por $5, $7 C t04
514 pand $8, $7
515 pxor $6, $2
516 por $2, $7
517 pxor $1, $7
518 pxor $6, $5
519 PNOT($4) C t02
520 por $4, $5
521 pxor $3, $5
524 C WLT(x0, x1, x2, x3)
525 define(<WLT>, <
526 WROL(13, $1)
527 WROL(3, $3)
528 pxor $1, $2
529 pxor $3, $2
530 movdqa $1, T0
531 pslld <$>3, T0
532 pxor $3, $4
533 pxor T0, $4
534 WROL(1, $2)
535 WROL(7, $4)
536 pxor $2, $1
537 pxor $4, $1
538 movdqa $2, T0
539 pslld <$>7, T0
540 pxor $4, $3
541 pxor T0, $3
542 WROL(5, $1)
543 WROL(22, $3)
546 .file "serpent-encrypt.asm"
548 C serpent_encrypt(struct serpent_context *ctx,
549 C unsigned length, uint8_t *dst,
550 C const uint8_t *src)
551 .text
552 ALIGN(16)
553 PROLOGUE(nettle_serpent_encrypt)
554 C save all registers that need to be saved
555 W64_ENTRY(4, 13)
556 push %rbx
557 push %rbp
558 push %r12
559 push %r13
560 push %r14
562 lea (SRC, N), SRC
563 lea (DST, N), DST
564 neg N
565 jz .Lend
567 C Point at the final subkey.
568 lea 512(CTX), CTX
570 cmp $-64, N
571 ja .Lblock_loop
573 pcmpeqd MINUS1, MINUS1
575 .Lwblock_loop:
576 movups (SRC, N), X0
577 movups 16(SRC, N), X1
578 movups 32(SRC, N), X2
579 movups 48(SRC, N), X3
581 WTRANSPOSE(X0, X1, X2, X3)
583 mov $-512, CNT
584 jmp .Lwround_start
586 ALIGN(16)
587 .Lwround_loop:
588 WLT(X0,X1,X2,X3)
589 .Lwround_start:
590 WKEYXOR(, X0,X1,X2,X3)
591 WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
592 WLT(Y0,Y1,Y2,Y3)
594 WKEYXOR(16, Y0,Y1,Y2,Y3)
595 WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
596 WLT(X0,X1,X2,X3)
598 WKEYXOR(32, X0,X1,X2,X3)
599 WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
600 WLT(Y0,Y1,Y2,Y3)
602 WKEYXOR(48, Y0,Y1,Y2,Y3)
603 WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
604 WLT(X0,X1,X2,X3)
606 WKEYXOR(64, X0,X1,X2,X3)
607 WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
608 WLT(Y0,Y1,Y2,Y3)
610 WKEYXOR(80, Y0,Y1,Y2,Y3)
611 WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
612 WLT(X0,X1,X2,X3)
614 WKEYXOR(96, X0,X1,X2,X3)
615 WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
616 WLT(Y0,Y1,Y2,Y3)
618 WKEYXOR(112, Y0,Y1,Y2,Y3)
619 WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
620 add $128, CNT
621 jnz .Lwround_loop
623 C FIXME: CNT known to be zero, no index register needed
624 WKEYXOR(, X0,X1,X2,X3)
626 WTRANSPOSE(X0,X1,X2,X3)
628 movups X0, (DST, N)
629 movups X1, 16(DST, N)
630 movups X2, 32(DST, N)
631 movups X3, 48(DST, N)
633 C FIXME: Adjust N, so we can use just jnc without an extra cmp.
634 add $64, N
635 jz .Lend
637 cmp $-64, N
638 jbe .Lwblock_loop
640 C The single-block loop here is slightly slower than the double-block
641 C loop in serpent-encrypt.c.
643 C FIXME: Should use non-sse2 code only if we have a single block left.
644 C With two or three blocks, it should be better to do them in
645 C parallell.
647 .Lblock_loop:
648 movl (SRC, N), x0
649 movl 4(SRC, N), x1
650 movl 8(SRC, N), x2
651 movl 12(SRC, N), x3
653 mov $-512, CNT
654 jmp .Lround_start
656 ALIGN(16)
657 .Lround_loop:
658 LT(x0,x1,x2,x3)
659 .Lround_start:
660 xor (CTX, CNT), x0
661 xor 4(CTX, CNT), x1
662 xor 8(CTX, CNT), x2
663 xor 12(CTX, CNT), x3
664 SBOX0(x0,x1,x2,x3, y0,y1,y2,y3)
665 LT(y0,y1,y2,y3)
667 xor 16(CTX, CNT), y0
668 xor 20(CTX, CNT), y1
669 xor 24(CTX, CNT), y2
670 xor 28(CTX, CNT), y3
671 SBOX1(y0,y1,y2,y3, x0,x1,x2,x3)
672 LT(x0,x1,x2,x3)
674 xor 32(CTX, CNT), x0
675 xor 36(CTX, CNT), x1
676 xor 40(CTX, CNT), x2
677 xor 44(CTX, CNT), x3
678 SBOX2(x0,x1,x2,x3, y0,y1,y2,y3)
679 LT(y0,y1,y2,y3)
681 xor 48(CTX, CNT), y0
682 xor 52(CTX, CNT), y1
683 xor 56(CTX, CNT), y2
684 xor 60(CTX, CNT), y3
685 SBOX3(y0,y1,y2,y3, x0,x1,x2,x3)
686 LT(x0,x1,x2,x3)
688 xor 64(CTX, CNT), x0
689 xor 68(CTX, CNT), x1
690 xor 72(CTX, CNT), x2
691 xor 76(CTX, CNT), x3
692 SBOX4(x0,x1,x2,x3, y0,y1,y2,y3)
693 LT(y0,y1,y2,y3)
695 xor 80(CTX, CNT), y0
696 xor 84(CTX, CNT), y1
697 xor 88(CTX, CNT), y2
698 xor 92(CTX, CNT), y3
699 SBOX5(y0,y1,y2,y3, x0,x1,x2,x3)
700 LT(x0,x1,x2,x3)
702 xor 96(CTX, CNT), x0
703 xor 100(CTX, CNT), x1
704 xor 104(CTX, CNT), x2
705 xor 108(CTX, CNT), x3
706 SBOX6(x0,x1,x2,x3, y0,y1,y2,y3)
707 LT(y0,y1,y2,y3)
709 xor 112(CTX, CNT), y0
710 xor 116(CTX, CNT), y1
711 xor 120(CTX, CNT), y2
712 xor 124(CTX, CNT), y3
713 SBOX7(y0,y1,y2,y3, x0,x1,x2,x3)
714 add $128, CNT
715 jnz .Lround_loop
717 C Apply final subkey.
718 xor (CTX, CNT), x0
719 xor 4(CTX, CNT), x1
720 xor 8(CTX, CNT), x2
721 xor 12(CTX, CNT), x3
723 movl x0, (DST, N)
724 movl x1, 4(DST, N)
725 movl x2, 8(DST, N)
726 movl x3, 12(DST, N)
727 add $16, N
728 jnc .Lblock_loop
730 .Lend:
731 pop %r14
732 pop %r13
733 pop %r12
734 pop %rbp
735 pop %rbx
736 W64_EXIT(4, 13)