1 C nettle, low-level cryptographics library
3 C Copyright (C) 2011 Niels Möller
5 C The nettle library is free software; you can redistribute it and/or modify
6 C it under the terms of the GNU Lesser General Public License as published by
7 C the Free Software Foundation; either version 2.1 of the License, or (at your
8 C option) any later version.
10 C The nettle library is distributed in the hope that it will be useful, but
11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13 C License for more details.
15 C You should have received a copy of the GNU Lesser General Public License
16 C along with the nettle library; see the file COPYING.LIB. If not, write to
17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 include_src
(<x86_64
/serpent.m4
>)
24 C Single block serpent state, two copies
35 C Quadruple block serpent state, two copies
46 define
(<MINUS1
>, <%xmm8
>)
48 define
(<T1
>, <%xmm10
>)
49 define
(<T2
>, <%xmm11
>)
50 define
(<T3
>, <%xmm12
>)
59 define
(<TMP32
>, <%r14d
>)
61 C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
64 mov $2, $8 C y3
= x1 ^ x2
66 mov $1, $5 C y0
= x0 | x3
68 mov $1, $6 C y1
= x0 ^ x1
71 mov $3, $7 C y2
= x2 | y3
77 mov $6, $5 C y0
= y1
& x2
85 mov $5, $6 C y1
= y0 ^ x1
91 mov $1, $6 C y1
= x0 | x3
93 mov $3, $7 C y2
= x2 ^ x3
97 mov $1, $8 C y3
= x0 ^ x2
100 and $4, $8 C y3
&= x3
101 mov $6, $1 C x0
= y1
& y2
104 xor $5, $7 C y2 ^
= y0
105 xor $1, $8 C y3 ^
= x0
106 mov $6, $1 C x0
= y1 ^ y3
108 xor $7, $1 C x0 ^
= y2
109 mov $2, $6 C y1
= x1
& x3
111 xor $1, $6 C y1 ^
= x0
112 mov $6, $4 C x3
= y1 | y3
115 and $4, $5 C y0
&= x3
116 xor $3, $5 C y0 ^
= x2
120 mov $1, $7 C y2
= x1 | x2
300 C Parallel operation on four blocks at a time.
302 C pnot instruction is missing. For lack of a spare register, XOR with
303 C constant in memory.
310 movdqa
$2, $8 C y3
= x1 ^ x2
312 movdqa
$1, $5 C y0
= x0 | x3
314 movdqa
$1, $6 C y1
= x0 ^ x1
316 pxor
$5, $8 C y3 ^
= y0
317 movdqa
$3, $7 C y2
= x2 | y3
319 pxor
$4, $1 C x0 ^
= x3
320 pand
$4, $7 C y2
&= x3
321 pxor
$3, $4 C x3 ^
= x2
322 por
$2, $3 C x2 |
= x1
323 movdqa
$6, $5 C y0
= y1
& x2
325 pxor
$5, $7 C y2 ^
= y0
326 pand
$7, $5 C y0
&= y2
327 pxor
$3, $5 C y0 ^
= x2
328 pand
$1, $2 C x1
&= x0
329 pxor
$1, $5 C y0 ^
= x0
331 movdqa
$5, $6 C y1
= y0 ^ x1
333 pxor
$4, $6 C y1 ^
= x3
337 movdqa
$1, $6 C y1
= x0 | x3
339 movdqa
$3, $7 C y2
= x2 ^ x3
341 movdqa
$2, $5 C y0
= ~x1
343 movdqa
$1, $8 C y3
= x0 ^ x2
345 por
$1, $5 C y0 |
= x0
346 pand
$4, $8 C y3
&= x3
347 movdqa
$6, $1 C x0
= y1
& y2
349 por
$2, $8 C y3 |
= x1
350 pxor
$5, $7 C y2 ^
= y0
351 pxor
$1, $8 C y3 ^
= x0
352 movdqa
$6, $1 C x0
= y1 ^ y3
354 pxor
$7, $1 C x0 ^
= y2
355 movdqa
$2, $6 C y1
= x1
& x3
357 pxor
$1, $6 C y1 ^
= x0
358 movdqa
$6, $4 C x3
= y1 | y3
361 pand
$4, $5 C y0
&= x3
362 pxor
$3, $5 C y0 ^
= x2
366 movdqa
$1, $7 C y2
= x1 | x2
502 pandn
$1, $6 C t02 implicit
524 C WLT(x0, x1, x2, x3)
546 .file
"serpent-encrypt.asm"
548 C serpent_encrypt(struct serpent_context *ctx,
549 C unsigned length, uint8_t *dst,
550 C const uint8_t *src)
553 PROLOGUE
(nettle_serpent_encrypt
)
554 C save all registers that need to be saved
567 C Point at the final subkey.
573 pcmpeqd MINUS1
, MINUS1
577 movups
16(SRC
, N
), X1
578 movups
32(SRC
, N
), X2
579 movups
48(SRC
, N
), X3
581 WTRANSPOSE
(X0
, X1
, X2
, X3
)
590 WKEYXOR
(, X0
,X1
,X2
,X3
)
591 WSBOX0
(X0
,X1
,X2
,X3
, Y0
,Y1
,Y2
,Y3
)
594 WKEYXOR
(16, Y0
,Y1
,Y2
,Y3
)
595 WSBOX1
(Y0
,Y1
,Y2
,Y3
, X0
,X1
,X2
,X3
)
598 WKEYXOR
(32, X0
,X1
,X2
,X3
)
599 WSBOX2
(X0
,X1
,X2
,X3
, Y0
,Y1
,Y2
,Y3
)
602 WKEYXOR
(48, Y0
,Y1
,Y2
,Y3
)
603 WSBOX3
(Y0
,Y1
,Y2
,Y3
, X0
,X1
,X2
,X3
)
606 WKEYXOR
(64, X0
,X1
,X2
,X3
)
607 WSBOX4
(X0
,X1
,X2
,X3
, Y0
,Y1
,Y2
,Y3
)
610 WKEYXOR
(80, Y0
,Y1
,Y2
,Y3
)
611 WSBOX5
(Y0
,Y1
,Y2
,Y3
, X0
,X1
,X2
,X3
)
614 WKEYXOR
(96, X0
,X1
,X2
,X3
)
615 WSBOX6
(X0
,X1
,X2
,X3
, Y0
,Y1
,Y2
,Y3
)
618 WKEYXOR
(112, Y0
,Y1
,Y2
,Y3
)
619 WSBOX7
(Y0
,Y1
,Y2
,Y3
, X0
,X1
,X2
,X3
)
623 C FIXME: CNT known to be zero, no index register needed
624 WKEYXOR
(, X0
,X1
,X2
,X3
)
626 WTRANSPOSE
(X0
,X1
,X2
,X3
)
629 movups X1
, 16(DST
, N
)
630 movups X2
, 32(DST
, N
)
631 movups X3
, 48(DST
, N
)
633 C FIXME: Adjust N, so we can use just jnc without an extra cmp.
640 C The single-block loop here is slightly slower than the double-block
641 C loop in serpent-encrypt.c.
643 C FIXME: Should use non-sse2 code only if we have a single block left.
644 C With two or three blocks, it should be better to do them in
664 SBOX0
(x0
,x1
,x2
,x3
, y0
,y1
,y2
,y3
)
671 SBOX1
(y0
,y1
,y2
,y3
, x0
,x1
,x2
,x3
)
678 SBOX2
(x0
,x1
,x2
,x3
, y0
,y1
,y2
,y3
)
685 SBOX3
(y0
,y1
,y2
,y3
, x0
,x1
,x2
,x3
)
692 SBOX4
(x0
,x1
,x2
,x3
, y0
,y1
,y2
,y3
)
699 SBOX5
(y0
,y1
,y2
,y3
, x0
,x1
,x2
,x3
)
703 xor 100(CTX
, CNT
), x1
704 xor 104(CTX
, CNT
), x2
705 xor 108(CTX
, CNT
), x3
706 SBOX6
(x0
,x1
,x2
,x3
, y0
,y1
,y2
,y3
)
709 xor 112(CTX
, CNT
), y0
710 xor 116(CTX
, CNT
), y1
711 xor 120(CTX
, CNT
), y2
712 xor 124(CTX
, CNT
), y3
713 SBOX7
(y0
,y1
,y2
,y3
, x0
,x1
,x2
,x3
)
717 C Apply final subkey.