1 C nettle
, low-level cryptographics library
3 C Copyright
(C
) 2012 Niels Möller
5 C The nettle library is free software
; you can redistribute it and/or modify
6 C it under the terms of the GNU Lesser General
Public License as published by
7 C the Free Software Foundation
; either version 2.1 of the License, or (at your
8 C option
) any later version.
10 C The nettle library is distributed
in the hope that it will be useful
, but
11 C WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
12 C
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General
Public
13 C License for more details.
15 C You should have received a copy of the GNU Lesser General
Public License
16 C along with the nettle library
; see the file COPYING.LIB. If not, write to
17 C the Free Software Foundation
, Inc.
, 51 Franklin Street
, Fifth Floor
, Boston
,
C Symbolic register names used by the code below.
C LENGTH is bound to %rsi. NOTE(review): this matches the second
C integer argument register of the System V AMD64 ABI, in line with
C "length" being the second parameter of the prototype -- confirm how
C other ABIs are handled.
21 define
(<LENGTH>, <%rsi
>)
C M0101, M0110 and M0011 are lane masks held in %xmm6-%xmm8. They are
C filled in by the movd/pshufd sequence at the start of the function.
C NOTE(review): %xmm6-%xmm8 are callee-saved under the Microsoft x64
C ABI; confirm that the prologue saves them on that target.
32 define
(<M0101
>, <%xmm6
>)
33 define
(<M0110
>, <%xmm7
>)
34 define
(<M0011
>, <%xmm8
>)
C COUNT is bound to %rax. In the code visible here it is used as a
C scratch register, both when building the masks and in the final
C xor-and-store steps.
35 define
(<COUNT
>, <%rax
>)
C Pull in the shared x86_64 Salsa20 macro file. NOTE(review): QROUND is
C presumably defined there; its definition is not part of this view.
37 include_src
(<x86_64
/salsa20.m4
>)
39 C Possible
improvements:
41 C Do two blocks
(or more
) at a time
in parallel
, to avoid limitations
42 C due to data dependencies.
44 C Avoid redoing the permutation of the input for each block
(all but
45 C the two counter words are constant
). Could also keep the input
in
48 .file
"salsa20-crypt.asm"
50 C salsa20_crypt
(struct salsa20_ctx
*ctx
, unsigned
length,
51 C uint8_t
*dst
, const uint8_t
*src
)
C Entry point of nettle_salsa20_crypt. The code visible here builds
C three lane masks, applies QROUND to X0-X3 first as given and then
C after rotating the rows, undoes the rotation, and finally XORs data
C held in T64 with memory at (SRC, POS) and stores it to (DST, POS).
C NOTE(review): loop control, labels and the loads/stores of complete
C blocks are not part of this view; confirm against the full file.
54 PROLOGUE
(nettle_salsa20_crypt
)
C Build the lane masks. COUNT is used as a scratch register: it is
C moved into the low lane of M0101, and the three pshufd shuffles then
C copy that lane into the positions named by each mask (least
C significant lane first). M0101 is overwritten last because it is the
C source of the other two shuffles.
C NOTE(review): the instruction that sets COUNT is outside this view;
C presumably movd leaves the remaining lanes of M0101 zero -- confirm.
62 movd XREG
(COUNT
), M0101
63 pshufd
$0x09
, M0101
, M0011 C
01 01 00 00
64 pshufd
$0x41
, M0101
, M0110 C
01 00 00 01
65 pshufd
$0x22
, M0101
, M0101 C
01 00 01 00
73 C On input
, each xmm register is one row. We start with
80 C Diagrams are
in little
-endian order
, with least significant
word to
81 C the left. We rotate the columns
, to get instead
88 C The original rows are now diagonals.
97 QROUND
(X0
, X1
, X2
, X3
)
98 C For the row operations
, we first rotate the rows
, to get
105 C Now the original rows are turned
into columns.
(This
106 C SIMD hack is described
in djb
's papers).
C X1, X2 and X3 are rotated by one, two and three lanes respectively;
C X0 is left as it is.
108 pshufd $0x93, X1, X1 C 11 00 01 10 (least sign. left)
109 pshufd $0x4e, X2, X2 C 10 11 00 01
110 pshufd $0x39, X3, X3 C 01 10 11 00
C Same quarter-round macro as above, with X1 and X3 exchanged.
112 QROUND(X0, X3, X2, X1)
114 C Inverse rotation of the rows
C $0x39 undoes $0x93 and vice versa; $0x4e swaps the two 64-bit halves
C and is therefore its own inverse.
115 pshufd $0x39, X1, X1 C 01 10 11 00
116 pshufd $0x4e, X2, X2 C 10 11 00 01
117 pshufd $0x93, X3, X3 C 11 00 01 10
136 C Increment block counter
197 C This "movd" instruction should assemble to
198 C 66 49 0f 7e e0 movq %xmm4,%r8
199 C Apparently, assemblers treat movd and movq (with the
200 C arguments we use) in the same way, except for osx, which
C Copy the upper two lanes of T0 into its lower two lanes.
206 pshufd $0xee, T0, T0 C 10 11 10 11
208 C And this is also really a movq.
C XOR-and-store steps. Each one combines part of T64 with the source
C at (SRC, POS) and writes the result to (DST, POS). COUNT is used as
C scratch in the first two steps so that T64 itself is not modified.
C NOTE(review): XREG, WREG and LREG are defined outside this view;
C presumably they select progressively narrower views of a register.
C The branches and shifts between these steps are not visible here.
212 mov XREG(T64), XREG(COUNT)
213 xor (SRC, POS), XREG(COUNT)
214 mov XREG(COUNT), (DST, POS)
220 mov WREG(T64), WREG(COUNT)
221 xor (SRC, POS), WREG(COUNT)
222 mov WREG(COUNT), (DST, POS)
C Last step: T64 is modified in place, with no scratch copy.
228 xor (SRC, POS), LREG(T64)
229 mov LREG(T64), (DST, POS)
234 EPILOGUE(nettle_salsa20_crypt)