C nettle, low-level cryptographic library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
define(<CTX>, <%rdi>)	C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>)	C Overlap D12
define(<W3>, <%xmm13>)	C Overlap D34
define(<T2>, <%r11>)	C Overlap D0
define(<T3>, <%r10>)	C Overlap C0
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<STATE>, <OFFSET($1)(CTX)>)
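C For illustration: STATE(0) expands to (CTX), i.e. (%rdi), and STATE(3)
C expands to 24(CTX), the byte offset of the fourth 64-bit state word.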
define(<SWAP64>, <pshufd	<$>0x4e,>)
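C SWAP64 expands to pshufd with immediate 0x4e (dword order 2,3,0,1),
C which swaps the two 64-bit halves of an xmm register; e.g.
C SWAP64 C34, C34 becomes pshufd $0x4e, C34, C34.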
define(<DIRECT_MOVQ>, <no>)
C MOVQ(src, dst), for moves between a general register and an xmm register.
ifelse(DIRECT_MOVQ, yes, <
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
define(<MOVQ>, <movd	$1, $2>)
C Moving via (cached) memory is generally faster.
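C The memory-based MOVQ definition is elided in this excerpt; a rough
C sketch, where the -8(%rsp) scratch slot is illustrative only and not
C taken from this file:
C
C   define(<MOVQ>, <
C	movq	$1, -8(%rsp)
C	movq	-8(%rsp), $2>)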
C ROTL64(rot, register, temp)
C The caller needs to or the two halves together.
	psrlq	<$>eval(64-$1), $3
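C Only the psrlq line of the ROTL64 body appears in this excerpt; the
C elided part presumably copies the operand and does the matching left
C shift, roughly:
C
C   define(<ROTL64>, <
C	movdqa	$2, $3
C	psllq	<$>$1, $2
C	psrlq	<$>eval(64-$1), $3>)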
	.file "sha3-permute.asm"
	C sha3_permute(struct sha3_state *ctx)
PROLOGUE(nettle_sha3_permute)
	movl	$24, XREG(COUNT)
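	C COUNT holds the 24 rounds of Keccak-f[1600]; the reversed .rc
	C table at the end suggests it is counted down towards zero.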
	movups	STATE(1), A0102
	movups	STATE(3), A0304
	movups	STATE(6), A0607
	movups	STATE(8), A0809
	movups	STATE(11), A1112
	movups	STATE(13), A1314
	movups	STATE(16), A1617
	movups	STATE(18), A1819
	movups	STATE(21), A2122
	movups	STATE(23), A2324
	C The theta step. Combine parity bits, then xor to state.
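	C (Here C0, ..., C4 are the Keccak column parities,
	C C[i] = A[i] ^ A[i+5] ^ A[i+10] ^ A[i+15] ^ A[i+20].)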
	C D0 = C4 ^ (C1 <<< 1)
	C D1 = C0 ^ (C2 <<< 1)
	C D2 = C1 ^ (C3 <<< 1)
	C D3 = C2 ^ (C4 <<< 1)
	C D4 = C3 ^ (C0 <<< 1)
	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
	C D34, and (C4, C0) in C34.
	C Notes on "unpack" instructions:
	C   punpckhqdq 01, 23 gives 31
	C   punpcklqdq 01, 23 gives 20
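	C That is, each register is written as its low,high qword pair and
	C operands are in AT&T order (source first): punpckhqdq src, dst sets
	C dst.low = dst.high and dst.high = src.high, while punpcklqdq keeps
	C dst.low and sets dst.high = src.low.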
	SWAP64	C34, C34	C Holds C4, C3
	punpcklqdq	C12, D12	C Holds C0, C1
	punpckhqdq	C34, D34	C Holds C2, C3
	punpcklqdq	D12, C34	C Holds C4, C0
	C Can use C12 as temporary
	pxor	W1, D12		C Done D12
	pxor	C12, D34	C Done D34
	C Theta step done, no C, D or W temporaries alive.
	C The rho and pi steps. When doing the permutations, also
	C transpose the matrix.
	C The combined permutation + transpose gives the following
	C cycles (rotation counts in parentheses):
	C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
	C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
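	C Reading the cycles: "1 <- 3(28)" means that the lane at index 3,
	C rotated left by its rho count of 28 bits, ends up at index 1 of
	C the transposed matrix.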
	C Do the 1,2,3,4 row. First rotate, then permute.
	por	A0102, W0	C rotl 1 (A01)
	por	W1, W2		C rotl 62 (A02)
	por	W1, A0102	C rotl 28 (A03)
	por	W1, A0304	C rotl 27 (A04)
	C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	rolq	$44, A05	C Done A05
	ROTL64(20, A0607, W2)
	punpckhqdq	W1, A0607	C Done A0607
	ROTL64(55, A0809, W1)
	punpcklqdq	W1, A0809	C Done A0809
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C     |10|    |11|12|    |13|14|
	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
	rolq	$43, A10	C Done A10
	punpcklqdq	A1314, A1112
	ROTL64(25, A1112, W1)
	por	W1, A1112	C Done A1112
	ROTL64(39, A1314, W2)
	ROTL64(10, W0, A1314)
	punpckhqdq	W2, A1314	C Done A1314
	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C     |15|    |16|17|    |18|19|
	C        \_________________/
	rolq	$21, A15	C Done A15
	ROTL64(45, A1617, W2)
	punpcklqdq	W0, A1617	C Done A1617
	ROTL64(15, A1819, W2)
	punpcklqdq	W1, A1819	C Done A1819
	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
	C     |20|    |21|22|    |23|24|
	rolq	$14, A20	C Done A20
	ROTL64(56, A2324, W1)
	punpcklqdq	W2, A2324	C Done A2324
	ROTL64(61, A2122, W1)
	punpcklqdq	W0, A2122	C Done A2122
	C The chi step. With the transposed matrix, applied independently
	xorq	(RC, COUNT, 8), A00
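	C The xorq above is the iota step: the round constant, indexed via
	C the descending COUNT into the reversed .rc table at RC, is xored
	C into lane A00.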
	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
	C and also copy to C12 and C34 while at it.
	C Transpose (A0607, A1112)
	punpcklqdq	A1112, A0607
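	C A0607 now holds the low qwords of the old A0607 and A1112; a
	C matching punpckhqdq (elided here) would collect the two high
	C qwords.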
	C Transpose (A1819, A2324)
	punpcklqdq	A2324, A1819
	C Transpose (A0809, A1314) and (A1617, A2122), and swap
	punpcklqdq	A2122, A0809
	punpckhqdq	A1617, A1314
	movups	A0102, STATE(1)
	movups	A0304, STATE(3)
	movups	A0607, STATE(6)
	movups	A0809, STATE(8)
	movups	A1112, STATE(11)
	movups	A1314, STATE(13)
	movups	A1617, STATE(16)
	movups	A1819, STATE(18)
	movups	A2122, STATE(21)
	movups	A2324, STATE(23)
EPILOGUE(nettle_sha3_permute)
.rc:	C In reverse order
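	C These are the 24 round constants of Keccak-f[1600], stored
	C last-round-first so that indexing with the descending COUNT
	C yields them in forward round order.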
	.quad	0x8000000080008008
	.quad	0x0000000080000001
	.quad	0x8000000000008080
	.quad	0x8000000080008081
	.quad	0x800000008000000A
	.quad	0x000000000000800A
	.quad	0x8000000000000080
	.quad	0x8000000000008002
	.quad	0x8000000000008003
	.quad	0x8000000000008089
	.quad	0x800000000000008B
	.quad	0x000000008000808B
	.quad	0x000000008000000A
	.quad	0x0000000080008009
	.quad	0x0000000000000088
	.quad	0x000000000000008A
	.quad	0x8000000000008009
	.quad	0x8000000080008081
	.quad	0x0000000080000001
	.quad	0x000000000000808B
	.quad	0x8000000080008000
	.quad	0x800000000000808A
	.quad	0x0000000000008082
	.quad	0x0000000000000001