C nettle, low-level cryptographic library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
define(<CTX>, <%rdi>)	C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>)	C Overlap D12
define(<W3>, <%xmm13>)	C Overlap D34
define(<T2>, <%r11>)	C Overlap D0
define(<T3>, <%r10>)	C Overlap C0
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<STATE>, <OFFSET($1)(CTX)>)
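C For illustration: STATE(0) expands to (CTX), i.e. (%rdi), and STATE(3)
C expands to 24(CTX), the byte offset of the fourth 64-bit state word.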
define(<SWAP64>, <pshufd	<$>0x4e,>)
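C SWAP64 expands to pshufd with immediate 0x4e (dword order 2,3,0,1),
C which swaps the two 64-bit halves of an xmm register; e.g.
C SWAP64 C34, C34 becomes pshufd $0x4e, C34, C34.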
define(<DIRECT_MOVQ>, <no>)
C MOVQ(src, dst), for moves between a general register and an xmm register.
ifelse(DIRECT_MOVQ, yes, <
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
define(<MOVQ>, <movd	$1, $2>)
C Moving via (cached) memory is generally faster.
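C The memory-based MOVQ definition is elided in this excerpt; a rough
C sketch, where the -8(%rsp) scratch slot is illustrative only and not
C taken from this file:
C
C   define(<MOVQ>, <
C	movq	$1, -8(%rsp)
C	movq	-8(%rsp), $2>)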
C ROTL64(rot, register, temp)
C The caller needs to or the two halves together.
	psrlq	<$>eval(64-$1), $3
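C Only the psrlq line of the ROTL64 body appears in this excerpt; the
C elided part presumably copies the operand and does the matching left
C shift, roughly:
C
C   define(<ROTL64>, <
C	movdqa	$2, $3
C	psllq	<$>$1, $2
C	psrlq	<$>eval(64-$1), $3>)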
	.file "sha3-permute.asm"
	C sha3_permute(struct sha3_state *ctx)
PROLOGUE(nettle_sha3_permute)
	movl	$24, XREG(COUNT)
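	C COUNT holds the 24 rounds of Keccak-f[1600]; the reversed .rc
	C table at the end suggests it is counted down towards zero.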
	movups	STATE(1), A0102
	movups	STATE(3), A0304
	movups	STATE(6), A0607
	movups	STATE(8), A0809
	movups	STATE(11), A1112
	movups	STATE(13), A1314
	movups	STATE(16), A1617
	movups	STATE(18), A1819
	movups	STATE(21), A2122
	movups	STATE(23), A2324
	C The theta step. Combine parity bits, then xor to state.
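	C (Here C0, ..., C4 are the Keccak column parities,
	C C[i] = A[i] ^ A[i+5] ^ A[i+10] ^ A[i+15] ^ A[i+20].)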
	C D0 = C4 ^ (C1 <<< 1)
	C D1 = C0 ^ (C2 <<< 1)
	C D2 = C1 ^ (C3 <<< 1)
	C D3 = C2 ^ (C4 <<< 1)
	C D4 = C3 ^ (C0 <<< 1)
	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
	C D34, and (C4, C0) in C34.
	C Notes on "unpack" instructions:
	C   punpckhqdq 01, 23 gives 31
	C   punpcklqdq 01, 23 gives 20
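	C That is, each register is written as its low,high qword pair and
	C operands are in AT&T order (source first): punpckhqdq src, dst sets
	C dst.low = dst.high and dst.high = src.high, while punpcklqdq keeps
	C dst.low and sets dst.high = src.low.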
	SWAP64	C34, C34	C Holds C4, C3
	punpcklqdq	C12, D12	C Holds C0, C1
	punpckhqdq	C34, D34	C Holds C2, C3
	punpcklqdq	D12, C34	C Holds C4, C0
	C Can use C12 as temporary
	pxor	W1, D12		C Done D12
	pxor	C12, D34	C Done D34
	C Theta step done, no C, D or W temporaries alive.
	C The rho and pi steps. When doing the permutations, also
	C transpose the matrix.
	C The combined permutation + transpose gives the following
	C cycles (rotation counts in parentheses):
	C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
	C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
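	C Reading the cycles: "1 <- 3(28)" means that the lane at index 3,
	C rotated left by its rho count of 28 bits, ends up at index 1 of
	C the transposed matrix.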
	C Do the 1,2,3,4 row. First rotate, then permute.
	por	A0102, W0	C rotl 1 (A01)
	por	W1, W2		C rotl 62 (A02)
	por	W1, A0102	C rotl 28 (A03)
	por	W1, A0304	C rotl 27 (A04)
	C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	rolq	$44, A05	C Done A05
	ROTL64(20, A0607, W2)
	punpckhqdq	W1, A0607	C Done A0607
	ROTL64(55, A0809, W1)
	punpcklqdq	W1, A0809	C Done A0809
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C     |10|    |11|12|    |13|14|
	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
	rolq	$43, A10	C Done A10
	punpcklqdq	A1314, A1112
	ROTL64(25, A1112, W1)
	por	W1, A1112	C Done A1112
	ROTL64(39, A1314, W2)
	ROTL64(10, W0, A1314)
	punpckhqdq	W2, A1314	C Done A1314
	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C     |15|    |16|17|    |18|19|
	C        \_________________/
	rolq	$21, A15	C Done A15
	ROTL64(45, A1617, W2)
	punpcklqdq	W0, A1617	C Done A1617
	ROTL64(15, A1819, W2)
	punpcklqdq	W1, A1819	C Done A1819
	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
	C     |20|    |21|22|    |23|24|
	rolq	$14, A20	C Done A20
	ROTL64(56, A2324, W1)
	punpcklqdq	W2, A2324	C Done A2324
	ROTL64(61, A2122, W1)
	punpcklqdq	W0, A2122	C Done A2122
	C The chi step. With the transposed matrix, applied independently
	xorq	(RC, COUNT, 8), A00
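	C The xorq above is the iota step: the round constant, indexed via
	C the descending COUNT into the reversed .rc table at RC, is xored
	C into lane A00.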
	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
	C and also copy to C12 and C34 while at it.
	C Transpose (A0607, A1112)
	punpcklqdq	A1112, A0607
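	C A0607 now holds the low qwords of the old A0607 and A1112; a
	C matching punpckhqdq (elided here) would collect the two high
	C qwords.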
	C Transpose (A1819, A2324)
	punpcklqdq	A2324, A1819
	C Transpose (A0809, A1314) and (A1617, A2122), and swap
	punpcklqdq	A2122, A0809
	punpckhqdq	A1617, A1314
	movups	A0102, STATE(1)
	movups	A0304, STATE(3)
	movups	A0607, STATE(6)
	movups	A0809, STATE(8)
	movups	A1112, STATE(11)
	movups	A1314, STATE(13)
	movups	A1617, STATE(16)
	movups	A1819, STATE(18)
	movups	A2122, STATE(21)
	movups	A2324, STATE(23)
EPILOGUE(nettle_sha3_permute)
.rc:	C In reverse order
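	C These are the 24 round constants of Keccak-f[1600], stored
	C last-round-first so that indexing with the descending COUNT
	C yields them in forward round order.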
	.quad	0x8000000080008008
	.quad	0x0000000080000001
	.quad	0x8000000000008080
	.quad	0x8000000080008081
	.quad	0x800000008000000A
	.quad	0x000000000000800A
	.quad	0x8000000000000080
	.quad	0x8000000000008002
	.quad	0x8000000000008003
	.quad	0x8000000000008089
	.quad	0x800000000000008B
	.quad	0x000000008000808B
	.quad	0x000000008000000A
	.quad	0x0000000080008009
	.quad	0x0000000000000088
	.quad	0x000000000000008A
	.quad	0x8000000000008009
	.quad	0x8000000080008081
	.quad	0x0000000080000001
	.quad	0x000000000000808B
	.quad	0x8000000080008000
	.quad	0x800000000000808A
	.quad	0x0000000000008082
	.quad	0x0000000000000001