2 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
3 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
4 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
7 * This work was inspired by the vectorized implementation of Dean Gaudet.
8 * Additional information on it can be found at:
9 * http://www.arctic.org/~dean/crypto/sha1.html
11 * It was improved upon with more efficient vectorization of the message
12 * scheduling. This implementation has also been optimized for all current and
13 * several future generations of Intel CPUs.
15 * See this article for more information about the implementation details:
16 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
18 * Copyright (C) 2010, Intel Corp.
19 * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
20 * Ronen Zohar <ronen.zohar@intel.com>
22 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
23 * Author: Mathias Krause <minipli@googlemail.com>
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published by
27 * the Free Software Foundation; either version 2 of the License, or
28 * (at your option) any later version.
31 #include <linux/linkage.h>
/*
 * Register aliases for the three incoming arguments (arg1..arg3, as
 * annotated on each line). CNT is multiplied by 64 (shl $6) inside
 * SHA1_VECTOR_ASM before it is used.
 */
33 #define CTX %rdi // arg1
34 #define BUF %rsi // arg2
35 #define CNT %rdx // arg3
/*
 * BUFFER_PTR is advanced by 64 bytes per block and compared against
 * BUFFER_END; once it reaches or passes BUFFER_END it is replaced by K_BASE
 * (cmovae) so that further loads use a dummy source.
 */
48 #define BUFFER_PTR %r10
49 #define BUFFER_END %r11
/* loaded from BSWAP_SHUFB_CTL; used as the pshufb/vpshufb shuffle control */
63 #define XMM_SHUFB_BSWAP %xmm10
65 /* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer */
/*
 * WK(t): memory operand for slot t of the workspace at %rsp; t is reduced
 * modulo 16 and scaled by 4 bytes per slot.
 */
66 #define WK(t) (((t) & 15) * 4)(%rsp)
/* how many rounds ahead of the current round RR invokes W_PRECALC */
67 #define W_PRECALC_AHEAD 16
70 * This macro implements the SHA-1 function's body for single 64-byte block
71 * param: function's name
73 .macro SHA1_VECTOR_ASM name
81 sub $64, %rsp # allocate workspace
82 and $~15, %rsp # align stack
87 shl $6, CNT # multiply by 64
91 lea K_XMM_AR(%rip), K_BASE
92 xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
94 SHA1_PIPELINED_MAIN_BODY
102 mov %r12, %rsp # deallocate workspace
113 * This macro implements 80 rounds of SHA-1 for one 64-byte block
115 .macro SHA1_PIPELINED_MAIN_BODY
125 .rept W_PRECALC_AHEAD
168 add $64, BUFFER_PTR # move to the next 64-byte block
169 cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
170 cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
184 UPDATE_HASH (HASH_PTR), A
185 UPDATE_HASH 4(HASH_PTR), B
186 UPDATE_HASH 8(HASH_PTR), C
187 UPDATE_HASH 12(HASH_PTR), D
188 UPDATE_HASH 16(HASH_PTR), E
191 cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
205 .macro RESTORE_RENAMED_REGS
206 # order is important (REG_C is where it should be)
213 .macro SWAP_REG_NAMES a, b
221 SWAP_REG_NAMES \c, T1
229 SWAP_REG_NAMES \d, T1
236 SWAP_REG_NAMES \c, T1
248 .macro UPDATE_HASH hash, val
254 * RR does two rounds of SHA-1 back to back with W[] pre-calc
255 * t1 = F(b, c, d); e += w(i)
256 * e += t1; b <<= 30; d += w(i+1);
264 .macro RR F, a, b, c, d, e, round
266 \F \b, \c, \d # t1 = F(b, c, d);
267 W_PRECALC (\round + W_PRECALC_AHEAD)
270 add WK(\round + 1), \d
273 W_PRECALC (\round + W_PRECALC_AHEAD + 1)
277 ror $7, \a # ((a <<r 5) >>r 7) => (a <<r 30)
280 SWAP_REG_NAMES \e, T1
286 # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
302 .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
303 .set i, ((\r) % 80) # pre-compute for the next iteration
310 .elseif (i < 80) // rounds 32-79
315 .macro W_PRECALC_RESET
327 .macro W_PRECALC_ROTATE
328 .set W_minus_32, W_minus_28
329 .set W_minus_28, W_minus_24
330 .set W_minus_24, W_minus_20
331 .set W_minus_20, W_minus_16
332 .set W_minus_16, W_minus_12
333 .set W_minus_12, W_minus_08
334 .set W_minus_08, W_minus_04
339 .macro W_PRECALC_SSSE3
341 .macro W_PRECALC_00_15
342 W_PRECALC_00_15_SSSE3
344 .macro W_PRECALC_16_31
345 W_PRECALC_16_31_SSSE3
347 .macro W_PRECALC_32_79
348 W_PRECALC_32_79_SSSE3
351 /* message scheduling pre-compute for rounds 0-15 */
352 .macro W_PRECALC_00_15_SSSE3
354 movdqu (i*4)(BUFFER_PTR), W_TMP1
355 .elseif ((i & 3) == 1)
356 pshufb XMM_SHUFB_BSWAP, W_TMP1
358 .elseif ((i & 3) == 2)
359 paddd (K_BASE), W_TMP1
360 .elseif ((i & 3) == 3)
361 movdqa W_TMP1, WK(i&~3)
366 /* message scheduling pre-compute for rounds 16-31
368 * - calculating last 32 w[i] values in 8 XMM registers
369 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
372 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
373 * dependency, but improves for 32-79
375 .macro W_PRECALC_16_31_SSSE3
376 # blended scheduling of vector and scalar instruction streams, one 4-wide
377 # vector iteration / 4 scalar rounds
380 palignr $8, W_minus_16, W # w[i-14]
381 movdqa W_minus_04, W_TMP1
382 psrldq $4, W_TMP1 # w[i-3]
384 .elseif ((i & 3) == 1)
385 pxor W_minus_16, W_TMP1
390 .elseif ((i & 3) == 2)
397 .elseif ((i & 3) == 3)
401 paddd K_XMM(K_BASE), W_TMP1
402 movdqa W_TMP1, WK(i&~3)
407 /* message scheduling pre-compute for rounds 32-79
409 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
410 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
411 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
413 .macro W_PRECALC_32_79_SSSE3
415 movdqa W_minus_04, W_TMP1
416 pxor W_minus_28, W # W is W_minus_32 before xor
417 palignr $8, W_minus_08, W_TMP1
418 .elseif ((i & 3) == 1)
422 .elseif ((i & 3) == 2)
426 .elseif ((i & 3) == 3)
428 paddd K_XMM(K_BASE), W_TMP1
429 movdqa W_TMP1, WK(i&~3)
434 .endm // W_PRECALC_SSSE3
/*
 * 32-bit constants K1..K4.
 * NOTE(review): the values match the four SHA-1 round constants from
 * FIPS 180; where they are emitted into the constant table (K_XMM_AR) is
 * not visible in this excerpt -- confirm against the data section.
 */
437 #define K1 0x5a827999
438 #define K2 0x6ed9eba1
439 #define K3 0x8f1bbcdc
440 #define K4 0xca62c1d6
465 /* SSSE3 optimized implementation:
466 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
467 * unsigned int rounds);
469 SHA1_VECTOR_ASM sha1_transform_ssse3
475 .purgem W_PRECALC_00_15
476 .macro W_PRECALC_00_15
479 .purgem W_PRECALC_16_31
480 .macro W_PRECALC_16_31
483 .purgem W_PRECALC_32_79
484 .macro W_PRECALC_32_79
488 .macro W_PRECALC_00_15_AVX
490 vmovdqu (i*4)(BUFFER_PTR), W_TMP1
491 .elseif ((i & 3) == 1)
492 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
493 .elseif ((i & 3) == 2)
494 vpaddd (K_BASE), W, W_TMP1
495 .elseif ((i & 3) == 3)
496 vmovdqa W_TMP1, WK(i&~3)
501 .macro W_PRECALC_16_31_AVX
503 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
504 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
505 vpxor W_minus_08, W, W
506 vpxor W_minus_16, W_TMP1, W_TMP1
507 .elseif ((i & 3) == 1)
509 vpslldq $12, W, W_TMP2
511 .elseif ((i & 3) == 2)
513 vpor W, W_TMP1, W_TMP1
515 vpsrld $30, W_TMP2, W_TMP2
516 .elseif ((i & 3) == 3)
517 vpxor W, W_TMP1, W_TMP1
518 vpxor W_TMP2, W_TMP1, W
519 vpaddd K_XMM(K_BASE), W, W_TMP1
520 vmovdqu W_TMP1, WK(i&~3)
525 .macro W_PRECALC_32_79_AVX
527 vpalignr $8, W_minus_08, W_minus_04, W_TMP1
528 vpxor W_minus_28, W, W # W is W_minus_32 before xor
529 .elseif ((i & 3) == 1)
530 vpxor W_minus_16, W_TMP1, W_TMP1
532 .elseif ((i & 3) == 2)
536 .elseif ((i & 3) == 3)
537 vpaddd K_XMM(K_BASE), W, W_TMP1
538 vmovdqu W_TMP1, WK(i&~3)
543 .endm // W_PRECALC_AVX
552 /* AVX optimized implementation:
553 * extern "C" void sha1_transform_avx(u32 *digest, const char *data,
554 * unsigned int rounds);
556 SHA1_VECTOR_ASM sha1_transform_avx