/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define XMM_SHUFB_BSWAP	%xmm10
/* we keep a 64-byte window (16 pre-calculated w[i]+K values) in a circular buffer */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
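/*
 * For illustration only: a rough C model of the WK() slot indexing above.
 * The workspace is 64 bytes (16 dwords) used as a circular buffer, so the
 * values for round t and round t+16 share a slot; because the pre-calculation
 * runs exactly W_PRECALC_AHEAD = 16 rounds ahead, a slot is always consumed
 * before it is overwritten.  Names below (wk_buf, wk_slot) are hypothetical,
 * not part of this file.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t wk_buf[16];	// 64-byte workspace, like the area at %rsp
 *
 *	// C equivalent of WK(t): rounds t and t+16 map to the same slot.
 *	static uint32_t *wk_slot(int t)
 *	{
 *		return &wk_buf[t & 15];
 *	}
 */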
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
SYM_TYPED_FUNC_START(\name)

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	shl	$6, CNT			# multiply by 64

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	mov	%rbp, %rsp		# deallocate workspace
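/*
 * For reference, a hedged C sketch of the pointer arithmetic set up here
 * (part of the prologue is elided above): "shl $6, CNT" turns the block count
 * into a byte length, and the end-of-input pointer is buf + blocks * 64.
 * The helper name buffer_end() is hypothetical.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	// End pointer used to detect the last 64-byte block.
 *	static const uint8_t *buffer_end(const uint8_t *buf, int blocks)
 *	{
 *		return buf + ((size_t)blocks << 6);	// blocks * 64
 *	}
 */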
/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY

	.rept W_PRECALC_AHEAD

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one, use a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), B
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
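/*
 * Illustrative C sketch (hypothetical names) of the per-block tail shown
 * above: the five UPDATE_HASH lines feed the working variables back into the
 * state, and the cmp/cmovae pair is a branchless way to redirect the advanced
 * buffer pointer at a readable dummy location (the K table) once the input is
 * exhausted, so the look-ahead pre-calculation never reads past the buffer.
 *
 *	#include <stdint.h>
 *
 *	struct sha1_block_state { uint32_t h[5]; };	// assumed layout: u32 state[5] first
 *
 *	// Feed-forward done by the five UPDATE_HASH invocations.
 *	static void update_hash(struct sha1_block_state *st, uint32_t a,
 *				uint32_t b, uint32_t c, uint32_t d, uint32_t e)
 *	{
 *		st->h[0] += a; st->h[1] += b; st->h[2] += c;
 *		st->h[3] += d; st->h[4] += e;
 *	}
 *
 *	// Branchless end-of-input handling, like add/cmp/cmovae above.
 *	static const uint8_t *next_block(const uint8_t *ptr,
 *					 const uint8_t *end,
 *					 const uint8_t *dummy)
 *	{
 *		ptr += 64;
 *		return (ptr >= end) ? dummy : ptr;
 *	}
 */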
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)

.macro SWAP_REG_NAMES  a, b

	SWAP_REG_NAMES	\c, T1

	SWAP_REG_NAMES	\d, T1

	SWAP_REG_NAMES	\c, T1

.macro UPDATE_HASH  hash, val
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 */
.macro RR  F, a, b, c, d, e, round

	\F	\b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC	(\round + W_PRECALC_AHEAD)

	add	WK(\round + 1), \d

	W_PRECALC	(\round + W_PRECALC_AHEAD + 1)

	ror	$7, \a			# (a <<r 5) >>r 7 => a <<r 30

	SWAP_REG_NAMES	\e, T1

	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
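/*
 * For reference, the same two rounds in plain scalar C (names are
 * illustrative; wk[] already contains w[i]+K, as in the WK() slots).  RR
 * computes this, but keeps the five variables in fixed registers and renames
 * them per the "rotate" comment above instead of moving data; the "ror $7"
 * composes with an earlier "rol 5" to give the required "rol 30"
 * (5 - 7 = -2 = 30 mod 32).
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	// Two consecutive SHA-1 rounds; f is the round-dependent function.
 *	static void two_rounds(uint32_t v[5], const uint32_t wk[2],
 *			       uint32_t (*f)(uint32_t, uint32_t, uint32_t))
 *	{
 *		for (int j = 0; j < 2; j++) {
 *			uint32_t t = rol32(v[0], 5) + f(v[1], v[2], v[3]) +
 *				     v[4] + wk[j];
 *			v[4] = v[3];
 *			v[3] = v[2];
 *			v[2] = rol32(v[1], 30);
 *			v[1] = v[0];
 *			v[0] = t;
 *		}
 *	}
 */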
	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
	.elseif (i < 80)		// rounds 32-79
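/*
 * Hedged C model of the round-index dispatch above (enum and function names
 * are hypothetical): schedule indices 80..80+W_PRECALC_AHEAD-1 wrap around
 * mod 80 and pre-compute w[0..15] of the *next* block while the last rounds
 * of the current block are still in flight.
 *
 *	enum precalc_path { PRECALC_00_15, PRECALC_16_31, PRECALC_32_79 };
 *
 *	// Which pre-calculation path handles schedule index r (0 <= r < 96)?
 *	static enum precalc_path precalc_path_for(int r)
 *	{
 *		if (r < 16 || r >= 80)		// 80..95 -> next block's w[0..15]
 *			return PRECALC_00_15;
 *		if (r < 32)
 *			return PRECALC_16_31;
 *		return PRECALC_32_79;
 *	}
 */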
.macro W_PRECALC_RESET

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
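/*
 * The .set chain above only renames assembler symbols; no XMM data moves.
 * A minimal C model of the idea (hypothetical names): eight registers form a
 * sliding window over the last 32 w[] values, and "rotating" just shifts
 * which logical name refers to which physical register.
 *
 *	#define NUM_W_REGS	8
 *
 *	static int w_base;	// register currently holding w[i-32..i-29]
 *
 *	// Physical register index for the W_minus_<n> alias, n in {32,28,...,4}.
 *	static int w_minus(int n)
 *	{
 *		return (w_base + (32 - n) / 4) % NUM_W_REGS;
 *	}
 *
 *	static void w_precalc_rotate(void)
 *	{
 *		w_base = (w_base + 1) % NUM_W_REGS;	// W_minus_32 <- W_minus_28, ...
 *	}
 */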
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
		movdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		pshufb	XMM_SHUFB_BSWAP, W_TMP1
	.elseif ((i & 3) == 2)
		paddd	(K_BASE), W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
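/*
 * Scalar C reference of what the four steps above compute for rounds 0-15
 * (illustrative only; load_be32() and precalc_00_15() are hypothetical
 * helpers): the block is read as big-endian words, hence the pshufb byte
 * swap, and K1 is folded in before the result is stored to the WK() slots.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t load_be32(const uint8_t *p)
 *	{
 *		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
 *		       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
 *	}
 *
 *	// Rounds 0-15: w[i] is the i-th big-endian word of the block,
 *	// stored with the round constant already added.
 *	static void precalc_00_15(const uint8_t block[64], uint32_t wk[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			wk[i] = load_be32(block + 4 * i) + 0x5a827999u;	// + K1
 *	}
 */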
/* message scheduling pre-compute for rounds 16-31
 *
 * - keep the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, to be loaded
 *   later by the scalar ALU add instruction
 *
 * vectorizing rounds 16-31 takes some extra work ("heavy lifting") because of
 * the w[i]->w[i-3] dependency, but it pays off for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams,
	# one 4-wide vector iteration per 4 scalar rounds

		palignr	$8, W_minus_16, W	# w[i-14]
		movdqa	W_minus_04, W_TMP1
		psrldq	$4, W_TMP1		# w[i-3]

	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W_TMP1

	.elseif ((i & 3) == 2)

	.elseif ((i & 3) == 3)

		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
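/*
 * The underlying scalar recurrence, for reference (hypothetical helper
 * names).  Because w[i] depends on w[i-3], the last element of each group of
 * four needs a value produced in the same group, which is what the extra
 * shift/rotate juggling in the vector code above works around.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_1(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 *
 *	// Standard SHA-1 message schedule, here for i = 16..31.
 *	static void expand_16_31(uint32_t w[32])
 *	{
 *		for (int i = 16; i < 32; i++)
 *			w[i] = rol32_1(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16]);
 *	}
 */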
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification:
 *	w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent form
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i]=>w[i-3]
 * dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
		movdqa	W_minus_04, W_TMP1
		pxor	W_minus_28, W		# W is W_minus_32 before xor
		palignr	$8, W_minus_08, W_TMP1

	.elseif ((i & 3) == 1)

	.elseif ((i & 3) == 2)

	.elseif ((i & 3) == 3)
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)

.endm		// W_PRECALC_SSSE3
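/*
 * Why the i-6/i-16/i-28/i-32 form is equivalent: expand each term on the
 * right-hand side of the standard recurrence once more; six of the resulting
 * terms appear twice and cancel under xor, leaving
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2	for i >= 32
 * With the nearest input now w[i-6], four consecutive values have no
 * intra-group dependency.  A scalar C sketch (hypothetical helper names):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_2(uint32_t x)
 *	{
 *		return (x << 2) | (x >> 30);
 *	}
 *
 *	// Groups of four can be computed independently - one 4-wide vector step.
 *	static void expand_32_79(uint32_t w[80])
 *	{
 *		for (int i = 32; i < 80; i++)
 *			w[i] = rol32_2(w[i - 6] ^ w[i - 16] ^ w[i - 28] ^ w[i - 32]);
 *	}
 */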
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6
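/*
 * Scalar view of the constant selection (illustrative; sha1_k() is a
 * hypothetical helper): each K covers 20 rounds.  In the vector code the same
 * constants live in the K_XMM_AR table addressed through K_BASE, and the
 * K_XMM offset (set elsewhere in W_PRECALC) picks the 16-byte lane for the
 * current group of rounds.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t sha1_k(int round)
 *	{
 *		if (round < 20)
 *			return 0x5a827999u;	// K1
 *		if (round < 40)
 *			return 0x6ed9eba1u;	// K2
 *		if (round < 60)
 *			return 0x8f1bbcdcu;	// K3
 *		return 0xca62c1d6u;		// K4
 *	}
 */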
/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3
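/*
 * Hedged usage sketch, not part of this file: a caller would look roughly
 * like the glue code below.  Header names and the exact struct sha1_state
 * definition are assumptions here; the hard requirements, per the note above,
 * are that the struct begins with u32 state[5] and that any SSE/AVX use in
 * kernel mode is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 *
 *	#include <linux/linkage.h>
 *	#include <linux/types.h>
 *	#include <asm/fpu/api.h>
 *	#include <crypto/sha1.h>
 *
 *	asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
 *					     const u8 *data, int blocks);
 *
 *	static void sha1_ssse3_blocks(struct sha1_state *state,
 *				      const u8 *data, int blocks)
 *	{
 *		kernel_fpu_begin();	// XMM registers are clobbered below
 *		sha1_transform_ssse3(state, data, blocks);
 *		kernel_fpu_end();
 *	}
 */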
.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79

.macro W_PRECALC_00_15_AVX
		vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
	.elseif ((i & 3) == 2)
		vpaddd	(K_BASE), W, W_TMP1
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
.macro W_PRECALC_16_31_AVX
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
		vpxor	W_minus_08, W, W
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 1)
		vpslldq	$12, W, W_TMP2
	.elseif ((i & 3) == 2)
		vpor	W, W_TMP1, W_TMP1
		vpsrld	$30, W_TMP2, W_TMP2
	.elseif ((i & 3) == 3)
		vpxor	W, W_TMP1, W_TMP1
		vpxor	W_TMP2, W_TMP1, W
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
.macro W_PRECALC_32_79_AVX
		vpalignr $8, W_minus_08, W_minus_04, W_TMP1
		vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
	.elseif ((i & 3) == 1)
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)

.endm		// W_PRECALC_AVX
/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				      const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM	sha1_transform_avx