 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 * Copyright(c) 2014 Intel Corporation.
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 * Copyright(c) 2014 Intel Corporation.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 * Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 * for details.
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char *input, size_t num_blocks);
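 *
 * Illustrative caller sketch in C (an assumption for documentation only;
 * in the kernel, glue code normally splits the input into whole 64-byte
 * blocks and handles padding before calling in here):
 *
 *	uint32_t state[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
 *			      0x10325476, 0xC3D2E1F0 };	// FIPS 180 initial values
 *	// 'data' points to 'num_blocks' complete 64-byte blocks
 *	sha1_transform_avx2((int *)state, data, num_blocks);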
#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define	BUF	%rsi	/* arg2 */
#define	CNT	%rdx	/* arg3 */

#define	xmm_mov		vmovups
#define	avx2_zeroupper	vzeroupper

#define	BUFFER_PTR	%r10
#define	BUFFER_PTR2	%r13
#define	BUFFER_END	%r11

#define	PRECALC_BUF	%r14

#define	WY_TMP2		%ymm9

#define	YMM_SHUFB_BSWAP	%ymm10
 * Keep 2 iterations precalculated at a time:
 *   - 80 DWORDs per iteration * 2
#define W_SIZE		(80*2*2 + 16)

#define WK(t)		((((t) % 80) / 4) * 32 + ((t) % 4) * 4 + ((t) / 80) * 16)(WK_BUF)
#define PRECALC_WK(t)	((t) * 2 * 2)(PRECALC_BUF)
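/*
 * WK() indexing, worked through (derived from the macro above): each group
 * of 4 consecutive rounds owns a 32-byte slot in WK_BUF.  The low 16 bytes
 * of a slot hold the first block's four W+K dwords, the high 16 bytes the
 * second block's (selected by the ((t)/80)*16 term).  For example:
 *	WK(5)  = (5/4)*32 + (5%4)*4             = 36
 *	WK(85) = ((85%80)/4)*32 + (85%4)*4 + 16 = 52
 */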
.macro UPDATE_HASH hash, val

.macro PRECALC_RESET_WY

.macro PRECALC_ROTATE_WY

/* Define register aliases */
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28

.if (i == 0) # Initialize and rotate registers
/* message scheduling pre-compute for rounds 0-15 */
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	vmovdqu		((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
	vinsertf128	$1, (((i-1) * 2) + PRECALC_OFFSET)(BUFFER_PTR2),\
.elseif ((i & 7) == 2)
	vpshufb		YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
	vpaddd		K_XMM(K_BASE), WY, WY_TMP
.elseif ((i & 7) == 7)
	vmovdqu		WY_TMP, PRECALC_WK(i&~7)
 * message scheduling pre-compute for rounds 16-31
 * calculating the last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store them to memory
 * for a later load by the ALU add instruction
 * "brute force" vectorization for rounds 16-31 only,
 * due to the w[i]->w[i-3] dependency
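 *
 * For reference, the scalar form of the schedule being vectorized here
 * (an illustrative C sketch, not used by this file):
 *
 *	for (i = 16; i < 32; i++)
 *		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * where rol32() is a 32-bit rotate left; the w[i-3] term, only three
 * elements back, is the dependency referred to above.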
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	vpalignr	$8, WY_minus_16, WY_minus_12, WY
	vpsrldq		$4, WY_minus_04, WY_TMP		/* w[i-3] */
.elseif ((i & 7) == 1)
	vpxor		WY_minus_08, WY, WY
	vpxor		WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
	vpslldq		$12, WY, WY_TMP2
.elseif ((i & 7) == 3)
	vpslld		$1, WY, WY_TMP
.elseif ((i & 7) == 4)
	vpor		WY, WY_TMP, WY_TMP
	vpslld		$2, WY_TMP2, WY
.elseif ((i & 7) == 5)
	vpsrld		$30, WY_TMP2, WY_TMP2
	vpxor		WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
	vpxor		WY_TMP2, WY_TMP, WY
	vpaddd		K_XMM(K_BASE), WY, WY_TMP
	vmovdqu		WY_TMP, PRECALC_WK(i&~7)
 * In the SHA-1 specification:
 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * Instead, we use the equivalent:
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization,
 * since the w[i] => w[i-3] dependency is broken.
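 *
 * The two forms are equivalent: expand each term of the first recurrence
 * one more step with the same rule (valid once i >= 32) and use the fact
 * that rotation distributes over XOR.  The expansions of w[i-3], w[i-8],
 * w[i-14] and w[i-16] together contribute w[i-11], w[i-17], w[i-19],
 * w[i-22], w[i-24] and w[i-30] twice each, so those terms cancel, leaving
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2.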
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
	/* W is W_minus_32 before xor */
	vpxor		WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
	vpxor		WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
.elseif ((i & 7) == 4)
	vpslld		$2, WY, WY_TMP
.elseif ((i & 7) == 5)
.elseif ((i & 7) == 7)
	vpaddd		K_XMM(K_BASE), WY, WY_TMP
	vmovdqu		WY_TMP, PRECALC_WK(i&~7)
/* Macro relies on saved ROUND_Fx */
.elseif (\f == RND_F2)
.elseif (\f == RND_F3)

	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
	.set ROUND_FUNC, RND_F1

	rorx	$(32-30), B, B	/* b >>> 2 */

	RND_FUN ROUND_FUNC, \r

	.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
	.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
	.set ROUND_FUNC, RND_F2

	.set round_id, ((\r + 1) % 80)

	RND_FUN ROUND_FUNC, (\r + 1)
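/*
 * For reference, the standard SHA-1 stage schedule that this selector
 * follows (a plain C sketch, illustrative only):
 *
 *	if (t < 20)      f = (b & c) | (~b & d);		// F1, K1
 *	else if (t < 40) f = b ^ c ^ d;				// F2, K2
 *	else if (t < 60) f = (b & c) | (b & d) | (c & d);	// F3, K3
 *	else             f = b ^ c ^ d;				// F2 again, K4
 *
 * ROUND_FUNC is switched at round_id 18/38/58 rather than at the 20/40/60
 * stage boundaries because rounds are handled in pairs and each round
 * precalculates F for the round that follows it.
 */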
	andn	C, A, T1	/* ~b & d */
	lea	(RE,RTB), E	/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b >>> 2 for the next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	 * Calculate F for the next round:
	 * (b & c) ^ (~b & d)
	xor	T1, A			/* F1 = (b & c) ^ (~b & d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
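/*
 * Note: (b & c) ^ (~b & d) equals the textbook Ch(b,c,d) = (b & c) | (~b & d);
 * at most one of the two terms can be set for any given bit, so XOR and OR
 * give the same result.
 */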
	lea	(RE,RTB), E	/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E	/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round:
	 * (b & c) | (d & (b | c))

	add	TA, E			/* E += A >>> 5 */
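/*
 * Note: (b & c) | (d & (b | c)) equals the textbook
 * Maj(b,c,d) = (b & c) | (b & d) | (c & d), since d & (b | c) expands to
 * (d & b) | (d & c).
 */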
 * This macro implements 80 rounds of SHA-1 for multiple blocks with s/w pipelining.
.macro SHA1_PIPELINED_MAIN_BODY

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for the first 2 blocks

	xchg	WK_BUF, PRECALC_BUF

	 * The code loops through more than one block.
	 * We use the K_BASE value as a signal for the last block;
	 * it is set below by: cmovae K_BASE, BUFFER_PTR
	cmp	K_BASE, BUFFER_PTR

	add	$(2*64), BUFFER_PTR	/* move to the next odd 64-byte block */
	cmp	BUFFER_END, BUFFER_PTR	/* is the current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration */
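/*
 * The pointer bookkeeping above, as an illustrative C sketch (the names
 * mirror the register aliases; this is not part of the assembled code):
 *
 *	buffer_ptr += 2 * 64;			// skip over the block pair
 *	if (buffer_ptr >= buffer_end)
 *		buffer_ptr = k_base;		// sentinel: last iteration
 *	...
 *	if (buffer_ptr == k_base)		// was that the final block?
 *		goto done;
 */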
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */

	/* Process second block */
	 * 0+80,  2+80,  4+80,  6+80,  8+80
	 * 10+80, 12+80, 14+80, 16+80, 18+80

	 * 20+80, 22+80, 24+80, 26+80, 28+80
	 * 30+80, 32+80, 34+80, 36+80, 38+80

	 * 40+80, 42+80, 44+80, 46+80, 48+80
	 * 50+80, 52+80, 54+80, 56+80, 58+80
	add	$(2*64), BUFFER_PTR2	/* move to the next even 64-byte block */

	cmp	BUFFER_END, BUFFER_PTR2	/* is the current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration */
	 * 60+80, 62+80, 64+80, 66+80, 68+80
	 * 70+80, 72+80, 74+80, 76+80, 78+80

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */

	xchg	WK_BUF, PRECALC_BUF
 * This macro implements the SHA-1 function body for several 64-byte blocks.
 * param: function name
.macro SHA1_VECTOR_ASM name

RESERVE_STACK = (W_SIZE*4 + 8 + 24)

	sub	$RESERVE_STACK, %rsp

	lea	K_XMM_AR(%rip), K_BASE

	lea	64(BUF), BUFFER_PTR2

	shl	$6, CNT			/* mul by 64 */

	cmp	BUFFER_END, BUFFER_PTR2
	cmovae	K_BASE, BUFFER_PTR2
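/*
 * The setup above in C terms (an illustrative sketch; the exact BUFFER_END
 * computation is not shown here and is inferred from the shift and the
 * comparisons, so treat it as an assumption):
 *
 *	buffer_ptr2 = buf + 64;			// second block of each pair
 *	buffer_end  = buf + (num_blocks << 6);	// one past the input
 *	if (buffer_ptr2 >= buffer_end)		// single-block input:
 *		buffer_ptr2 = k_base;		// mark ptr2 done up front
 */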
	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	add	$RESERVE_STACK, %rsp

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
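/*
 * These are the standard FIPS 180 SHA-1 round constants,
 * floor(2^30 * sqrt(n)) for n = 2, 3, 5 and 10 respectively.
 */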
SHA1_VECTOR_ASM	sha1_transform_avx2