2 * Multi-buffer SHA1 algorithm hash compute routine
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
9 * Copyright(c) 2014 Intel Corporation.
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * Contact Information:
21 * James Guilford <james.guilford@intel.com>
22 * Tim Chen <tim.c.chen@linux.intel.com>
26 * Copyright(c) 2014 Intel Corporation.
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 #include <linux/linkage.h>
56 #include "sha1_mb_mgr_datastruct.S"
58 ## code to compute oct (eight-lane) SHA1 using AVX2 256-bit ymm registers
59 ## outer calling routine takes care of save and restore of XMM registers
61 ## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
63 ## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
64 ## Linux preserves: rdi rbp r8
69 # TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
70 # "transpose" data in {r0...r7} using temps {t0...t1}
71 # Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
72 # r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
73 # r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
74 # r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
75 # r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
76 # r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
77 # r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
78 # r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
79 # r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
81 # Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
82 # r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
83 # r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
84 # r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
85 # r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
86 # r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
87 # r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
88 # r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
89 # r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
92 .macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
# Transposes an 8x8 matrix of 32-bit elements held in rows r0..r7, in place.
# Both temporaries t0 and t1 are overwritten.
# Stage 1 works inside each 128-bit lane: vshufps interleaves 32-bit elements
# of two rows, and the resulting element order is noted on each line.
93 # process top half (r0..r3) {a...d}
94 vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
95 vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
96 vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
97 vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
98 vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
99 vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
100 vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
101 vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
103 # use r2 in place of t0
104 # process bottom half (r4..r7) {e...h}
105 vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
106 vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
107 vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
108 vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
109 vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
110 vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
111 vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
112 vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
# Stage 2 merges 128-bit halves of one a..d result and one e..h result.
# Selector 0x02 joins the two low halves and selector 0x13 the two high
# halves; in both cases the a..d half lands in the low 128 bits of the output.
114 vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
115 vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
116 vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
117 vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
118 vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
119 vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
120 vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
121 vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
125 ## Magic functions defined in FIPS 180-1
127 # macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D)))
128 .macro MAGIC_F0 regF regB regC regD regT
# Computes the choose function into regF; regB, regC and regD are only read.
# The regT parameter is accepted for a uniform signature but is not used here.
129 vpxor \regD, \regC, \regF # F = C ^ D
130 vpand \regB, \regF, \regF # F = B & (C ^ D)
131 vpxor \regD, \regF, \regF # F = D ^ (B & (C ^ D))
134 # macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D)
135 .macro MAGIC_F1 regF regB regC regD regT
# Computes the three-way XOR (parity) into regF; regB, regC, regD are only read.
# The regT parameter is accepted for a uniform signature but is not used here.
136 vpxor \regC, \regD, \regF # F = D ^ C
137 vpxor \regB, \regF, \regF # F = B ^ C ^ D
140 # macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D))
141 .macro MAGIC_F2 regF regB regC regD regT
# Computes the majority function into regF using the equivalent form
# ((B | C) & D) | (B & C). regT is overwritten as scratch.
142 vpor \regC, \regB, \regF # F = B | C
143 vpand \regC, \regB, \regT # T = B & C
144 vpand \regD, \regF, \regF # F = (B | C) & D
145 vpor \regT, \regF, \regF # F = ((B | C) & D) | (B & C)
148 # macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D)
149 .macro MAGIC_F3 regF regB regC regD regT
# Same function as MAGIC_F1, so this simply forwards all operands to it.
150 MAGIC_F1 \regF,\regB,\regC,\regD,\regT
153 # PROLD reg, imm, tmp
154 .macro PROLD reg imm tmp
# Rotates every 32-bit element of reg left by imm bits, in place.
# The tmp register is overwritten.
155 vpsrld $(32-\imm), \reg, \tmp # tmp = bits that wrap around to the low end
156 vpslld $\imm, \reg, \reg # reg = value shifted left by imm
157 vpor \tmp, \reg, \reg # combine both parts into the rotated value
160 .macro PROLD_nd reg imm tmp src
# Non-destructive variant of PROLD: reg receives src rotated left by imm bits
# per 32-bit element. The src register is only read, so it keeps its value as
# long as it is a different register from reg and tmp. tmp is overwritten.
161 vpsrld $(32-\imm), \src, \tmp # tmp = bits that wrap around to the low end
162 vpslld $\imm, \src, \reg # reg = src shifted left by imm
163 vpor \tmp, \reg, \reg # combine both parts into the rotated value
166 .macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
# One SHA-1 round using a message word already stored on the stack.
# Net effect: E += K + W[memW] + rol(A, 5) + MAGIC(B, C, D) and B = rol(B, 30).
# regT and regF are scratch. The renaming of A..E between rounds is not done
# here and has to be handled by the caller.
167 vpaddd \immCNT, \regE, \regE # E += round constant
168 vpaddd \memW*32(%rsp), \regE, \regE # E += W[memW], 32-byte slots based at rsp
169 PROLD_nd \regT, 5, \regF, \regA # T = rol(A, 5), with F used as the temporary
170 vpaddd \regT, \regE, \regE # E += rol(A, 5)
171 \MAGIC \regF, \regB, \regC, \regD, \regT
172 PROLD \regB, 30, \regT # B = rol(B, 30); must follow MAGIC, which reads B
173 vpaddd \regF, \regE, \regE # E += MAGIC(B, C, D)
176 .macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
# One SHA-1 round that also derives the message word for slot memW from a
# ring of 16 stack slots of 32 bytes each (slot index is masked with 15).
# W14, W15 and W16 are register names defined outside this macro.
# NOTE(review): the embedded source line numbers skip 180, 186 and 188-190.
# As listed, W14 is loaded but never combined into W16, and the vpsrld/vpor
# pair ORs bit 31 into bit 0 without a matching left shift, so the visible
# lines alone do not form a rotate-left by 1. Confirm against the complete file.
177 vpaddd \immCNT, \regE, \regE # E += round constant
178 offset = ((\memW - 14) & 15) * 32
179 vmovdqu offset(%rsp), W14 # W14 = ring slot (memW - 14)
181 offset = ((\memW - 8) & 15) * 32
182 vpxor offset(%rsp), W16, W16 # W16 ^= ring slot (memW - 8)
183 offset = ((\memW - 3) & 15) * 32
184 vpxor offset(%rsp), W16, W16 # W16 ^= ring slot (memW - 3)
185 vpsrld $(32-1), W16, \regF # F = top bit of each element moved to bit 0
187 vpor W16, \regF, \regF # F |= W16
191 offset = ((\memW - 0) & 15) * 32
192 vmovdqu \regF, offset(%rsp) # store the new word into ring slot memW
193 vpaddd \regF, \regE, \regE # E += new message word
194 PROLD_nd \regT, 5, \regF, \regA # T = rol(A, 5), with F used as the temporary
195 vpaddd \regT, \regE, \regE # E += rol(A, 5)
196 \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
197 PROLD \regB,30, \regT # B = rol(B, 30); must follow MAGIC, which reads B
198 vpaddd \regF, \regE, \regE # E += MAGIC(B, C, D)
201 ########################################################################
202 ########################################################################
203 ########################################################################
205 ## FRAMESZ plus pushes must be an odd multiple of 8
206 YMM_SAVE = (15-15)*32 # evaluates to 0; presumably the size of a ymm save area
207 FRAMESZ = 32*16 + YMM_SAVE # 16 slots of 32 bytes (512 bytes) plus YMM_SAVE
208 _YMM = FRAMESZ - YMM_SAVE # equals 512 while YMM_SAVE is 0
210 #define VMOVPS vmovups
288 # 8 streams x 5 32bit words per digest x 4 bytes per word
289 #define DIGEST_SIZE (8*5*4)
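293 # void sha1_x8_avx2(void *args, UINT32 size) -- exact type of args is declared elsewhere
294 # arg 1 : pointer to the argument block: digest rows (5 x 32 bytes) at offset 0, array[8] of pointers to input data at offset _data_ptr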
295 # arg 2 : size (in blocks) ;; assumed to be >= 1
299 # save callee-saved clobbered registers to comply with C function ABI
# NOTE(review): the entry label and the instructions for the register save and
# stack alignment steps do not appear in this listing; confirm in the full file.
309 #align rsp to 32 Bytes
312 ## Initialize digests
# Load the five digest rows A..E, 32 bytes each, from the start of arg1.
313 vmovdqu 0*32(arg1), A
314 vmovdqu 1*32(arg1), B
315 vmovdqu 2*32(arg1), C
316 vmovdqu 3*32(arg1), D
317 vmovdqu 4*32(arg1), E
319 ## transpose input onto stack
# Fetch the eight per-stream input pointers stored at arg1 + _data_ptr.
320 mov _data_ptr+0*8(arg1),inp0
321 mov _data_ptr+1*8(arg1),inp1
322 mov _data_ptr+2*8(arg1),inp2
323 mov _data_ptr+3*8(arg1),inp3
324 mov _data_ptr+4*8(arg1),inp4
325 mov _data_ptr+5*8(arg1),inp5
326 mov _data_ptr+6*8(arg1),inp6
327 mov _data_ptr+7*8(arg1),inp7
# F holds the byte-flip mask from here on.
# NOTE(review): no instruction that applies the mask is visible in this listing.
331 vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
# Read 32 bytes from each of the eight streams at byte offset IDX.
334 VMOVPS (inp0, IDX), T0
335 VMOVPS (inp1, IDX), T1
336 VMOVPS (inp2, IDX), T2
337 VMOVPS (inp3, IDX), T3
338 VMOVPS (inp4, IDX), T4
339 VMOVPS (inp5, IDX), T5
340 VMOVPS (inp6, IDX), T6
341 VMOVPS (inp7, IDX), T7
# After the transpose, row k holds 32-bit word k of this chunk from all streams.
343 TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
# Store the eight rows into stack slots I*8 .. I*8+7 (32 bytes per slot).
# I is an assembly-time symbol; the directive that steps it is not visible here.
345 vmovdqu T0, (I*8)*32(%rsp)
347 vmovdqu T1, (I*8+1)*32(%rsp)
349 vmovdqu T2, (I*8+2)*32(%rsp)
351 vmovdqu T3, (I*8+3)*32(%rsp)
353 vmovdqu T4, (I*8+4)*32(%rsp)
355 vmovdqu T5, (I*8+5)*32(%rsp)
357 vmovdqu T6, (I*8+6)*32(%rsp)
359 vmovdqu T7, (I*8+7)*32(%rsp)
371 ## perform 0-79 steps
# Each group below loads its round constant into K and then expands a step
# macro. Presumably each macro line sits inside a repeat block that steps I
# and renames A..E between rounds; those directives are not visible here.
373 vmovdqu K00_19(%rip), K
377 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
# Preload W16 and W15 from ring slots 0 and 1 before the first scheduled round.
383 vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
384 vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
386 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
392 vmovdqu K20_39(%rip), K
394 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
400 vmovdqu K40_59(%rip), K
402 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
408 vmovdqu K60_79(%rip), K
410 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
# Store rows A..E back to the digest area at the start of arg1.
425 vmovdqu A, 0*32(arg1)
426 vmovdqu B, 1*32(arg1)
427 vmovdqu C, 2*32(arg1)
428 vmovdqu D, 3*32(arg1)
429 vmovdqu E, 4*32(arg1)
431 # update input pointers
# Write the eight stream pointers back to arg1 + _data_ptr.
# NOTE(review): the instructions that advance inp0..inp7 are not visible here.
440 mov inp0, _data_ptr (arg1)
441 mov inp1, _data_ptr + 1*8(arg1)
442 mov inp2, _data_ptr + 2*8(arg1)
443 mov inp3, _data_ptr + 3*8(arg1)
444 mov inp4, _data_ptr + 4*8(arg1)
445 mov inp5, _data_ptr + 5*8(arg1)
446 mov inp6, _data_ptr + 6*8(arg1)
447 mov inp7, _data_ptr + 7*8(arg1)
454 # restore callee-saved clobbered registers
461 ENDPROC(sha1_x8_avx2)
464 .section .rodata.cst32.K00_19, "aM", @progbits, 32
# Each constant below is 32 bytes: one 32-bit value repeated eight times, so a
# single 256-bit load supplies the same value to every lane.
# NOTE(review): only the PSHUFFLE_BYTE_FLIP_MASK label is visible in this
# listing; the K labels referenced by the code are presumably on dropped lines.
# SHA-1 round constant for rounds 0-19.
467 .octa 0x5A8279995A8279995A8279995A827999
468 .octa 0x5A8279995A8279995A8279995A827999
470 .section .rodata.cst32.K20_39, "aM", @progbits, 32
# SHA-1 round constant for rounds 20-39.
473 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
474 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
476 .section .rodata.cst32.K40_59, "aM", @progbits, 32
# SHA-1 round constant for rounds 40-59.
479 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
480 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
482 .section .rodata.cst32.K60_79, "aM", @progbits, 32
# SHA-1 round constant for rounds 60-79.
485 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
486 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
488 .section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
# Byte index table 3,2,1,0, 7,6,5,4, ... in memory order. Used as a byte
# shuffle control it reverses the byte order inside each 32-bit word.
# Presumably consumed by vpshufb; that use is not visible in this listing.
490 PSHUFFLE_BYTE_FLIP_MASK:
491 .octa 0x0c0d0e0f08090a0b0405060700010203
492 .octa 0x0c0d0e0f08090a0b0405060700010203