1 ########################################################################
2 # Implement fast SHA-256 with AVX1 instructions. (x86_64)
4 # Copyright (C) 2013 Intel Corporation.
7 # James Guilford <james.guilford@intel.com>
8 # Kirk Yap <kirk.s.yap@intel.com>
9 # Tim Chen <tim.c.chen@linux.intel.com>
11 # This software is available to you under a choice of one of two
12 # licenses. You may choose to be licensed under the terms of the GNU
13 # General Public License (GPL) Version 2, available from the file
14 # COPYING in the main directory of this source tree, or the
15 # OpenIB.org BSD license below:
17 # Redistribution and use in source and binary forms, with or
18 # without modification, are permitted provided that the following
21 # - Redistributions of source code must retain the above
22 # copyright notice, this list of conditions and the following
25 # - Redistributions in binary form must reproduce the above
26 # copyright notice, this list of conditions and the following
27 # disclaimer in the documentation and/or other materials
28 # provided with the distribution.
30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38 ########################################################################
40 # This code is described in an Intel White-Paper:
41 # "Fast SHA-256 Implementations on Intel Architecture Processors"
43 # To find it, surf to http://www.intel.com/p/en_US/embedded
44 # and search for that title.
46 ########################################################################
47 # This code schedules 1 block at a time, with 4 lanes per block
48 ########################################################################
51 #include <linux/linkage.h>
53 ## assume buffers not aligned
54 #define VMOVDQ vmovdqu
56 ################################ Define Macros
59 # Add reg to mem using reg-mem add and store
67 shld $(32-(\p1)), \p2, \p2 # double-shift of a register with itself = rotate left by (32 - count), i.e. rotate right by count for a 32-bit operand
70 ################################
72 # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
73 # Load xmm with mem and byte swap each dword
## p1 = destination xmm register, p2 = source memory operand,
## p3 = register holding the byte flip mask (see the call sites, which
## pass X0..X3, n*16(INP) and BYTE_FLIP_MASK)
74 .macro COPY_XMM_AND_BSWAP p1 p2 p3
79 ################################
## Register aliases used by the macros and the transform routine
94 SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
95 SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
96 BYTE_FLIP_MASK = %xmm13 # mask handed to COPY_XMM_AND_BSWAP for the per-dword byte swap
98 NUM_BLKS = %rdx # 3rd arg; turned into an end-of-data pointer at function entry
102 SRND = %rsi # clobbers INP
## Stack frame layout: each offset is the previous offset plus the
## size of the previous field
125 _INP = _INP_END + _INP_END_SIZE
126 _XFER = _INP + _INP_SIZE # slot holding the k + w dwords read by the round macros
127 _XMM_SAVE = _XFER + _XFER_SIZE
128 STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE # total frame size, subtracted from %rsp at function entry
131 # Rotate values of symbols X0...X3
141 # Rotate values of symbols a...h
154 .macro FOUR_ROUNDS_AND_SCHED
## Runs four SHA-256 rounds on the working variables a..h, taking the
## four k + w dwords from _XFER(%rsp) .. (3*4 + _XFER)(%rsp), and
## interleaves them with the vector computation of the next four
## message schedule words, which end up in X0.
## In the scalar comments, >> stands for the rotate done by MY_ROR.
155 ## compute s0 four at a time and s1 two at a time
156 ## compute W[-16] + W[-7] 4 at a time
## ---- round 1 of 4: k + w read from _XFER + 0 ----
159 MY_ROR (25-11), y0 # y0 = e >> (25-11)
161 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
162 MY_ROR (22-13), y1 # y1 = a >> (22-13)
163 xor e, y0 # y0 = e ^ (e >> (25-11))
165 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
166 xor a, y1 # y1 = a ^ (a >> (22-13))
168 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
169 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
170 and e, y2 # y2 = (f^g)&e
171 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
173 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
174 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
175 MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
176 xor g, y2 # y2 = CH = ((f^g)&e)^g
177 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
178 add y0, y2 # y2 = S1 + CH
179 add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
181 add y2, h # h = h + S1 + CH + k + w
183 vpsrld $7, XTMP1, XTMP2 # XTMP2 = W[-15] >> 7
185 add h, d # d = d + h + S1 + CH + k + w
187 vpslld $(32-7), XTMP1, XTMP3 # XTMP3 = W[-15] << (32-7)
188 and b, y0 # y0 = (a|c)&b
189 add y1, h # h = h + S1 + CH + k + w + S0
190 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
191 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
192 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 2 of 4: k + w read from _XFER + 1*4 ----
196 MY_ROR (25-11), y0 # y0 = e >> (25-11)
197 xor e, y0 # y0 = e ^ (e >> (25-11))
199 MY_ROR (22-13), y1 # y1 = a >> (22-13)
200 vpsrld $18, XTMP1, XTMP2 # XTMP2 = W[-15] >> 18
201 xor a, y1 # y1 = a ^ (a >> (22-13))
202 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
204 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
205 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
206 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
207 and e, y2 # y2 = (f^g)&e
208 MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
209 vpslld $(32-18), XTMP1, XTMP1 # XTMP1 = W[-15] << (32-18)
210 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
211 xor g, y2 # y2 = CH = ((f^g)&e)^g
212 vpxor XTMP1, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] << (32-18)
213 add y0, y2 # y2 = S1 + CH
214 add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
215 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
216 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
218 add y2, h # h = h + S1 + CH + k + w
220 vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
222 add h, d # d = d + h + S1 + CH + k + w
225 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
226 and b, y0 # y0 = (a|c)&b
227 add y1, h # h = h + S1 + CH + k + w + S0
228 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
229 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
230 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 3 of 4: k + w read from _XFER + 2*4 ----
234 MY_ROR (25-11), y0 # y0 = e >> (25-11)
235 xor e, y0 # y0 = e ^ (e >> (25-11))
236 MY_ROR (22-13), y1 # y1 = a >> (22-13)
238 xor a, y1 # y1 = a ^ (a >> (22-13))
239 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
240 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
242 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
243 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
244 and e, y2 # y2 = (f^g)&e
245 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
246 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
247 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
248 xor g, y2 # y2 = CH = ((f^g)&e)^g
249 MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
250 vpxor XTMP3, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
251 add y0, y2 # y2 = S1 + CH
252 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
253 add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
254 vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
256 add y2, h # h = h + S1 + CH + k + w
258 vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
260 add h, d # d = d + h + S1 + CH + k + w
262 vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
263 and b, y0 # y0 = (a|c)&b
264 add y1, h # h = h + S1 + CH + k + w + S0
266 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
267 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
268 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 4 of 4: k + w read from _XFER + 3*4 ----
271 MY_ROR (25-11), y0 # y0 = e >> (25-11)
273 MY_ROR (22-13), y1 # y1 = a >> (22-13)
274 xor e, y0 # y0 = e ^ (e >> (25-11))
276 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
277 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
278 xor a, y1 # y1 = a ^ (a >> (22-13))
280 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
281 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
282 and e, y2 # y2 = (f^g)&e
283 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
284 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
285 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
286 MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
287 xor g, y2 # y2 = CH = ((f^g)&e)^g
288 vpxor XTMP3, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
289 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
290 add y0, y2 # y2 = S1 + CH
291 add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
292 vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
294 add y2, h # h = h + S1 + CH + k + w
296 vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
298 add h, d # d = d + h + S1 + CH + k + w
300 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
301 and b, y0 # y0 = (a|c)&b
302 add y1, h # h = h + S1 + CH + k + w + S0
303 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
304 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
309 ## input is [rsp + _XFER + \round * 4]
## One SHA-256 round on a..h with no message scheduling; the round
## argument selects which k + w dword of the _XFER stack slot is added.
## In the comments, >> stands for the rotate done by MY_ROR.
310 .macro DO_ROUND round
312 MY_ROR (25-11), y0 # y0 = e >> (25-11)
314 xor e, y0 # y0 = e ^ (e >> (25-11))
315 MY_ROR (22-13), y1 # y1 = a >> (22-13)
317 xor a, y1 # y1 = a ^ (a >> (22-13))
318 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
320 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
321 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
322 and e, y2 # y2 = (f^g)&e
323 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
324 MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
325 xor g, y2 # y2 = CH = ((f^g)&e)^g
326 add y0, y2 # y2 = S1 + CH
327 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
328 offset = \round * 4 + _XFER # byte offset from %rsp of the k + w dword for this round
329 add offset(%rsp), y2 # y2 = k + w + S1 + CH
331 add y2, h # h = h + S1 + CH + k + w
334 add h, d # d = d + h + S1 + CH + k + w
336 and b, y0 # y0 = (a|c)&b
337 add y1, h # h = h + S1 + CH + k + w + S0
338 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
339 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
343 ########################################################################
344 ## void sha256_transform_avx(UINT32 digest[8], void *input_data, UINT64 num_blks)
345 ## arg 1 : pointer to digest
346 ## arg 2 : pointer to input data
347 ## arg 3 : Num blocks
348 ########################################################################
350 ENTRY(sha256_transform_avx)
360 subq $STACK_SIZE, %rsp # allocate stack space
361 and $~15, %rsp # align stack pointer (round down to a 16-byte boundary)
363 shl $6, NUM_BLKS # convert to bytes (block count times 64)
365 add INP, NUM_BLKS # pointer to end of data
366 mov NUM_BLKS, _INP_END(%rsp) # keep the end pointer for the end-of-input compare
368 ## load initial digest
## load the shuffle control masks from read-only data into their registers
378 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
379 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
380 vmovdqa _SHUF_DC00(%rip), SHUF_DC00
384 ## byte swap first 16 dwords
385 COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
386 COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
387 COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
388 COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
392 ## schedule 48 input dwords, by doing 3 rounds of 16 each
## NOTE(review): X0 is the source for all four groups below; presumably
## the X0..X3 symbols are rotated after each FOUR_ROUNDS_AND_SCHED - confirm
396 vpaddd (TBL), X0, XFER # XFER = X0 + four dwords at TBL (the k + w sums)
397 vmovdqa XFER, _XFER(%rsp) # spill k + w where FOUR_ROUNDS_AND_SCHED reads them
398 FOUR_ROUNDS_AND_SCHED
400 vpaddd 1*16(TBL), X0, XFER # next four table dwords
401 vmovdqa XFER, _XFER(%rsp)
402 FOUR_ROUNDS_AND_SCHED
404 vpaddd 2*16(TBL), X0, XFER
405 vmovdqa XFER, _XFER(%rsp)
406 FOUR_ROUNDS_AND_SCHED
408 vpaddd 3*16(TBL), X0, XFER
409 vmovdqa XFER, _XFER(%rsp)
411 FOUR_ROUNDS_AND_SCHED
## rounds without scheduling: only the k + w sums are prepared here
418 vpaddd (TBL), X0, XFER # XFER = X0 + four dwords at TBL
419 vmovdqa XFER, _XFER(%rsp) # spill k + w to the _XFER stack slot
425 vpaddd 1*16(TBL), X1, XFER # XFER = X1 + the following four table dwords
426 vmovdqa XFER, _XFER(%rsp)
450 cmp _INP_END(%rsp), INP # compare input pointer with the saved end-of-data pointer
463 ENDPROC(sha256_transform_avx)
465 .section .rodata.cst256.K256, "aM", @progbits, 256
## SHA-256 round constants: 64 dwords (256 bytes), four per line, so
## each line is one 16-byte group addressed as n*16(TBL) by the code
468 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
469 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
470 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
471 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
472 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
473 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
474 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
475 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
476 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
477 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
478 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
479 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
480 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
481 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
482 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
483 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
485 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
487 PSHUFFLE_BYTE_FLIP_MASK:
488 .octa 0x0c0d0e0f08090a0b0405060700010203 # byte shuffle control: byte i of each dword takes source byte 3-i (byte order reversed within each dword)
490 .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
492 # shuffle xBxA -> 00BA
494 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 # low qword = source dwords 0 and 2; high qword indices are 0xFF, which vpshufb turns into zero bytes
496 .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
498 # shuffle xDxC -> DC00
500 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF # low qword indices are 0xFF (zero bytes); high qword = source dwords 0 and 2