########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
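########################################################################
# For reference, the scalar round and message schedule that the register
# comments below abbreviate (standard SHA-256 per FIPS 180-4, written
# with the names used in this file; a reader's aid only):
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)  = ((f ^ g) & e) ^ g
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a & b) ^ (a & c) ^ (b & c)  = ((a | c) & b) | (a & c)
#	t1  = h + S1 + CH + K[i] + W[i]
#	t2  = S0 + MAJ
#	h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2
#
#	s0   = (W[i-15] ror 7) ^ (W[i-15] ror 18) ^ (W[i-15] >> 3)
#	s1   = (W[i-2] ror 17) ^ (W[i-2] ror 19)  ^ (W[i-2] >> 10)
#	W[i] = W[i-16] + s0 + W[i-7] + s1
########################################################################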
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros
# addm [mem], reg
# Add reg to mem using reg-mem add and store

# MY_ROR imm, reg: rotate reg right by imm bits (shld left by 32-imm)
.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm

################################
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm

################################
SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx			# 3rd arg
SRND = %rsi			# clobbers INP

_INP      = _INP_END + _INP_END_SIZE
_XFER     = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
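# Rough frame layout implied by the offsets above (sizes are given by the
# corresponding *_SIZE constants):
#	_INP_END(%rsp)	pointer to the end of the input data
#	_INP(%rsp)	saved input pointer
#	_XFER(%rsp)	16-byte scratch slot holding W[t..t+3] + K[t..t+3]
#	_XMM_SAVE	save area for xmm registers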
# Rotate values of symbols X0...X3

# Rotate values of symbols a...h
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
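	## (s1 cannot be done four-wide in one pass: W[-2] for the upper two
	## lanes is only produced by this very step, so it is computed two
	## lanes at a time and merged via SHUF_00BA / SHUF_DC00 below.)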
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	add	y2, h			# h = h + S1 + CH + k + w
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	add	h, d			# d = d + h + S1 + CH + k + w
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	add	y2, h			# h = h + S1 + CH + k + w
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	add	h, d			# d = d + h + S1 + CH + k + w
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	add	y2, h			# h = h + S1 + CH + k + w
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	add	h, d			# d = d + h + S1 + CH + k + w
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	add	y2, h			# h = h + S1 + CH + k + w
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	add	h, d			# d = d + h + S1 + CH + k + w
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
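	## X0 now holds the next four schedule words {W[3], W[2], W[1], W[0]};
	## the X0...X3 and a...h symbols are then rotated (see the rotate
	## macros above) before the next group of four rounds.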
## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
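	## Same scalar round as in FOUR_ROUNDS_AND_SCHED, without the
	## interleaved vector message-schedule instructions.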
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	add	y2, h			# h = h + S1 + CH + k + w
	add	h, d			# d = d + h + S1 + CH + k + w
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of blocks to process
########################################################################
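## Register use follows the x86_64 SysV ABI: %rdi = state, %rsi = data (INP),
## %rdx = blocks (NUM_BLKS); SRND reuses %rsi once INP has been saved on the
## stack (_INP).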
SYM_FUNC_START(sha256_transform_avx)
	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)
	## load initial digest

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK
	## schedule 48 input dwords, by doing 3 rounds of 16 each
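	## (Each pass of this loop covers 16 rounds via four invocations of
	## FOUR_ROUNDS_AND_SCHED; after three passes rounds 0-47 are done,
	## and rounds 48-63 reuse the final X0..X3 values with DO_ROUND,
	## needing no further message scheduling.)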
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)

	cmp	_INP_END(%rsp), INP

SYM_FUNC_END(sha256_transform_avx)
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
# SHA-256 round constants K[0..63] (first 32 bits of the fractional parts
# of the cube roots of the first 64 primes)
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
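# Read low byte first, this mask supplies the vpshufb indices
# 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c: each group of four
# selects the bytes of its own dword in reverse order, converting the
# big-endian message words to little-endian register layout.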
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF