1 ########################################################################
2 # Implement fast SHA-256 with SSSE3 instructions. (x86_64)
4 # Copyright (C) 2013 Intel Corporation.
7 # James Guilford <james.guilford@intel.com>
8 # Kirk Yap <kirk.s.yap@intel.com>
9 # Tim Chen <tim.c.chen@linux.intel.com>
11 # This software is available to you under a choice of one of two
12 # licenses. You may choose to be licensed under the terms of the GNU
13 # General Public License (GPL) Version 2, available from the file
14 # COPYING in the main directory of this source tree, or the
15 # OpenIB.org BSD license below:
17 # Redistribution and use in source and binary forms, with or
18 # without modification, are permitted provided that the following
21 # - Redistributions of source code must retain the above
22 # copyright notice, this list of conditions and the following
25 # - Redistributions in binary form must reproduce the above
26 # copyright notice, this list of conditions and the following
27 # disclaimer in the documentation and/or other materials
28 # provided with the distribution.
30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39 ########################################################################
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
44 # To find it, surf to http://www.intel.com/p/en_US/embedded
45 # and search for that title.
47 ########################################################################
49 #include <linux/linkage.h>
51 ## assume buffers not aligned
54 ################################ Define Macros
57 # Add reg to mem using reg-mem add and store
63 ################################
65 # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
66 # Load xmm with mem and byte swap each dword
67 .macro COPY_XMM_AND_BSWAP p1 p2 p3
72 ################################
## XMM registers that hold the shuffle / byte-flip constants; they are filled
## by the movdqa ...(%rip) loads in sha256_transform_ssse3.
86 SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
87 SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
88 BYTE_FLIP_MASK = %xmm12
## General-purpose register aliases.
90 NUM_BLKS = %rdx # 3rd arg
94 SRND = %rsi # clobbers INP (SRND reuses the register that holds INP)
## Stack frame layout: every field starts where the previous field ends
## (offset = previous offset + previous size). STACK_SIZE is the total and
## is the amount subtracted from %rsp on function entry.
118 _INP = _INP_END + _INP_END_SIZE
119 _XFER = _INP + _INP_SIZE
120 _XMM_SAVE = _XFER + _XFER_SIZE
121 STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
124 # Rotate values of symbols X0...X3
134 # Rotate values of symbols a...h
147 .macro FOUR_ROUNDS_AND_SCHED
## Runs four SHA-256 rounds on the scalar working variables a..h while the
## interleaved SSE instructions compute the next four message-schedule
## dwords, which end up in X0 = {W[3], W[2], W[1], W[0]}.
## Round i (i = 0..3) takes its k + w input from (i*4 + _XFER)(%rsp).
## Notation: in the scalar (y0/y1) comments ">>" denotes rotate right (the
## instruction is ror); in the SSE comments ">>" is a logical shift and
## rotates are spelled "ror". {..} tags name the dword lanes, high to low.
## The scalar and SSE streams are interleaved on purpose; keep the order.
148 ## compute s0 four at a time and s1 two at a time
149 ## compute W[-16] + W[-7] 4 at a time
## ---- round 0: k + w from _XFER(%rsp) ----
152 ror $(25-11), y0 # y0 = e >> (25-11)
154 palignr $4, X2, XTMP0 # XTMP0 = W[-7]
155 ror $(22-13), y1 # y1 = a >> (22-13)
156 xor e, y0 # y0 = e ^ (e >> (25-11))
158 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
160 xor a, y1 # y1 = a ^ (a >> (22-13))
162 paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
163 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
164 and e, y2 # y2 = (f^g)&e
165 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
167 palignr $4, X0, XTMP1 # XTMP1 = W[-15]
168 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
169 ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
170 xor g, y2 # y2 = CH = ((f^g)&e)^g
171 movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
172 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
173 add y0, y2 # y2 = S1 + CH
174 add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
175 movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
177 add y2, h # h = h + S1 + CH + k + w
179 pslld $(32-7), XTMP1 # XTMP1 = W[-15] << (32-7), left half of ror 7
181 add h, d # d = d + h + S1 + CH + k + w
184 and b, y0 # y0 = (a|c)&b
185 add y1, h # h = h + S1 + CH + k + w + S0
186 por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
187 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
188 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 1: k + w from (1*4 + _XFER)(%rsp) ----
191 movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
194 movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
195 ror $(25-11), y0 # y0 = e >> (25-11)
196 xor e, y0 # y0 = e ^ (e >> (25-11))
198 ror $(22-13), y1 # y1 = a >> (22-13)
199 pslld $(32-18), XTMP3 # XTMP3 = W[-15] << (32-18), left half of ror 18
200 xor a, y1 # y1 = a ^ (a >> (22-13))
201 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
204 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
205 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
206 and e, y2 # y2 = (f^g)&e
207 ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
209 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
210 xor g, y2 # y2 = CH = ((f^g)&e)^g
211 psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
212 add y0, y2 # y2 = S1 + CH
213 add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
214 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
215 pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
217 add y2, h # h = h + S1 + CH + k + w
219 pxor XTMP4, XTMP1 # XTMP1 = s0
221 add h, d # d = d + h + S1 + CH + k + w
224 pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
225 and b, y0 # y0 = (a|c)&b
226 add y1, h # h = h + S1 + CH + k + w + S0
227 paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
228 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
229 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 2: k + w from (2*4 + _XFER)(%rsp); low s1 (lanes B, A) ----
232 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
235 ror $(25-11), y0 # y0 = e >> (25-11)
236 movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
237 xor e, y0 # y0 = e ^ (e >> (25-11))
238 ror $(22-13), y1 # y1 = a >> (22-13)
240 xor a, y1 # y1 = a ^ (a >> (22-13))
241 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
242 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
244 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
245 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
246 and e, y2 # y2 = (f^g)&e
247 psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
248 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
249 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
250 xor g, y2 # y2 = CH = ((f^g)&e)^g
251 ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
253 add y0, y2 # y2 = S1 + CH
254 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
255 add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
256 pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
258 add y2, h # h = h + S1 + CH + k + w
260 pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
262 add h, d # d = d + h + S1 + CH + k + w
264 paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
265 and b, y0 # y0 = (a|c)&b
266 add y1, h # h = h + S1 + CH + k + w + S0
268 pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
269 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
270 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## ---- round 3: k + w from (3*4 + _XFER)(%rsp); high s1 (lanes D, C) ----
273 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
275 ror $(25-11), y0 # y0 = e >> (25-11)
277 movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
278 ror $(22-13), y1 # y1 = a >> (22-13)
279 xor e, y0 # y0 = e ^ (e >> (25-11))
281 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
282 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
283 xor a, y1 # y1 = a ^ (a >> (22-13))
285 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
286 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
287 and e, y2 # y2 = (f^g)&e
288 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
289 psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
290 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
291 ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
292 xor g, y2 # y2 = CH = ((f^g)&e)^g
294 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
295 add y0, y2 # y2 = S1 + CH
296 add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
297 pxor XTMP2, X0 # X0 = s1 {xDxC}
299 add y2, h # h = h + S1 + CH + k + w
301 pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
303 add h, d # d = d + h + S1 + CH + k + w
305 paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
306 and b, y0 # y0 = (a|c)&b
307 add y1, h # h = h + S1 + CH + k + w + S0
308 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
309 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
315 ## input is [rsp + _XFER + \round * 4]
## One scalar SHA-256 round on the working variables a..h.
## \round selects which dword of the block stored at _XFER(%rsp) is added
## as k + w. In the comments ">>" denotes rotate right (the instruction is
## ror).
316 .macro DO_ROUND round
318 ror $(25-11), y0 # y0 = e >> (25-11)
320 xor e, y0 # y0 = e ^ (e >> (25-11))
321 ror $(22-13), y1 # y1 = a >> (22-13)
323 xor a, y1 # y1 = a ^ (a >> (22-13))
324 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
326 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
327 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
328 and e, y2 # y2 = (f^g)&e
329 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
330 ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
331 xor g, y2 # y2 = CH = ((f^g)&e)^g
332 add y0, y2 # y2 = S1 + CH
333 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
334 offset = \round * 4 + _XFER # frame offset of the k + w dword for this round
335 add offset(%rsp), y2 # y2 = k + w + S1 + CH
337 add y2, h # h = h + S1 + CH + k + w
340 add h, d # d = d + h + S1 + CH + k + w
342 and b, y0 # y0 = (a|c)&b
343 add y1, h # h = h + S1 + CH + k + w + S0
344 or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
345 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
349 ########################################################################
350 ## void sha256_transform_ssse3(UINT32 digest[8], void *input_data, UINT64 num_blks)
351 ## arg 1 : pointer to digest
352 ## arg 2 : pointer to input data
353 ## arg 3 : Num blocks
## The argument order above follows the register aliases: NUM_BLKS is %rdx
## (3rd argument register) and %rsi (2nd argument register) holds INP.
354 ########################################################################
356 ENTRY(sha256_transform_ssse3)
366 subq $STACK_SIZE, %rsp # reserve the local frame (_INP_END, _INP, _XFER, _XMM_SAVE)
369 shl $6, NUM_BLKS # convert to bytes (64 bytes per block)
372 mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
374 ## load initial digest
## Load the pshufb control constants (RIP-relative) into their registers.
384 movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
385 movdqa _SHUF_00BA(%rip), SHUF_00BA
386 movdqa _SHUF_DC00(%rip), SHUF_DC00
391 ## byte swap first 16 dwords
## Four 16-byte loads cover one 64-byte input block: X0..X3.
392 COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
393 COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
394 COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
395 COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
399 ## schedule 48 input dwords, by doing 3 rounds of 16 each
## Each step below stores XFER into the _XFER slot, which the round macro
## then reads as its four k + w inputs.
405 movdqa XFER, _XFER(%rsp)
406 FOUR_ROUNDS_AND_SCHED
408 movdqa 1*16(TBL), XFER
410 movdqa XFER, _XFER(%rsp)
411 FOUR_ROUNDS_AND_SCHED
413 movdqa 2*16(TBL), XFER
415 movdqa XFER, _XFER(%rsp)
416 FOUR_ROUNDS_AND_SCHED
418 movdqa 3*16(TBL), XFER
420 movdqa XFER, _XFER(%rsp)
422 FOUR_ROUNDS_AND_SCHED
## Same staging slot, now written from X0 and X1 directly.
430 movdqa X0, _XFER(%rsp)
436 movdqa X1, _XFER(%rsp)
460 cmp _INP_END(%rsp), INP # has INP reached the end-of-data pointer saved at entry?
475 ENDPROC(sha256_transform_ssse3)
## SHA-256 round constants: 16 rows of 4 dwords = 64 dwords = 256 bytes,
## which matches the 256-byte entity size of this mergeable section.
## Rows are 16 bytes each, so row n is addressed as n*16(TBL).
477 .section .rodata.cst256.K256, "aM", @progbits, 256
480 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
481 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
482 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
483 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
484 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
485 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
486 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
487 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
488 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
489 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
490 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
491 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
492 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
493 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
494 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
495 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
## pshufb control that reverses the four bytes inside each dword
## (source byte indices 3,2,1,0 / 7,6,5,4 / 11,10,9,8 / 15,14,13,12).
497 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
499 PSHUFFLE_BYTE_FLIP_MASK:
500 .octa 0x0c0d0e0f08090a0b0405060700010203
## pshufb control: source dwords 0 and 2 are packed into the low 8 bytes;
## the 0xFF control bytes zero the high 8 bytes.
502 .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
504 # shuffle xBxA -> 00BA
506 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
## pshufb control: source dwords 0 and 2 are packed into the high 8 bytes;
## the 0xFF control bytes zero the low 8 bytes.
508 .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
510 # shuffle xDxC -> DC00
512 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF