1 ########################################################################
2 # Implement fast SHA-256 with SSSE3 instructions. (x86_64)
4 # Copyright (C) 2013 Intel Corporation.
7 # James Guilford <james.guilford@intel.com>
8 # Kirk Yap <kirk.s.yap@intel.com>
9 # Tim Chen <tim.c.chen@linux.intel.com>
11 # This software is available to you under a choice of one of two
12 # licenses. You may choose to be licensed under the terms of the GNU
13 # General Public License (GPL) Version 2, available from the file
14 # COPYING in the main directory of this source tree, or the
15 # OpenIB.org BSD license below:
17 # Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
21 # - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
25 # - Redistributions in binary form must reproduce the above
26 # copyright notice, this list of conditions and the following
27 # disclaimer in the documentation and/or other materials
28 # provided with the distribution.
30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
39 ########################################################################
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
44 # To find it, surf to http://www.intel.com/p/en_US/embedded
45 # and search for that title.
47 ########################################################################
49 #include <linux/linkage.h>
50 #include <linux/cfi_types.h>
52 ## assume buffers not aligned
55 ################################ Define Macros
58 # Add reg to mem using reg-mem add and store
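# (in C terms: *mem += reg, with the sum also left in the register)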
64 ################################
66 # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
67 # Load xmm with mem and byte swap each dword
68 .macro COPY_XMM_AND_BSWAP p1 p2 p3
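# A scalar C sketch of the same operation, using a hypothetical helper
# name (the macro itself does this with one movdqu load plus a pshufb
# against BYTE_FLIP_MASK):
#
#	/* illustrative only -- not part of this file */
#	static inline void copy_xmm_and_bswap(u32 dst[4], const u8 *src)
#	{
#		int i;
#
#		for (i = 0; i < 4; i++)
#			dst[i] = get_unaligned_be32(src + 4 * i);
#	}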
73 ################################
87 SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
89 BYTE_FLIP_MASK = %xmm12
91 NUM_BLKS = %rdx # 3rd arg
95 SRND = %rsi # clobbers INP
119 _INP = _INP_END + _INP_END_SIZE
120 _XFER = _INP + _INP_SIZE
121 _XMM_SAVE = _XFER + _XFER_SIZE
122 STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
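# The frame, from lower to higher addresses: the saved end-of-input
# pointer (_INP_END), the saved input pointer (_INP), one 16-byte slot
# holding the current w + K values (_XFER), then the callee-saved XMM
# register area (_XMM_SAVE).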
125 # Rotate values of symbols X0...X3
135 # Rotate values of symbols a...h
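# No data moves here: the macros simply re-bind the symbolic register
# names each round, so the register called "h" this round is called "g"
# next round, and so on.  The effect is the usual end-of-round shuffle
#
#	tmp = h; h = g; g = f; f = e; e = d; d = c; c = b; b = a; a = tmp;
#
# performed at assembly time, for free.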
148 .macro FOUR_ROUNDS_AND_SCHED
149 ## compute s0 four at a time and s1 two at a time
150 ## compute W[-16] + W[-7] 4 at a time
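## As a scalar reference (FIPS 180-4; ror32() as in <linux/bitops.h>),
## each new schedule word is:
##
##	s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
##	s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
##	w[i] = w[i-16] + s0 + w[i-7] + s1;
##
## The vector code below produces four such words per macro invocation,
## interleaved with four rounds of the compression function to hide
## instruction latencies.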
153 ror $(25-11), y0 # y0 = e >> (25-11)
155 palignr $4, X2, XTMP0 # XTMP0 = W[-7]
156 ror $(22-13), y1 # y1 = a >> (22-13)
157 xor e, y0 # y0 = e ^ (e >> (25-11))
159 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
xor a, y1 # y1 = a ^ (a >> (22-13))
163 paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
164 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
165 and e, y2 # y2 = (f^g)&e
166 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
168 palignr $4, X0, XTMP1 # XTMP1 = W[-15]
169 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
171 xor g, y2 # y2 = CH = ((f^g)&e)^g
172 movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
173 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
174 add y0, y2 # y2 = S1 + CH
add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
176 movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
178 add y2, h # h = h + S1 + CH + k + w
pslld $(32-7), XTMP1 # XTMP1 = W[-15] << 25
182 add h, d # d = d + h + S1 + CH + k + w
185 and b, y0 # y0 = (a|c)&b
186 add y1, h # h = h + S1 + CH + k + w + S0
187 por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
189 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
192 movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
195 movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
196 ror $(25-11), y0 # y0 = e >> (25-11)
197 xor e, y0 # y0 = e ^ (e >> (25-11))
199 ror $(22-13), y1 # y1 = a >> (22-13)
pslld $(32-18), XTMP3 # XTMP3 = W[-15] << 14
xor a, y1 # y1 = a ^ (a >> (22-13))
202 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
205 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
206 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
207 and e, y2 # y2 = (f^g)&e
ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
210 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
211 xor g, y2 # y2 = CH = ((f^g)&e)^g
212 psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
213 add y0, y2 # y2 = S1 + CH
214 add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
215 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
216 pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
218 add y2, h # h = h + S1 + CH + k + w
220 pxor XTMP4, XTMP1 # XTMP1 = s0
222 add h, d # d = d + h + S1 + CH + k + w
225 pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
226 and b, y0 # y0 = (a|c)&b
227 add y1, h # h = h + S1 + CH + k + w + S0
228 paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
230 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
233 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
236 ror $(25-11), y0 # y0 = e >> (25-11)
237 movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
238 xor e, y0 # y0 = e ^ (e >> (25-11))
239 ror $(22-13), y1 # y1 = a >> (22-13)
xor a, y1 # y1 = a ^ (a >> (22-13))
242 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
243 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
245 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
246 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
247 and e, y2 # y2 = (f^g)&e
248 psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
249 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
250 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
251 xor g, y2 # y2 = CH = ((f^g)&e)^g
ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
254 add y0, y2 # y2 = S1 + CH
255 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
256 add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
257 pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
259 add y2, h # h = h + S1 + CH + k + w
261 pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
263 add h, d # d = d + h + S1 + CH + k + w
265 paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
266 and b, y0 # y0 = (a|c)&b
267 add y1, h # h = h + S1 + CH + k + w + S0
pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
271 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
274 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
276 ror $(25-11), y0 # y0 = e >> (25-11)
278 movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
279 ror $(22-13), y1 # y1 = a >> (22-13)
280 xor e, y0 # y0 = e ^ (e >> (25-11))
282 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
283 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
xor a, y1 # y1 = a ^ (a >> (22-13))
286 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
288 and e, y2 # y2 = (f^g)&e
289 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
290 psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
293 xor g, y2 # y2 = CH = ((f^g)&e)^g
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
296 add y0, y2 # y2 = S1 + CH
297 add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
298 pxor XTMP2, X0 # X0 = s1 {xDxC}
300 add y2, h # h = h + S1 + CH + k + w
302 pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
304 add h, d # d = d + h + S1 + CH + k + w
306 paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
307 and b, y0 # y0 = (a|c)&b
308 add y1, h # h = h + S1 + CH + k + w + S0
or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
310 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
## input is [rsp + _XFER + \round * 4]
317 .macro DO_ROUND round
319 ror $(25-11), y0 # y0 = e >> (25-11)
321 xor e, y0 # y0 = e ^ (e >> (25-11))
322 ror $(22-13), y1 # y1 = a >> (22-13)
xor a, y1 # y1 = a ^ (a >> (22-13))
325 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
327 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
328 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
329 and e, y2 # y2 = (f^g)&e
330 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
332 xor g, y2 # y2 = CH = ((f^g)&e)^g
333 add y0, y2 # y2 = S1 + CH
334 ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
335 offset = \round * 4 + _XFER
336 add offset(%rsp), y2 # y2 = k + w + S1 + CH
338 add y2, h # h = h + S1 + CH + k + w
341 add h, d # d = d + h + S1 + CH + k + w
343 and b, y0 # y0 = (a|c)&b
344 add y1, h # h = h + S1 + CH + k + w + S0
or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
346 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
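# One round in scalar C for reference (FIPS 180-4; ror32() as in
# <linux/bitops.h>; the code uses the equivalent forms
# CH = ((f^g)&e)^g and MAJ = ((a|c)&b)|(a&c)):
#
#	u32 S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	u32 ch  = (e & f) ^ (~e & g);
#	u32 t1  = h + S1 + ch + k[i] + w[i];
#	u32 S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	u32 maj = (a & b) ^ (a & c) ^ (b & c);
#	d += t1;
#	h  = t1 + S0 + maj;
#
# after which the a..h names are rotated by one position.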
350 ########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
353 ## arg 1 : pointer to state
354 ## (struct sha256_state is assumed to begin with u32 state[8])
355 ## arg 2 : pointer to input data
356 ## arg 3 : Num blocks
357 ########################################################################
359 SYM_TYPED_FUNC_START(sha256_transform_ssse3)
368 subq $STACK_SIZE, %rsp
371 shl $6, NUM_BLKS # convert to bytes
374 mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
376 ## load initial digest
386 movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
387 movdqa _SHUF_00BA(%rip), SHUF_00BA
388 movdqa _SHUF_DC00(%rip), SHUF_DC00
393 ## byte swap first 16 dwords
394 COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
395 COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
396 COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
397 COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
401 ## schedule 48 input dwords, by doing 3 rounds of 16 each
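## The scheduling loop runs three times; each pass performs 16 rounds
## while computing the next 16 message words.  The final 16 rounds
## (further below) consume X0..X3 directly, with no further scheduling.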
407 movdqa XFER, _XFER(%rsp)
408 FOUR_ROUNDS_AND_SCHED
410 movdqa 1*16(TBL), XFER
412 movdqa XFER, _XFER(%rsp)
413 FOUR_ROUNDS_AND_SCHED
415 movdqa 2*16(TBL), XFER
417 movdqa XFER, _XFER(%rsp)
418 FOUR_ROUNDS_AND_SCHED
420 movdqa 3*16(TBL), XFER
422 movdqa XFER, _XFER(%rsp)
424 FOUR_ROUNDS_AND_SCHED
432 movdqa X0, _XFER(%rsp)
438 movdqa X1, _XFER(%rsp)
462 cmp _INP_END(%rsp), INP
476 SYM_FUNC_END(sha256_transform_ssse3)
478 .section .rodata.cst256.K256, "aM", @progbits, 256
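# The 64 SHA-256 round constants: the first 32 bits of the fractional
# parts of the cube roots of the first 64 primes (FIPS 180-4).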
481 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
482 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
483 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
484 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
485 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
486 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
487 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
488 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
489 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
490 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
491 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
492 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
493 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
494 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
495 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
496 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
498 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
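# pshufb mask that reverses the byte order within each 32-bit dword,
# used to load the big-endian message words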
500 PSHUFFLE_BYTE_FLIP_MASK:
501 .octa 0x0c0d0e0f08090a0b0405060700010203
503 .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
505 # shuffle xBxA -> 00BA
507 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
509 .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
511 # shuffle xDxC -> DC00
513 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF