1 ########################################################################
2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
4 # Copyright (C) 2013 Intel Corporation.
7 # James Guilford <james.guilford@intel.com>
8 # Kirk Yap <kirk.s.yap@intel.com>
9 # Tim Chen <tim.c.chen@linux.intel.com>
11 # This software is available to you under a choice of one of two
12 # licenses. You may choose to be licensed under the terms of the GNU
13 # General Public License (GPL) Version 2, available from the file
14 # COPYING in the main directory of this source tree, or the
15 # OpenIB.org BSD license below:
17 # Redistribution and use in source and binary forms, with or
18 # without modification, are permitted provided that the following
21 # - Redistributions of source code must retain the above
22 # copyright notice, this list of conditions and the following
25 # - Redistributions in binary form must reproduce the above
26 # copyright notice, this list of conditions and the following
27 # disclaimer in the documentation and/or other materials
28 # provided with the distribution.
30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39 ########################################################################
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
44 # To find it, surf to http://www.intel.com/p/en_US/embedded
45 # and search for that title.
47 ########################################################################
48 # This code schedules 2 blocks at a time, with 4 lanes per block
49 ########################################################################
52 #include <linux/linkage.h>
# VMOVDQ abstracts the vector load/store used on input data; the unaligned
# form is chosen because the caller's buffers are not guaranteed aligned.
54 ## assume buffers not aligned
55 #define VMOVDQ vmovdqu
57 ################################ Define Macros
60 # Add reg to mem using reg-mem add and store
66 ################################
73 # XMM versions of above
# Constant shuffle masks are parked in high ymm registers so they stay
# live across the round macros without reloads.
87 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
89 BYTE_FLIP_MASK = %ymm13
91 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
# Register aliases for the scalar state; registers are deliberately reused
# (e overlaps NUM_BLKS, y3 overlaps INP, SRND overlaps CTX), which is only
# safe once the original argument values have been spilled to the frame.
93 NUM_BLKS = %rdx # 3rd arg
98 e = %edx # clobbers NUM_BLKS
99 y3 = %edi # clobbers INP
103 SRND = CTX # SRND is same register as CTX
# Stack-frame layout, built up additively: the _XFER area holds the
# precomputed W[t]+K[t] words for both blocks, followed by the _XMM_SAVE
# area and spill slots for the input-end pointer, current input pointer,
# context pointer, and the saved pre-alignment rsp.
118 _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
126 _XMM_SAVE = _XFER + _XFER_SIZE
127 _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
128 _INP = _INP_END + _INP_END_SIZE
129 _CTX = _INP + _INP_SIZE
130 _RSP = _CTX + _CTX_SIZE
131 STACK_SIZE = _RSP + _RSP_SIZE
134 # Rotate values of symbols X0...X3
144 # Rotate values of symbols a...h
########################################################################
# FOUR_ROUNDS_AND_SCHED: perform four SHA-256 rounds while computing the
# message schedule for four future W words.  Scalar round math (S0/S1/CH/
# MAJ via BMI2 rorx plus plain ALU ops) is interleaved instruction-by-
# instruction with the AVX2 schedule computation so the integer and
# vector domains execute in parallel.
# \disp: byte offset within the stack _XFER area of the precomputed
#        W[t]+K[t] values for these four rounds (indexed by SRND).
# In/out: round state in registers a..h; X0..X3 hold the 16 most recent
#        schedule words and are rotated each invocation.
# Clobbers: y0, y1, y2, y3, T1, XTMP0..XTMP5, flags.
########################################################################
158 .macro FOUR_ROUNDS_AND_SCHED disp
159 ################################### RND N + 0 ############################
161 mov a, y3 # y3 = a # MAJA
162 rorx $25, e, y0 # y0 = e >> 25 # S1A
163 rorx $11, e, y1 # y1 = e >> 11 # S1B
165 addl \disp(%rsp, SRND), h # h = k + w + h # --
166 or c, y3 # y3 = a|c # MAJA
167 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
168 mov f, y2 # y2 = f # CH
169 rorx $13, a, T1 # T1 = a >> 13 # S0B
171 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
172 xor g, y2 # y2 = f^g # CH
173 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
174 rorx $6, e, y1 # y1 = (e >> 6) # S1
176 and e, y2 # y2 = (f^g)&e # CH
177 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
178 rorx $22, a, y1 # y1 = a >> 22 # S0A
179 add h, d # d = k + w + h + d # --
181 and b, y3 # y3 = (a|c)&b # MAJA
182 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
183 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
184 rorx $2, a, T1 # T1 = (a >> 2) # S0
186 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
187 vpsrld $7, XTMP1, XTMP2
188 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
189 mov a, T1 # T1 = a # MAJB
190 and c, T1 # T1 = a&c # MAJB
192 add y0, y2 # y2 = S1 + CH # --
# No vprold on AVX2: ror 7 is built from srl 7 / sll (32-7) / or.
193 vpslld $(32-7), XTMP1, XTMP3
194 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
195 add y1, h # h = k + w + h + S0 # --
197 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
198 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
200 vpsrld $18, XTMP1, XTMP2
201 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
202 add y3, h # h = t1 + S0 + MAJ # --
207 ################################### RND N + 1 ############################
209 mov a, y3 # y3 = a # MAJA
210 rorx $25, e, y0 # y0 = e >> 25 # S1A
211 rorx $11, e, y1 # y1 = e >> 11 # S1B
213 addl offset(%rsp, SRND), h # h = k + w + h # --
214 or c, y3 # y3 = a|c # MAJA
217 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
218 mov f, y2 # y2 = f # CH
219 rorx $13, a, T1 # T1 = a >> 13 # S0B
220 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
221 xor g, y2 # y2 = f^g # CH
224 rorx $6, e, y1 # y1 = (e >> 6) # S1
225 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
226 rorx $22, a, y1 # y1 = a >> 22 # S0A
227 and e, y2 # y2 = (f^g)&e # CH
228 add h, d # d = k + w + h + d # --
230 vpslld $(32-18), XTMP1, XTMP1
231 and b, y3 # y3 = (a|c)&b # MAJA
232 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
234 vpxor XTMP1, XTMP3, XTMP3
235 rorx $2, a, T1 # T1 = (a >> 2) # S0
236 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
238 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
239 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
240 mov a, T1 # T1 = a # MAJB
241 and c, T1 # T1 = a&c # MAJB
242 add y0, y2 # y2 = S1 + CH # --
# sigma0 complete: (ror 7) ^ (ror 18) ^ (shr 3)
244 vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
245 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
246 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
247 add y1, h # h = k + w + h + S0 # --
249 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
250 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
251 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
252 add y3, h # h = t1 + S0 + MAJ # --
254 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
259 ################################### RND N + 2 ############################
261 mov a, y3 # y3 = a # MAJA
262 rorx $25, e, y0 # y0 = e >> 25 # S1A
264 addl offset(%rsp, SRND), h # h = k + w + h # --
# sigma1 uses 64-bit shifts on dword pairs; only the low lane of each
# qword (the xBxA positions) carries a valid result.
266 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
267 rorx $11, e, y1 # y1 = e >> 11 # S1B
268 or c, y3 # y3 = a|c # MAJA
269 mov f, y2 # y2 = f # CH
270 xor g, y2 # y2 = f^g # CH
272 rorx $13, a, T1 # T1 = a >> 13 # S0B
273 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
274 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
275 and e, y2 # y2 = (f^g)&e # CH
277 rorx $6, e, y1 # y1 = (e >> 6) # S1
278 vpxor XTMP3, XTMP2, XTMP2
279 add h, d # d = k + w + h + d # --
280 and b, y3 # y3 = (a|c)&b # MAJA
282 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
283 rorx $22, a, y1 # y1 = a >> 22 # S0A
284 vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
285 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
287 vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
288 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
289 rorx $2, a ,T1 # T1 = (a >> 2) # S0
290 vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
292 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
293 mov a, T1 # T1 = a # MAJB
294 and c, T1 # T1 = a&c # MAJB
295 add y0, y2 # y2 = S1 + CH # --
# The W[-2] inputs for lanes C/D are the W[0]/W[1] just computed above.
296 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
298 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
299 add y1,h # h = k + w + h + S0 # --
300 add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
301 add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
303 add y3,h # h = t1 + S0 + MAJ # --
308 ################################### RND N + 3 ############################
310 mov a, y3 # y3 = a # MAJA
311 rorx $25, e, y0 # y0 = e >> 25 # S1A
312 rorx $11, e, y1 # y1 = e >> 11 # S1B
314 addl offset(%rsp, SRND), h # h = k + w + h # --
315 or c, y3 # y3 = a|c # MAJA
318 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
319 mov f, y2 # y2 = f # CH
320 rorx $13, a, T1 # T1 = a >> 13 # S0B
321 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
322 xor g, y2 # y2 = f^g # CH
325 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
326 rorx $6, e, y1 # y1 = (e >> 6) # S1
327 and e, y2 # y2 = (f^g)&e # CH
328 add h, d # d = k + w + h + d # --
329 and b, y3 # y3 = (a|c)&b # MAJA
331 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
332 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
333 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
335 vpxor XTMP3, XTMP2, XTMP2
336 rorx $22, a, y1 # y1 = a >> 22 # S0A
337 add y0, y2 # y2 = S1 + CH # --
339 vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
340 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
341 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
343 rorx $2, a, T1 # T1 = (a >> 2) # S0
344 vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
# Merge the two s1 halves with the partial sums: four new W words ready.
346 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
347 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
348 mov a, T1 # T1 = a # MAJB
349 and c, T1 # T1 = a&c # MAJB
350 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
352 add y1, h # h = k + w + h + S0 # --
353 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
354 add y3, h # h = t1 + S0 + MAJ # --
########################################################################
# DO_4ROUNDS: perform four SHA-256 rounds with no message scheduling,
# used once all 64 W words exist (the last 16 rounds of each block).
# \disp: byte offset within the stack _XFER area of the precomputed
#        W[t]+K[t] values for these four rounds (indexed by SRND).
# Latency trick: the final two adds of each round ("h += S1+CH" and
# "h += MAJ") are deferred into the NEXT round and applied through
# old_h, the register that held h before the a..h symbol rotation —
# this shortens the dependency chain into the next round's e/d math.
# The very last round (N+3) applies its own adds directly.
# Clobbers: y0, y1, y2, y3, T1, flags.
########################################################################
360 .macro DO_4ROUNDS disp
361 ################################### RND N + 0 ###########################
363 mov f, y2 # y2 = f # CH
364 rorx $25, e, y0 # y0 = e >> 25 # S1A
365 rorx $11, e, y1 # y1 = e >> 11 # S1B
366 xor g, y2 # y2 = f^g # CH
368 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
369 rorx $6, e, y1 # y1 = (e >> 6) # S1
370 and e, y2 # y2 = (f^g)&e # CH
372 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
373 rorx $13, a, T1 # T1 = a >> 13 # S0B
374 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
375 rorx $22, a, y1 # y1 = a >> 22 # S0A
376 mov a, y3 # y3 = a # MAJA
378 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
379 rorx $2, a, T1 # T1 = (a >> 2) # S0
380 addl \disp(%rsp, SRND), h # h = k + w + h # --
381 or c, y3 # y3 = a|c # MAJA
383 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
384 mov a, T1 # T1 = a # MAJB
385 and b, y3 # y3 = (a|c)&b # MAJA
386 and c, T1 # T1 = a&c # MAJB
387 add y0, y2 # y2 = S1 + CH # --
390 add h, d # d = k + w + h + d # --
391 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
392 add y1, h # h = k + w + h + S0 # --
393 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
397 ################################### RND N + 1 ###########################
# Apply the deferred adds from round N+0 to its h (now old_h).
399 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
400 mov f, y2 # y2 = f # CH
401 rorx $25, e, y0 # y0 = e >> 25 # S1A
402 rorx $11, e, y1 # y1 = e >> 11 # S1B
403 xor g, y2 # y2 = f^g # CH
405 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
406 rorx $6, e, y1 # y1 = (e >> 6) # S1
407 and e, y2 # y2 = (f^g)&e # CH
408 add y3, old_h # h = t1 + S0 + MAJ # --
410 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
411 rorx $13, a, T1 # T1 = a >> 13 # S0B
412 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
413 rorx $22, a, y1 # y1 = a >> 22 # S0A
414 mov a, y3 # y3 = a # MAJA
416 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
417 rorx $2, a, T1 # T1 = (a >> 2) # S0
419 addl offset(%rsp, SRND), h # h = k + w + h # --
420 or c, y3 # y3 = a|c # MAJA
422 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
423 mov a, T1 # T1 = a # MAJB
424 and b, y3 # y3 = (a|c)&b # MAJA
425 and c, T1 # T1 = a&c # MAJB
426 add y0, y2 # y2 = S1 + CH # --
429 add h, d # d = k + w + h + d # --
430 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
431 add y1, h # h = k + w + h + S0 # --
433 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
437 ################################### RND N + 2 ##############################
# Apply the deferred adds from round N+1 to its h (now old_h).
439 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
440 mov f, y2 # y2 = f # CH
441 rorx $25, e, y0 # y0 = e >> 25 # S1A
442 rorx $11, e, y1 # y1 = e >> 11 # S1B
443 xor g, y2 # y2 = f^g # CH
445 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
446 rorx $6, e, y1 # y1 = (e >> 6) # S1
447 and e, y2 # y2 = (f^g)&e # CH
448 add y3, old_h # h = t1 + S0 + MAJ # --
450 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
451 rorx $13, a, T1 # T1 = a >> 13 # S0B
452 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
453 rorx $22, a, y1 # y1 = a >> 22 # S0A
454 mov a, y3 # y3 = a # MAJA
456 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
457 rorx $2, a, T1 # T1 = (a >> 2) # S0
459 addl offset(%rsp, SRND), h # h = k + w + h # --
460 or c, y3 # y3 = a|c # MAJA
462 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
463 mov a, T1 # T1 = a # MAJB
464 and b, y3 # y3 = (a|c)&b # MAJA
465 and c, T1 # T1 = a&c # MAJB
466 add y0, y2 # y2 = S1 + CH # --
469 add h, d # d = k + w + h + d # --
470 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
471 add y1, h # h = k + w + h + S0 # --
473 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
477 ################################### RND N + 3 ###########################
# Apply the deferred adds from round N+2 to its h (now old_h).
479 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
480 mov f, y2 # y2 = f # CH
481 rorx $25, e, y0 # y0 = e >> 25 # S1A
482 rorx $11, e, y1 # y1 = e >> 11 # S1B
483 xor g, y2 # y2 = f^g # CH
485 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
486 rorx $6, e, y1 # y1 = (e >> 6) # S1
487 and e, y2 # y2 = (f^g)&e # CH
488 add y3, old_h # h = t1 + S0 + MAJ # --
490 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
491 rorx $13, a, T1 # T1 = a >> 13 # S0B
492 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
493 rorx $22, a, y1 # y1 = a >> 22 # S0A
494 mov a, y3 # y3 = a # MAJA
496 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
497 rorx $2, a, T1 # T1 = (a >> 2) # S0
499 addl offset(%rsp, SRND), h # h = k + w + h # --
500 or c, y3 # y3 = a|c # MAJA
502 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
503 mov a, T1 # T1 = a # MAJB
504 and b, y3 # y3 = (a|c)&b # MAJA
505 and c, T1 # T1 = a&c # MAJB
506 add y0, y2 # y2 = S1 + CH # --
509 add h, d # d = k + w + h + d # --
510 or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
511 add y1, h # h = k + w + h + S0 # --
513 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
# Last round of the quartet: no following round to defer into, so the
# remaining adds are applied to h directly.
516 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
518 add y3, h # h = t1 + S0 + MAJ # --
524 ########################################################################
525 ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
526 ## arg 1 : pointer to input data
527 ## arg 2 : pointer to digest
528 ## arg 3 : Num blocks
529 ########################################################################
# Processes num_blks 64-byte SHA-256 blocks, two at a time using 256-bit
# AVX2 registers (one block per 128-bit lane); requires BMI2 (rorx) and
# AVX2.  A trailing odd block appears to be handled by the 128-bit XWORD
# path further below — NOTE(review): confirm against the full source.
531 ENTRY(sha256_transform_rorx)
# Carve out the local frame and force 32-byte alignment so the vmovdqa
# spills into _XFER are legal.  NOTE(review): the pre-alignment rsp is
# presumably preserved in the _RSP slot for the epilogue — confirm.
541 subq $STACK_SIZE, %rsp
542 and $-32, %rsp # align rsp to 32 byte boundary
546 shl $6, NUM_BLKS # convert to bytes
548 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
549 mov NUM_BLKS, _INP_END(%rsp)
554 ## load initial digest
564 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
565 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
566 vmovdqa _SHUF_DC00(%rip), SHUF_DC00
# Two blocks = 128 bytes = four 32-byte loads.
573 ## Load first 16 dwords from two blocks
574 VMOVDQ 0*32(INP),XTMP0
575 VMOVDQ 1*32(INP),XTMP1
576 VMOVDQ 2*32(INP),XTMP2
577 VMOVDQ 3*32(INP),XTMP3
# Convert the big-endian message bytes to host dword order.
580 vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
581 vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
582 vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
583 vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
# Interleave so each ymm holds the same four schedule words for both
# blocks: block 1 in the low lane, block 2 in the high lane.
585 ## transpose data into high/low halves
586 vperm2i128 $0x20, XTMP2, XTMP0, X0
587 vperm2i128 $0x31, XTMP2, XTMP0, X1
588 vperm2i128 $0x20, XTMP3, XTMP1, X2
589 vperm2i128 $0x31, XTMP3, XTMP1, X3
# Each iteration below adds the round constants to the current schedule
# words, spills W+K to the _XFER area, then runs the combined
# round/schedule macro.  NOTE(review): assumes TBL points at the K256
# table and SRND is the zero-initialized round index — setup not shown
# in this excerpt; confirm.
595 ## schedule 48 input dwords, by doing 3 rounds of 12 each
600 vpaddd 0*32(TBL, SRND), X0, XFER
601 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
602 FOUR_ROUNDS_AND_SCHED _XFER + 0*32
604 vpaddd 1*32(TBL, SRND), X0, XFER
605 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
606 FOUR_ROUNDS_AND_SCHED _XFER + 1*32
608 vpaddd 2*32(TBL, SRND), X0, XFER
609 vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
610 FOUR_ROUNDS_AND_SCHED _XFER + 2*32
612 vpaddd 3*32(TBL, SRND), X0, XFER
613 vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
614 FOUR_ROUNDS_AND_SCHED _XFER + 3*32
621 ## Do last 16 rounds with no scheduling
622 vpaddd 0*32(TBL, SRND), X0, XFER
623 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
624 DO_4ROUNDS _XFER + 0*32
625 vpaddd 1*32(TBL, SRND), X1, XFER
626 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
627 DO_4ROUNDS _XFER + 1*32
# Compare current input pointer against the saved last-block pointer
# (conditional branch elided in this excerpt).
648 cmp _INP_END(%rsp), INP
# The high (second-block) halves of the W+K values already spilled to
# _XFER are consumed here at a +16 byte lane offset — no rescheduling.
651 #### Do second block using previously scheduled results
655 DO_4ROUNDS _XFER + 0*32 + 16
656 DO_4ROUNDS _XFER + 1*32 + 16
674 cmp _INP_END(%rsp), INP
# Single-block (128-bit) path: one 64-byte block as four xmm loads.
682 VMOVDQ 0*16(INP),XWORD0
683 VMOVDQ 1*16(INP),XWORD1
684 VMOVDQ 2*16(INP),XWORD2
685 VMOVDQ 3*16(INP),XWORD3
687 vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
688 vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
689 vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
690 vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
696 ## load initial digest
# Reload the constant masks (clobbered since the earlier loads).
706 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
707 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
708 vmovdqa _SHUF_DC00(%rip), SHUF_DC00
724 ENDPROC(sha256_transform_rorx)
########################################################################
# SHA-256 round constants (K, FIPS 180-4).  Each row of four constants
# is emitted twice so that a single 256-bit load delivers identical
# constants to both 128-bit lanes — one lane per message block.
########################################################################
729 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
730 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
731 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
732 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
733 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
734 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
735 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
736 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
737 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
738 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
739 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
740 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
741 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
742 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
743 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
744 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
745 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
746 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
747 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
748 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
749 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
750 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
751 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
752 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
753 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
754 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
755 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
756 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
757 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
758 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
759 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
760 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
# vpshufb mask that reverses the bytes of each 32-bit word, converting
# the big-endian message dwords to host order.
762 PSHUFFLE_BYTE_FLIP_MASK:
763 .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
# vpshufb index bytes with the top bit set (0xFF) force the output byte
# to zero; these masks compact the valid xBxA / xDxC sigma1 results into
# the 00BA / DC00 dword positions.
765 # shuffle xBxA -> 00BA
767 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
769 # shuffle xDxC -> DC00
771 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF