########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <james.guilford@intel.com>
# Kirk Yap <kirk.s.yap@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
#   copyright notice, this list of conditions and the following
#   disclaimer.
#
# - Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials
#   provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
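########################################################################
# Reference for the per-round annotations below (a sketch of the FIPS
# 180-4 SHA-256 round, in the shorthand the comments use; not assembled):
#
#   S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#   S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#   CH    = (e & f) ^ (~e & g)            # computed here as ((f^g)&e)^g
#   MAJ   = (a & b) ^ (a & c) ^ (b & c)   # computed here as ((a|c)&b)|(a&c)
#   t1    = h + S1(e) + CH + K[t] + W[t]
#   t2    = S0(a) + MAJ
#   d    += t1
#   h     = t1 + t2
#
# The rorx-based code accumulates these pieces in y0..y3 and T1 exactly
# as annotated on each instruction.
########################################################################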
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# Add reg to mem using reg-mem add and store
################################

# XMM versions of above

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg

e = %edx	# clobbers NUM_BLKS
y3 = %esi	# clobbers INP

SRND = CTX	# SRND is same register as CTX

_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round

_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
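# A sketch of the stack frame implied by the offsets above (the *_SIZE
# constants and the _XFER base are defined earlier in the file):
#   _XFER     : pre-added W[t]+K[t] words, 2 blocks x 64 rounds x 4 bytes
#   _XMM_SAVE : area reserved for saving XMM registers
#   _INP_END  : address of the last 64-byte block of the input
#   _INP      : current input pointer
#   _CTX      : pointer to the SHA-256 state
#   _RSP      : slot for the saved stack pointer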
# Rotate values of symbols X0...X3

# Rotate values of symbols a...h

.macro FOUR_ROUNDS_AND_SCHED disp
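################################################################################
# One invocation interleaves 4 SHA-256 rounds with the computation of the
# next 4 message-schedule words.  Reference sketch of the schedule being
# vectorized below (W[-n] means "n words back", as in the comments):
#
#   s0   = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3)
#   s1   = (W[-2] ror 17) ^ (W[-2] ror 19) ^ (W[-2] >> 10)
#   W[0] = W[-16] + W[-7] + s0 + s1
#
# s1 is computed in two halves ({xBxA}, then {xDxC}) because vpsrlq is used
# to emulate the 32-bit rotates; the halves are merged with SHUF_00BA and
# SHUF_DC00.
################################################################################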
	################################### RND N + 0 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6) # S1

	and	e, y2		# y2 = (f^g)&e # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	add	h, d		# d = k + w + h + d # --

	and	b, y3		# y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB

	add	y0, y2		# y2 = S1 + CH # --
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ # --
	################################### RND N + 1 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B

	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH

	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	and	e, y2		# y2 = (f^g)&e # CH
	add	h, d		# d = k + w + h + d # --

	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	and	b, y3		# y3 = (a|c)&b # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0

	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] << (32-18))
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	################################### RND N + 2 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A

	addl	offset(%rsp, SRND), h	# h = k + w + h # --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	or	c, y3		# y3 = a|c # MAJA
	mov	f, y2		# y2 = f # CH
	xor	g, y2		# y2 = f^g # CH

	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e # CH

	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xBxA}
	add	h, d		# d = k + w + h + d # --
	and	b, y3		# y3 = (a|c)&b # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h		# h = t1 + S0 + MAJ # --
	################################### RND N + 3 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B

	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	h, d		# d = k + w + h + d # --
	and	b, y3		# y3 = (a|c)&b # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xDxC}
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	add	y0, y2		# y2 = S1 + CH # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

	add	y1, h		# h = k + w + h + S0 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ # --
.macro DO_4ROUNDS disp
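	################################################################################
	# Four plain SHA-256 rounds with no message scheduling.  The W[t]+K[t]
	# values are read back from the _XFER area on the stack, so this macro is
	# used for the last 16 rounds of a block and, with \disp offset by +16
	# (the high 128-bit lane of each stored word), to replay the second
	# block's already-scheduled words.
	################################################################################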
	################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0

	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0

	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0

	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h		# h = t1 + S0 + MAJ # --
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
SYM_FUNC_START(sha256_transform_rorx)

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
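	# 32-byte alignment is required because the W+K transfer area on the
	# stack is written with vmovdqa (aligned 256-bit stores), which would
	# fault on a misaligned address.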
	shl	$6, NUM_BLKS	# convert to bytes
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)
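	# Worked example (hypothetical values): for blocks = 3 and INP = p,
	# NUM_BLKS becomes 3*64 = 192 bytes, so _INP_END = p + 192 - 64 =
	# p + 128, the address of the third (last) 64-byte block.  The main
	# loop consumes two blocks per iteration and compares INP against
	# _INP_END to decide how many blocks remain.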
	## load initial digest

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
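	# After the transpose each Xi holds the same word positions from both
	# blocks: low 128-bit lane = block 1, high lane = block 2 (e.g. X0 =
	# {blk2 W3..W0 | blk1 W3..W0}), so one YMM schedule drives two blocks.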
	## schedule 48 input dwords, by doing 3 loop iterations of 16 rounds each
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
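	# Each vpaddd above folds the round constants into both lanes of the
	# freshly scheduled words; the full 32-byte result is parked in _XFER
	# so the low lane can be consumed now (block 1) and the high lane
	# replayed later for block 2 without re-running the schedule.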
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	cmp	_INP_END(%rsp), INP

	#### Do second block using previously scheduled results
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16

	cmp	_INP_END(%rsp), INP
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	## load initial digest

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
SYM_FUNC_END(sha256_transform_rorx)
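########################################################################
# Expected C-side usage (a sketch only, based on the prototype comment
# above and the usual x86 SHA glue pattern; names outside this file are
# assumptions, and the FPU bracketing is mandatory for in-kernel AVX2):
#
#   asmlinkage void sha256_transform_rorx(struct sha256_state *state,
#                                         const u8 *data, int blocks);
#
#   kernel_fpu_begin();
#   sha256_transform_rorx(state, data, blocks);  /* blocks of 64 bytes */
#   kernel_fpu_end();
########################################################################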
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
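# Each group of four K constants appears twice so that a single 256-bit
# vpaddd adds the same round constants to both 128-bit lanes (block 1 in
# the low lane, block 2 in the high lane).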
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
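# vpshufb with this mask byte-swaps each 32-bit word, converting the
# big-endian message words of the SHA-256 input into the little-endian
# dwords the arithmetic above expects.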
# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF