1 #include "llvm_blake3_prefix.h"
# Export each entry point under two spellings, with and without a leading
# underscore. Each pair of labels further down marks a single address.
4 .global blake3_hash_many_sse41
5 .global _blake3_hash_many_sse41
6 .global blake3_compress_in_place_sse41
7 .global _blake3_compress_in_place_sse41
8 .global blake3_compress_xof_sse41
9 .global _blake3_compress_xof_sse41
# Entry point blake3_hash_many_sse41 (also exported with a leading underscore).
# From the instructions shown: rdi is read as an array of up to four input
# pointers, rcx points at 32 bytes that seed xmm0-xmm7 (or xmm0/xmm1), rdx is
# a byte offset added to each input pointer, and rbx (loaded from [rbp+0x90])
# is the base of the result stores.
# NOTE(review): the leading number on each line skips values, so this listing
# appears to omit instructions; no push, ret, branch or local-label lines are
# present. Comments describe only the instructions that are shown.
12 _blake3_hash_many_sse41:
13 blake3_hash_many_sse41:
# Clear the low six bits of rsp so the frame is 64-byte aligned; the movdqa
# accesses to [rsp+...] below require 16-byte alignment.
24 and rsp, 0xFFFFFFFFFFFFFFC0
# Spill xmm6-xmm15 into frame slots 0x170-0x200 (reloaded from the same slots
# further down). These registers are callee-saved under the Microsoft x64
# convention, presumably the ABI targeted here - TODO confirm.
25 movdqa xmmword ptr [rsp+0x170], xmm6
26 movdqa xmmword ptr [rsp+0x180], xmm7
27 movdqa xmmword ptr [rsp+0x190], xmm8
28 movdqa xmmword ptr [rsp+0x1A0], xmm9
29 movdqa xmmword ptr [rsp+0x1B0], xmm10
30 movdqa xmmword ptr [rsp+0x1C0], xmm11
31 movdqa xmmword ptr [rsp+0x1D0], xmm12
32 movdqa xmmword ptr [rsp+0x1E0], xmm13
33 movdqa xmmword ptr [rsp+0x1F0], xmm14
34 movdqa xmmword ptr [rsp+0x200], xmm15
# Fetch two values through rbp: a qword at +0x68 and a zero-extended byte at
# +0x70. Presumably stack-passed arguments - verify against the C prototype.
39 mov r8, qword ptr [rbp+0x68]
40 movzx r9, byte ptr [rbp+0x70]
# Broadcast lane 0 of xmm0 (its source is not shown) and keep it at rsp+0x130;
# that slot is reloaded as the blendvps selector further down.
43 pshufd xmm0, xmm0, 0x00
44 movdqa xmmword ptr [rsp+0x130], xmm0
# Mask with the ADD0 / ADD1 constants (defined outside this listing). The
# ADD1-masked vector saved at rsp+0x150 is later added to the rsp+0x110 vector.
46 pand xmm1, xmmword ptr [ADD0+rip]
47 pand xmm0, xmmword ptr [ADD1+rip]
48 movdqa xmmword ptr [rsp+0x150], xmm0
# Broadcast, save as the rsp+0x110 vector, then flip the CMP_MSB_MASK bits in
# xmm0 and xmm1. Presumably this prepares an unsigned comparison done with a
# signed compare; the compare is not in this listing - TODO confirm.
50 pshufd xmm0, xmm0, 0x00
52 movdqa xmmword ptr [rsp+0x110], xmm0
53 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
54 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
# Broadcast lane 0 of xmm2 and save it as the rsp+0x120 vector.
58 pshufd xmm2, xmm2, 0x00
60 movdqa xmmword ptr [rsp+0x120], xmm2
# rbx = qword [rbp+0x90], used below as the base of every result store.
# r13d and r12d receive bytes from [rbp+0x78] and [rbp+0x88]; their uses are
# not visible here.
61 mov rbx, qword ptr [rbp+0x90]
64 movzx r13d, byte ptr [rbp+0x78]
65 movzx r12d, byte ptr [rbp+0x88]
# Four-input section. Load 32 bytes from [rcx] and broadcast each of the eight
# dwords into its own register: xmm0-xmm3 from the first 16 bytes, xmm4-xmm7
# from the second 16 bytes.
69 movdqu xmm3, xmmword ptr [rcx]
70 pshufd xmm0, xmm3, 0x00
71 pshufd xmm1, xmm3, 0x55
72 pshufd xmm2, xmm3, 0xAA
73 pshufd xmm3, xmm3, 0xFF
74 movdqu xmm7, xmmword ptr [rcx+0x10]
75 pshufd xmm4, xmm7, 0x00
76 pshufd xmm5, xmm7, 0x55
77 pshufd xmm6, xmm7, 0xAA
78 pshufd xmm7, xmm7, 0xFF
# Four input base pointers from the array at rdi, and a byte from [rbp+0x80].
79 mov r8, qword ptr [rdi]
80 mov r9, qword ptr [rdi+0x8]
81 mov r10, qword ptr [rdi+0x10]
82 mov r11, qword ptr [rdi+0x18]
83 movzx eax, byte ptr [rbp+0x80]
# Load the 64 bytes that precede offset rdx in each of the four inputs, one
# 16-byte piece per pass, interleave them with punpck*, and park the results
# in frame slots 0x00-0xF0 (four slots per pass).
# NOTE(review): the copies that seed xmm12-xmm14 are not in this listing.
# Pass A: bytes at rdx-0x40, results in slots 0x00-0x30.
92 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
93 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
94 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
95 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
100 punpckldq xmm10, xmm11
101 punpckhdq xmm14, xmm11
103 punpcklqdq xmm8, xmm10
104 punpckhqdq xmm9, xmm10
106 punpcklqdq xmm12, xmm14
107 punpckhqdq xmm13, xmm14
108 movdqa xmmword ptr [rsp], xmm8
109 movdqa xmmword ptr [rsp+0x10], xmm9
110 movdqa xmmword ptr [rsp+0x20], xmm12
111 movdqa xmmword ptr [rsp+0x30], xmm13
# Pass B: bytes at rdx-0x30, results in slots 0x40-0x70.
112 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
113 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
114 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
115 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
118 punpckhdq xmm12, xmm9
120 punpckldq xmm10, xmm11
121 punpckhdq xmm14, xmm11
123 punpcklqdq xmm8, xmm10
124 punpckhqdq xmm9, xmm10
126 punpcklqdq xmm12, xmm14
127 punpckhqdq xmm13, xmm14
128 movdqa xmmword ptr [rsp+0x40], xmm8
129 movdqa xmmword ptr [rsp+0x50], xmm9
130 movdqa xmmword ptr [rsp+0x60], xmm12
131 movdqa xmmword ptr [rsp+0x70], xmm13
# Pass C: bytes at rdx-0x20, results in slots 0x80-0xB0.
132 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
133 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
134 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
135 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
138 punpckhdq xmm12, xmm9
140 punpckldq xmm10, xmm11
141 punpckhdq xmm14, xmm11
143 punpcklqdq xmm8, xmm10
144 punpckhqdq xmm9, xmm10
146 punpcklqdq xmm12, xmm14
147 punpckhqdq xmm13, xmm14
148 movdqa xmmword ptr [rsp+0x80], xmm8
149 movdqa xmmword ptr [rsp+0x90], xmm9
150 movdqa xmmword ptr [rsp+0xA0], xmm12
151 movdqa xmmword ptr [rsp+0xB0], xmm13
# Pass D: bytes at rdx-0x10, results in slots 0xC0-0xF0.
152 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
153 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
154 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
155 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
158 punpckhdq xmm12, xmm9
160 punpckldq xmm10, xmm11
161 punpckhdq xmm14, xmm11
163 punpcklqdq xmm8, xmm10
164 punpckhqdq xmm9, xmm10
166 punpcklqdq xmm12, xmm14
167 punpckhqdq xmm13, xmm14
168 movdqa xmmword ptr [rsp+0xC0], xmm8
169 movdqa xmmword ptr [rsp+0xD0], xmm9
170 movdqa xmmword ptr [rsp+0xE0], xmm12
171 movdqa xmmword ptr [rsp+0xF0], xmm13
# Preload xmm9-xmm14 with BLAKE3_IV_1..3, the rsp+0x110 and rsp+0x120 vectors
# and BLAKE3_BLOCK_LEN, and broadcast lane 0 of xmm15.
172 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
173 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
174 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
175 movdqa xmm12, xmmword ptr [rsp+0x110]
176 movdqa xmm13, xmmword ptr [rsp+0x120]
177 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
179 pshufd xmm15, xmm15, 0x00
# Prefetch 0x80 bytes beyond the current offset in each of the four inputs.
180 prefetcht0 [r8+rdx+0x80]
181 prefetcht0 [r9+rdx+0x80]
182 prefetcht0 [r10+rdx+0x80]
183 prefetcht0 [r11+rdx+0x80]
# Mixing sequence: 28 groups follow. Each shown group adds four of the sixteen
# message slots (frame offsets 0x00-0xF0) into xmm0-xmm3, loads ROT16 or ROT8
# into xmm8, and then reloads and saves xmm8 through the scratch slot at
# rsp+0x100. The very first group loads BLAKE3_IV_0 where later groups reload
# rsp+0x100, and the last group has no trailing save.
# ROT16 and ROT8 look like byte-shuffle controls for the rotations, and the 28
# groups presumably form 7 rounds of 4. The shuffle instructions are not in
# this listing, so confirm against the BLAKE3 reference.
# Groups A1-A4 (presumably round 1): slots are used in their stored order.
184 paddd xmm0, xmmword ptr [rsp]
185 paddd xmm1, xmmword ptr [rsp+0x20]
186 paddd xmm2, xmmword ptr [rsp+0x40]
187 paddd xmm3, xmmword ptr [rsp+0x60]
196 movdqa xmm8, xmmword ptr [ROT16+rip]
201 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
210 movdqa xmmword ptr [rsp+0x100], xmm8
227 paddd xmm0, xmmword ptr [rsp+0x10]
228 paddd xmm1, xmmword ptr [rsp+0x30]
229 paddd xmm2, xmmword ptr [rsp+0x50]
230 paddd xmm3, xmmword ptr [rsp+0x70]
239 movdqa xmm8, xmmword ptr [ROT8+rip]
244 movdqa xmm8, xmmword ptr [rsp+0x100]
253 movdqa xmmword ptr [rsp+0x100], xmm8
270 paddd xmm0, xmmword ptr [rsp+0x80]
271 paddd xmm1, xmmword ptr [rsp+0xA0]
272 paddd xmm2, xmmword ptr [rsp+0xC0]
273 paddd xmm3, xmmword ptr [rsp+0xE0]
282 movdqa xmm8, xmmword ptr [ROT16+rip]
289 movdqa xmm8, xmmword ptr [rsp+0x100]
296 movdqa xmmword ptr [rsp+0x100], xmm8
313 paddd xmm0, xmmword ptr [rsp+0x90]
314 paddd xmm1, xmmword ptr [rsp+0xB0]
315 paddd xmm2, xmmword ptr [rsp+0xD0]
316 paddd xmm3, xmmword ptr [rsp+0xF0]
325 movdqa xmm8, xmmword ptr [ROT8+rip]
332 movdqa xmm8, xmmword ptr [rsp+0x100]
339 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups B1-B4 (presumably round 2): same pattern, different slot selection.
356 paddd xmm0, xmmword ptr [rsp+0x20]
357 paddd xmm1, xmmword ptr [rsp+0x30]
358 paddd xmm2, xmmword ptr [rsp+0x70]
359 paddd xmm3, xmmword ptr [rsp+0x40]
368 movdqa xmm8, xmmword ptr [ROT16+rip]
373 movdqa xmm8, xmmword ptr [rsp+0x100]
382 movdqa xmmword ptr [rsp+0x100], xmm8
399 paddd xmm0, xmmword ptr [rsp+0x60]
400 paddd xmm1, xmmword ptr [rsp+0xA0]
401 paddd xmm2, xmmword ptr [rsp]
402 paddd xmm3, xmmword ptr [rsp+0xD0]
411 movdqa xmm8, xmmword ptr [ROT8+rip]
416 movdqa xmm8, xmmword ptr [rsp+0x100]
425 movdqa xmmword ptr [rsp+0x100], xmm8
442 paddd xmm0, xmmword ptr [rsp+0x10]
443 paddd xmm1, xmmword ptr [rsp+0xC0]
444 paddd xmm2, xmmword ptr [rsp+0x90]
445 paddd xmm3, xmmword ptr [rsp+0xF0]
454 movdqa xmm8, xmmword ptr [ROT16+rip]
461 movdqa xmm8, xmmword ptr [rsp+0x100]
468 movdqa xmmword ptr [rsp+0x100], xmm8
485 paddd xmm0, xmmword ptr [rsp+0xB0]
486 paddd xmm1, xmmword ptr [rsp+0x50]
487 paddd xmm2, xmmword ptr [rsp+0xE0]
488 paddd xmm3, xmmword ptr [rsp+0x80]
497 movdqa xmm8, xmmword ptr [ROT8+rip]
504 movdqa xmm8, xmmword ptr [rsp+0x100]
511 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups C1-C4 (presumably round 3).
528 paddd xmm0, xmmword ptr [rsp+0x30]
529 paddd xmm1, xmmword ptr [rsp+0xA0]
530 paddd xmm2, xmmword ptr [rsp+0xD0]
531 paddd xmm3, xmmword ptr [rsp+0x70]
540 movdqa xmm8, xmmword ptr [ROT16+rip]
545 movdqa xmm8, xmmword ptr [rsp+0x100]
554 movdqa xmmword ptr [rsp+0x100], xmm8
571 paddd xmm0, xmmword ptr [rsp+0x40]
572 paddd xmm1, xmmword ptr [rsp+0xC0]
573 paddd xmm2, xmmword ptr [rsp+0x20]
574 paddd xmm3, xmmword ptr [rsp+0xE0]
583 movdqa xmm8, xmmword ptr [ROT8+rip]
588 movdqa xmm8, xmmword ptr [rsp+0x100]
597 movdqa xmmword ptr [rsp+0x100], xmm8
614 paddd xmm0, xmmword ptr [rsp+0x60]
615 paddd xmm1, xmmword ptr [rsp+0x90]
616 paddd xmm2, xmmword ptr [rsp+0xB0]
617 paddd xmm3, xmmword ptr [rsp+0x80]
626 movdqa xmm8, xmmword ptr [ROT16+rip]
633 movdqa xmm8, xmmword ptr [rsp+0x100]
640 movdqa xmmword ptr [rsp+0x100], xmm8
657 paddd xmm0, xmmword ptr [rsp+0x50]
658 paddd xmm1, xmmword ptr [rsp]
659 paddd xmm2, xmmword ptr [rsp+0xF0]
660 paddd xmm3, xmmword ptr [rsp+0x10]
669 movdqa xmm8, xmmword ptr [ROT8+rip]
676 movdqa xmm8, xmmword ptr [rsp+0x100]
683 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups D1-D4 (presumably round 4).
700 paddd xmm0, xmmword ptr [rsp+0xA0]
701 paddd xmm1, xmmword ptr [rsp+0xC0]
702 paddd xmm2, xmmword ptr [rsp+0xE0]
703 paddd xmm3, xmmword ptr [rsp+0xD0]
712 movdqa xmm8, xmmword ptr [ROT16+rip]
717 movdqa xmm8, xmmword ptr [rsp+0x100]
726 movdqa xmmword ptr [rsp+0x100], xmm8
743 paddd xmm0, xmmword ptr [rsp+0x70]
744 paddd xmm1, xmmword ptr [rsp+0x90]
745 paddd xmm2, xmmword ptr [rsp+0x30]
746 paddd xmm3, xmmword ptr [rsp+0xF0]
755 movdqa xmm8, xmmword ptr [ROT8+rip]
760 movdqa xmm8, xmmword ptr [rsp+0x100]
769 movdqa xmmword ptr [rsp+0x100], xmm8
786 paddd xmm0, xmmword ptr [rsp+0x40]
787 paddd xmm1, xmmword ptr [rsp+0xB0]
788 paddd xmm2, xmmword ptr [rsp+0x50]
789 paddd xmm3, xmmword ptr [rsp+0x10]
798 movdqa xmm8, xmmword ptr [ROT16+rip]
805 movdqa xmm8, xmmword ptr [rsp+0x100]
812 movdqa xmmword ptr [rsp+0x100], xmm8
829 paddd xmm0, xmmword ptr [rsp]
830 paddd xmm1, xmmword ptr [rsp+0x20]
831 paddd xmm2, xmmword ptr [rsp+0x80]
832 paddd xmm3, xmmword ptr [rsp+0x60]
841 movdqa xmm8, xmmword ptr [ROT8+rip]
848 movdqa xmm8, xmmword ptr [rsp+0x100]
855 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups E1-E4 (presumably round 5).
872 paddd xmm0, xmmword ptr [rsp+0xC0]
873 paddd xmm1, xmmword ptr [rsp+0x90]
874 paddd xmm2, xmmword ptr [rsp+0xF0]
875 paddd xmm3, xmmword ptr [rsp+0xE0]
884 movdqa xmm8, xmmword ptr [ROT16+rip]
889 movdqa xmm8, xmmword ptr [rsp+0x100]
898 movdqa xmmword ptr [rsp+0x100], xmm8
915 paddd xmm0, xmmword ptr [rsp+0xD0]
916 paddd xmm1, xmmword ptr [rsp+0xB0]
917 paddd xmm2, xmmword ptr [rsp+0xA0]
918 paddd xmm3, xmmword ptr [rsp+0x80]
927 movdqa xmm8, xmmword ptr [ROT8+rip]
932 movdqa xmm8, xmmword ptr [rsp+0x100]
941 movdqa xmmword ptr [rsp+0x100], xmm8
958 paddd xmm0, xmmword ptr [rsp+0x70]
959 paddd xmm1, xmmword ptr [rsp+0x50]
960 paddd xmm2, xmmword ptr [rsp]
961 paddd xmm3, xmmword ptr [rsp+0x60]
970 movdqa xmm8, xmmword ptr [ROT16+rip]
977 movdqa xmm8, xmmword ptr [rsp+0x100]
984 movdqa xmmword ptr [rsp+0x100], xmm8
1001 paddd xmm0, xmmword ptr [rsp+0x20]
1002 paddd xmm1, xmmword ptr [rsp+0x30]
1003 paddd xmm2, xmmword ptr [rsp+0x10]
1004 paddd xmm3, xmmword ptr [rsp+0x40]
1013 movdqa xmm8, xmmword ptr [ROT8+rip]
1020 movdqa xmm8, xmmword ptr [rsp+0x100]
1027 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups F1-F4 (presumably round 6).
1044 paddd xmm0, xmmword ptr [rsp+0x90]
1045 paddd xmm1, xmmword ptr [rsp+0xB0]
1046 paddd xmm2, xmmword ptr [rsp+0x80]
1047 paddd xmm3, xmmword ptr [rsp+0xF0]
1056 movdqa xmm8, xmmword ptr [ROT16+rip]
1061 movdqa xmm8, xmmword ptr [rsp+0x100]
1070 movdqa xmmword ptr [rsp+0x100], xmm8
1087 paddd xmm0, xmmword ptr [rsp+0xE0]
1088 paddd xmm1, xmmword ptr [rsp+0x50]
1089 paddd xmm2, xmmword ptr [rsp+0xC0]
1090 paddd xmm3, xmmword ptr [rsp+0x10]
1099 movdqa xmm8, xmmword ptr [ROT8+rip]
1104 movdqa xmm8, xmmword ptr [rsp+0x100]
1113 movdqa xmmword ptr [rsp+0x100], xmm8
1130 paddd xmm0, xmmword ptr [rsp+0xD0]
1131 paddd xmm1, xmmword ptr [rsp]
1132 paddd xmm2, xmmword ptr [rsp+0x20]
1133 paddd xmm3, xmmword ptr [rsp+0x40]
1142 movdqa xmm8, xmmword ptr [ROT16+rip]
1149 movdqa xmm8, xmmword ptr [rsp+0x100]
1156 movdqa xmmword ptr [rsp+0x100], xmm8
1173 paddd xmm0, xmmword ptr [rsp+0x30]
1174 paddd xmm1, xmmword ptr [rsp+0xA0]
1175 paddd xmm2, xmmword ptr [rsp+0x60]
1176 paddd xmm3, xmmword ptr [rsp+0x70]
1185 movdqa xmm8, xmmword ptr [ROT8+rip]
1192 movdqa xmm8, xmmword ptr [rsp+0x100]
1199 movdqa xmmword ptr [rsp+0x100], xmm8
# Groups G1-G4 (presumably round 7, the last one).
1216 paddd xmm0, xmmword ptr [rsp+0xB0]
1217 paddd xmm1, xmmword ptr [rsp+0x50]
1218 paddd xmm2, xmmword ptr [rsp+0x10]
1219 paddd xmm3, xmmword ptr [rsp+0x80]
1228 movdqa xmm8, xmmword ptr [ROT16+rip]
1233 movdqa xmm8, xmmword ptr [rsp+0x100]
1242 movdqa xmmword ptr [rsp+0x100], xmm8
1259 paddd xmm0, xmmword ptr [rsp+0xF0]
1260 paddd xmm1, xmmword ptr [rsp]
1261 paddd xmm2, xmmword ptr [rsp+0x90]
1262 paddd xmm3, xmmword ptr [rsp+0x60]
1271 movdqa xmm8, xmmword ptr [ROT8+rip]
1276 movdqa xmm8, xmmword ptr [rsp+0x100]
1285 movdqa xmmword ptr [rsp+0x100], xmm8
1302 paddd xmm0, xmmword ptr [rsp+0xE0]
1303 paddd xmm1, xmmword ptr [rsp+0x20]
1304 paddd xmm2, xmmword ptr [rsp+0x30]
1305 paddd xmm3, xmmword ptr [rsp+0x70]
1314 movdqa xmm8, xmmword ptr [ROT16+rip]
1321 movdqa xmm8, xmmword ptr [rsp+0x100]
1328 movdqa xmmword ptr [rsp+0x100], xmm8
1345 paddd xmm0, xmmword ptr [rsp+0xA0]
1346 paddd xmm1, xmmword ptr [rsp+0xC0]
1347 paddd xmm2, xmmword ptr [rsp+0x40]
1348 paddd xmm3, xmmword ptr [rsp+0xD0]
1357 movdqa xmm8, xmmword ptr [ROT8+rip]
1364 movdqa xmm8, xmmword ptr [rsp+0x100]
# Interleave xmm0-xmm3 and store four 16-byte rows at rbx+0x00/0x20/0x40/0x60,
# then do the same for xmm4-xmm7 at rbx+0x10/0x30/0x50/0x70.
# NOTE(review): the copies that seed xmm9 and xmm11 are not in this listing.
1398 punpckldq xmm0, xmm1
1399 punpckhdq xmm9, xmm1
1401 punpckldq xmm2, xmm3
1402 punpckhdq xmm11, xmm3
1404 punpcklqdq xmm0, xmm2
1405 punpckhqdq xmm1, xmm2
1407 punpcklqdq xmm9, xmm11
1408 punpckhqdq xmm3, xmm11
1409 movdqu xmmword ptr [rbx], xmm0
1410 movdqu xmmword ptr [rbx+0x20], xmm1
1411 movdqu xmmword ptr [rbx+0x40], xmm9
1412 movdqu xmmword ptr [rbx+0x60], xmm3
1414 punpckldq xmm4, xmm5
1415 punpckhdq xmm9, xmm5
1417 punpckldq xmm6, xmm7
1418 punpckhdq xmm11, xmm7
1420 punpcklqdq xmm4, xmm6
1421 punpckhqdq xmm5, xmm6
1423 punpcklqdq xmm9, xmm11
1424 punpckhqdq xmm7, xmm11
1425 movdqu xmmword ptr [rbx+0x10], xmm4
1426 movdqu xmmword ptr [rbx+0x30], xmm5
1427 movdqu xmmword ptr [rbx+0x50], xmm9
1428 movdqu xmmword ptr [rbx+0x70], xmm7
# Advance the rsp+0x110 vector by the increment kept at rsp+0x150, flip the
# CMP_MSB_MASK bits in xmm0 and xmm1 again, then rewrite the rsp+0x120 vector.
# The instructions that change xmm1 before that second store are not shown.
1429 movdqa xmm1, xmmword ptr [rsp+0x110]
1431 paddd xmm1, xmmword ptr [rsp+0x150]
1432 movdqa xmmword ptr [rsp+0x110], xmm1
1433 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1434 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1436 movdqa xmm1, xmmword ptr [rsp+0x120]
1438 movdqa xmmword ptr [rsp+0x120], xmm1
# Reload xmm6-xmm15 from the spill slots written at the top of the routine.
1447 movdqa xmm6, xmmword ptr [rsp+0x170]
1448 movdqa xmm7, xmmword ptr [rsp+0x180]
1449 movdqa xmm8, xmmword ptr [rsp+0x190]
1450 movdqa xmm9, xmmword ptr [rsp+0x1A0]
1451 movdqa xmm10, xmmword ptr [rsp+0x1B0]
1452 movdqa xmm11, xmmword ptr [rsp+0x1C0]
1453 movdqa xmm12, xmmword ptr [rsp+0x1D0]
1454 movdqa xmm13, xmmword ptr [rsp+0x1E0]
1455 movdqa xmm14, xmmword ptr [rsp+0x1F0]
1456 movdqa xmm15, xmmword ptr [rsp+0x200]
# Two-input section: only [rdi] and [rdi+0x8] are read as input pointers.
# NOTE(review): how control reaches this point is not visible in this listing.
# Seed xmm0 and xmm1 with the 32 bytes at [rcx].
1471 movups xmm0, xmmword ptr [rcx]
1472 movups xmm1, xmmword ptr [rcx+0x10]
# xmm13 = lane 0 of the rsp+0x110 vector, lane 0 of the rsp+0x120 vector and
# the first BLAKE3_BLOCK_LEN dword, saved at rsp. xmm14 is built the same way
# from lane 1 of each vector and saved at rsp+0x10.
1475 movd xmm13, dword ptr [rsp+0x110]
1476 pinsrd xmm13, dword ptr [rsp+0x120], 1
1477 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1478 movaps xmmword ptr [rsp], xmm13
1479 movd xmm14, dword ptr [rsp+0x114]
1480 pinsrd xmm14, dword ptr [rsp+0x124], 1
1481 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1482 movaps xmmword ptr [rsp+0x10], xmm14
# Two input base pointers and the byte at [rbp+0x80].
1483 mov r8, qword ptr [rdi]
1484 mov r9, qword ptr [rdi+0x8]
1485 movzx eax, byte ptr [rbp+0x80]
1494 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
# Load the 64 bytes preceding offset rdx in the first input and split them
# with shufps: immediate 136 keeps the even dwords of both operands, 221 the
# odd dwords. pshufd 0x93 then rotates the four lanes up by one position.
1496 movups xmm4, xmmword ptr [r8+rdx-0x40]
1497 movups xmm5, xmmword ptr [r8+rdx-0x30]
1499 shufps xmm4, xmm5, 136
1500 shufps xmm3, xmm5, 221
1502 movups xmm6, xmmword ptr [r8+rdx-0x20]
1503 movups xmm7, xmmword ptr [r8+rdx-0x10]
1505 shufps xmm6, xmm7, 136
1506 pshufd xmm6, xmm6, 0x93
1507 shufps xmm3, xmm7, 221
1508 pshufd xmm7, xmm3, 0x93
# Same split for the second input, landing in xmm12-xmm15.
1509 movups xmm12, xmmword ptr [r9+rdx-0x40]
1510 movups xmm13, xmmword ptr [r9+rdx-0x30]
1512 shufps xmm12, xmm13, 136
1513 shufps xmm11, xmm13, 221
1515 movups xmm14, xmmword ptr [r9+rdx-0x20]
1516 movups xmm15, xmmword ptr [r9+rdx-0x10]
1518 shufps xmm14, xmm15, 136
1519 pshufd xmm14, xmm14, 0x93
1520 shufps xmm11, xmm15, 221
1521 pshufd xmm15, xmm11, 0x93
# Reload the two vectors built above into xmm3 and xmm11; eax is inserted into
# lane 3 of xmm11.
1522 movaps xmm3, xmmword ptr [rsp]
1523 movaps xmm11, xmmword ptr [rsp+0x10]
1525 pinsrd xmm11, eax, 3
# Park xmm4/xmm12 and xmm5/xmm13 in frame slots 0x20-0x50 so that xmm12 and
# xmm13 can hold the ROT16 and ROT8 constants.
1530 movaps xmmword ptr [rsp+0x20], xmm4
1531 movaps xmmword ptr [rsp+0x30], xmm12
1536 movaps xmm12, xmmword ptr [ROT16+rip]
1553 movaps xmmword ptr [rsp+0x40], xmm5
1554 movaps xmmword ptr [rsp+0x50], xmm13
1559 movaps xmm13, xmmword ptr [ROT8+rip]
# Lane rotation for both inputs: xmm0/xmm8 up by one (0x93), xmm3/xmm11 by two
# (0x4E), xmm2/xmm10 down by one (0x39).
1574 pshufd xmm0, xmm0, 0x93
1575 pshufd xmm8, xmm8, 0x93
1576 pshufd xmm3, xmm3, 0x4E
1577 pshufd xmm11, xmm11, 0x4E
1578 pshufd xmm2, xmm2, 0x39
1579 pshufd xmm10, xmm10, 0x39
# Inverse of the lane rotation above (0x39 undoes 0x93 and vice versa).
1620 pshufd xmm0, xmm0, 0x39
1621 pshufd xmm8, xmm8, 0x39
1622 pshufd xmm3, xmm3, 0x4E
1623 pshufd xmm11, xmm11, 0x4E
1624 pshufd xmm2, xmm2, 0x93
1625 pshufd xmm10, xmm10, 0x93
# Reshuffle the message vectors of the first input (xmm4-xmm7 plus the copies
# parked at rsp+0x20 and rsp+0x40). Presumably the message permutation applied
# between rounds - TODO confirm against the BLAKE3 reference.
1628 movdqa xmm12, xmmword ptr [rsp+0x20]
1629 movdqa xmm5, xmmword ptr [rsp+0x40]
1630 pshufd xmm13, xmm12, 0x0F
1631 shufps xmm12, xmm5, 214
1632 pshufd xmm4, xmm12, 0x39
1634 shufps xmm12, xmm7, 250
1635 pblendw xmm13, xmm12, 0xCC
1637 punpcklqdq xmm12, xmm5
1638 pblendw xmm12, xmm6, 0xC0
1639 pshufd xmm12, xmm12, 0x78
1640 punpckhdq xmm5, xmm7
1641 punpckldq xmm6, xmm5
1642 pshufd xmm7, xmm6, 0x1E
1643 movdqa xmmword ptr [rsp+0x20], xmm13
1644 movdqa xmmword ptr [rsp+0x40], xmm12
# Same reshuffle for the second input (xmm12-xmm15, rsp+0x30 and rsp+0x50).
1645 movdqa xmm5, xmmword ptr [rsp+0x30]
1646 movdqa xmm13, xmmword ptr [rsp+0x50]
1647 pshufd xmm6, xmm5, 0x0F
1648 shufps xmm5, xmm13, 214
1649 pshufd xmm12, xmm5, 0x39
1651 shufps xmm5, xmm15, 250
1652 pblendw xmm6, xmm5, 0xCC
1654 punpcklqdq xmm5, xmm13
1655 pblendw xmm5, xmm14, 0xC0
1656 pshufd xmm5, xmm5, 0x78
1657 punpckhdq xmm13, xmm15
1658 punpckldq xmm14, xmm13
1659 pshufd xmm15, xmm14, 0x1E
1662 movdqa xmm5, xmmword ptr [rsp+0x20]
1663 movdqa xmm6, xmmword ptr [rsp+0x40]
# Store 32 result bytes per input: xmm0/xmm1 at rbx, xmm8/xmm9 at rbx+0x20.
1673 movups xmmword ptr [rbx], xmm0
1674 movups xmmword ptr [rbx+0x10], xmm1
1675 movups xmmword ptr [rbx+0x20], xmm8
1676 movups xmmword ptr [rbx+0x30], xmm9
# Conditionally shift the rsp+0x110 and rsp+0x120 vectors down by two lanes.
# The unaligned loads at +0x118 and +0x128 read each vector starting at its
# third dword, and blendvps takes those lanes wherever the selector reloaded
# from rsp+0x130 has its top bit set.
1677 movdqa xmm0, xmmword ptr [rsp+0x130]
1678 movdqa xmm1, xmmword ptr [rsp+0x110]
1679 movdqa xmm2, xmmword ptr [rsp+0x120]
1680 movdqu xmm3, xmmword ptr [rsp+0x118]
1681 movdqu xmm4, xmmword ptr [rsp+0x128]
1682 blendvps xmm1, xmm3, xmm0
1683 blendvps xmm2, xmm4, xmm0
1684 movdqa xmmword ptr [rsp+0x110], xmm1
1685 movdqa xmmword ptr [rsp+0x120], xmm2
# One-input section: only [rdi] is read as an input pointer.
# NOTE(review): how control reaches this point is not visible in this listing.
# Seed xmm0/xmm1 from [rcx]; xmm13 = lane 0 of the rsp+0x110 vector, lane 0 of
# the rsp+0x120 vector and the first BLAKE3_BLOCK_LEN dword; xmm14 and xmm15
# hold the ROT8 and ROT16 constants.
1692 movups xmm0, xmmword ptr [rcx]
1693 movups xmm1, xmmword ptr [rcx+0x10]
1694 movd xmm13, dword ptr [rsp+0x110]
1695 pinsrd xmm13, dword ptr [rsp+0x120], 1
1696 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1697 movaps xmm14, xmmword ptr [ROT8+rip]
1698 movaps xmm15, xmmword ptr [ROT16+rip]
1699 mov r8, qword ptr [rdi]
1700 movzx eax, byte ptr [rbp+0x80]
1709 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
# Load the 64 bytes preceding offset rdx and split them into even dwords
# (shufps 136) and odd dwords (shufps 221), as in the two-input section.
1712 movups xmm4, xmmword ptr [r8+rdx-0x40]
1713 movups xmm5, xmmword ptr [r8+rdx-0x30]
1715 shufps xmm4, xmm5, 136
1716 shufps xmm8, xmm5, 221
1718 movups xmm6, xmmword ptr [r8+rdx-0x20]
1719 movups xmm7, xmmword ptr [r8+rdx-0x10]
1721 shufps xmm6, xmm7, 136
1722 pshufd xmm6, xmm6, 0x93
1723 shufps xmm8, xmm7, 221
1724 pshufd xmm7, xmm8, 0x93
# Lane rotation (0x93 / 0x4E / 0x39) followed by its inverse.
1747 pshufd xmm0, xmm0, 0x93
1748 pshufd xmm3, xmm3, 0x4E
1749 pshufd xmm2, xmm2, 0x39
1770 pshufd xmm0, xmm0, 0x39
1771 pshufd xmm3, xmm3, 0x4E
1772 pshufd xmm2, xmm2, 0x93
# Reshuffle the message vectors xmm4-xmm7, using xmm8 and xmm9 as temporaries.
# Presumably the message permutation applied between rounds - TODO confirm.
1776 shufps xmm8, xmm5, 214
1777 pshufd xmm9, xmm4, 0x0F
1778 pshufd xmm4, xmm8, 0x39
1780 shufps xmm8, xmm7, 250
1781 pblendw xmm9, xmm8, 0xCC
1783 punpcklqdq xmm8, xmm5
1784 pblendw xmm8, xmm6, 0xC0
1785 pshufd xmm8, xmm8, 0x78
1786 punpckhdq xmm5, xmm7
1787 punpckldq xmm6, xmm5
1788 pshufd xmm7, xmm6, 0x1E
# Store the 32 result bytes held in xmm0/xmm1 at rbx.
1798 movups xmmword ptr [rbx], xmm0
1799 movups xmmword ptr [rbx+0x10], xmm1
# Entry point blake3_compress_in_place_sse41 (also exported with a leading
# underscore). From the instructions shown: reads 32 bytes at [rcx], 64 bytes
# at [rdx] and one byte at [rsp+0xA0], then writes 32 bytes back to [rcx].
# NOTE(review): the leading number on each line skips values, so this listing
# appears to omit instructions (no stack adjustment, mixing arithmetic or ret
# is present). Comments describe only the instructions that are shown.
1803 blake3_compress_in_place_sse41:
1804 _blake3_compress_in_place_sse41:
# Spill xmm6-xmm9, xmm11, xmm14 and xmm15 to frame slots 0x00-0x60; they are
# reloaded from the same slots at the end. These registers are callee-saved
# under the Microsoft x64 convention, presumably the ABI targeted here.
1806 movdqa xmmword ptr [rsp], xmm6
1807 movdqa xmmword ptr [rsp+0x10], xmm7
1808 movdqa xmmword ptr [rsp+0x20], xmm8
1809 movdqa xmmword ptr [rsp+0x30], xmm9
1810 movdqa xmmword ptr [rsp+0x40], xmm11
1811 movdqa xmmword ptr [rsp+0x50], xmm14
1812 movdqa xmmword ptr [rsp+0x60], xmm15
# xmm0/xmm1 = the 32 bytes at [rcx]; xmm2 = BLAKE3_IV.
1813 movups xmm0, xmmword ptr [rcx]
1814 movups xmm1, xmmword ptr [rcx+0x10]
1815 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
# eax = byte at [rsp+0xA0]; presumably a stack-passed argument - TODO confirm.
1816 movzx eax, byte ptr [rsp+0xA0]
# Combine the low qwords of xmm3 and xmm4 into xmm3 (their loads are not shown).
1822 punpcklqdq xmm3, xmm4
# Load the 64 bytes at [rdx] and split them with shufps: immediate 136 keeps
# the even dwords of both operands, 221 the odd dwords. pshufd 0x93 rotates
# the four lanes up by one position.
1823 movups xmm4, xmmword ptr [rdx]
1824 movups xmm5, xmmword ptr [rdx+0x10]
1826 shufps xmm4, xmm5, 136
1827 shufps xmm8, xmm5, 221
1829 movups xmm6, xmmword ptr [rdx+0x20]
1830 movups xmm7, xmmword ptr [rdx+0x30]
1832 shufps xmm6, xmm7, 136
1833 pshufd xmm6, xmm6, 0x93
1834 shufps xmm8, xmm7, 221
1835 pshufd xmm7, xmm8, 0x93
# xmm14 and xmm15 hold the ROT8 and ROT16 constants.
1836 movaps xmm14, xmmword ptr [ROT8+rip]
1837 movaps xmm15, xmmword ptr [ROT16+rip]
# Lane rotation: xmm0 up by one (0x93), xmm3 by two (0x4E), xmm2 down by one
# (0x39); the next three instructions apply the inverse rotation.
1860 pshufd xmm0, xmm0, 0x93
1861 pshufd xmm3, xmm3, 0x4E
1862 pshufd xmm2, xmm2, 0x39
1883 pshufd xmm0, xmm0, 0x39
1884 pshufd xmm3, xmm3, 0x4E
1885 pshufd xmm2, xmm2, 0x93
# Reshuffle the message vectors xmm4-xmm7, using xmm8 and xmm9 as temporaries.
# Presumably the message permutation applied between rounds - TODO confirm.
1889 shufps xmm8, xmm5, 214
1890 pshufd xmm9, xmm4, 0x0F
1891 pshufd xmm4, xmm8, 0x39
1893 shufps xmm8, xmm7, 250
1894 pblendw xmm9, xmm8, 0xCC
1896 punpcklqdq xmm8, xmm5
1897 pblendw xmm8, xmm6, 0xC0
1898 pshufd xmm8, xmm8, 0x78
1899 punpckhdq xmm5, xmm7
1900 punpckldq xmm6, xmm5
1901 pshufd xmm7, xmm6, 0x1E
# Write xmm0/xmm1 back over the 32 bytes at [rcx].
1908 movups xmmword ptr [rcx], xmm0
1909 movups xmmword ptr [rcx+0x10], xmm1
# Reload the spilled registers.
1910 movdqa xmm6, xmmword ptr [rsp]
1911 movdqa xmm7, xmmword ptr [rsp+0x10]
1912 movdqa xmm8, xmmword ptr [rsp+0x20]
1913 movdqa xmm9, xmmword ptr [rsp+0x30]
1914 movdqa xmm11, xmmword ptr [rsp+0x40]
1915 movdqa xmm14, xmmword ptr [rsp+0x50]
1916 movdqa xmm15, xmmword ptr [rsp+0x60]
# Entry point blake3_compress_xof_sse41 (also exported with a leading
# underscore). From the instructions shown: reads 32 bytes at [rcx], 64 bytes
# at [rdx], one byte at [rsp+0xA0] and a pointer at [rsp+0xA8], then writes 64
# bytes through that pointer. No store to [rcx] appears in the shown lines.
# NOTE(review): the leading number on each line skips values, so this listing
# appears to omit instructions (no stack adjustment, mixing arithmetic or ret
# is present). Comments describe only the instructions that are shown.
1922 _blake3_compress_xof_sse41:
1923 blake3_compress_xof_sse41:
# Spill xmm6-xmm9, xmm11, xmm14 and xmm15 to frame slots 0x00-0x60; they are
# reloaded from the same slots at the end. These registers are callee-saved
# under the Microsoft x64 convention, presumably the ABI targeted here.
1925 movdqa xmmword ptr [rsp], xmm6
1926 movdqa xmmword ptr [rsp+0x10], xmm7
1927 movdqa xmmword ptr [rsp+0x20], xmm8
1928 movdqa xmmword ptr [rsp+0x30], xmm9
1929 movdqa xmmword ptr [rsp+0x40], xmm11
1930 movdqa xmmword ptr [rsp+0x50], xmm14
1931 movdqa xmmword ptr [rsp+0x60], xmm15
# xmm0/xmm1 = the 32 bytes at [rcx]; xmm2 = BLAKE3_IV.
1932 movups xmm0, xmmword ptr [rcx]
1933 movups xmm1, xmmword ptr [rcx+0x10]
1934 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
# eax = byte at [rsp+0xA0]; r10 = qword at [rsp+0xA8], used below as the
# destination pointer. Presumably stack-passed arguments - TODO confirm.
1935 movzx eax, byte ptr [rsp+0xA0]
1937 mov r10, qword ptr [rsp+0xA8]
# Combine the low qwords of xmm3 and xmm4 into xmm3 (their loads are not shown).
1942 punpcklqdq xmm3, xmm4
# Load the 64 bytes at [rdx] and split them with shufps: immediate 136 keeps
# the even dwords of both operands, 221 the odd dwords. pshufd 0x93 rotates
# the four lanes up by one position.
1943 movups xmm4, xmmword ptr [rdx]
1944 movups xmm5, xmmword ptr [rdx+0x10]
1946 shufps xmm4, xmm5, 136
1947 shufps xmm8, xmm5, 221
1949 movups xmm6, xmmword ptr [rdx+0x20]
1950 movups xmm7, xmmword ptr [rdx+0x30]
1952 shufps xmm6, xmm7, 136
1953 pshufd xmm6, xmm6, 0x93
1954 shufps xmm8, xmm7, 221
1955 pshufd xmm7, xmm8, 0x93
# xmm14 and xmm15 hold the ROT8 and ROT16 constants.
1956 movaps xmm14, xmmword ptr [ROT8+rip]
1957 movaps xmm15, xmmword ptr [ROT16+rip]
# Lane rotation: xmm0 up by one (0x93), xmm3 by two (0x4E), xmm2 down by one
# (0x39); the next three instructions apply the inverse rotation.
1980 pshufd xmm0, xmm0, 0x93
1981 pshufd xmm3, xmm3, 0x4E
1982 pshufd xmm2, xmm2, 0x39
2003 pshufd xmm0, xmm0, 0x39
2004 pshufd xmm3, xmm3, 0x4E
2005 pshufd xmm2, xmm2, 0x93
# Reshuffle the message vectors xmm4-xmm7, using xmm8 and xmm9 as temporaries.
# Presumably the message permutation applied between rounds - TODO confirm.
2009 shufps xmm8, xmm5, 214
2010 pshufd xmm9, xmm4, 0x0F
2011 pshufd xmm4, xmm8, 0x39
2013 shufps xmm8, xmm7, 250
2014 pblendw xmm9, xmm8, 0xCC
2016 punpcklqdq xmm8, xmm5
2017 pblendw xmm8, xmm6, 0xC0
2018 pshufd xmm8, xmm8, 0x78
2019 punpckhdq xmm5, xmm7
2020 punpckldq xmm6, xmm5
2021 pshufd xmm7, xmm6, 0x1E
# Reload the original 32 bytes at [rcx] into xmm4/xmm5. How they are combined
# with xmm0-xmm3 before the stores is not visible in this listing.
2026 movdqu xmm4, xmmword ptr [rcx]
2027 movdqu xmm5, xmmword ptr [rcx+0x10]
# Write xmm0-xmm3 as 64 bytes through r10.
2032 movups xmmword ptr [r10], xmm0
2033 movups xmmword ptr [r10+0x10], xmm1
2034 movups xmmword ptr [r10+0x20], xmm2
2035 movups xmmword ptr [r10+0x30], xmm3
# Reload the spilled registers.
2036 movdqa xmm6, xmmword ptr [rsp]
2037 movdqa xmm7, xmmword ptr [rsp+0x10]
2038 movdqa xmm8, xmmword ptr [rsp+0x20]
2039 movdqa xmm9, xmmword ptr [rsp+0x30]
2040 movdqa xmm11, xmmword ptr [rsp+0x40]
2041 movdqa xmm14, xmmword ptr [rsp+0x50]
2042 movdqa xmm15, xmmword ptr [rsp+0x60]
# Constant tables referenced by the routines above.
# NOTE(review): the label lines are not part of this listing, so the symbol
# names given below are inferred from the values and must be confirmed.
# Four distinct 32-bit words; presumably the table named BLAKE3_IV.
2050 .long 0x6A09E667, 0xBB67AE85
2051 .long 0x3C6EF372, 0xA54FF53A
# Byte-index tables. Used as a byte-shuffle control, the first swaps the two
# 16-bit halves of every dword and the second rotates every dword right by 8
# bits; presumably ROT16 and ROT8.
2053 .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
2055 .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
# Each of the four words above repeated across all four lanes; presumably
# BLAKE3_IV_0 through BLAKE3_IV_3.
2061 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2063 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2065 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2067 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
# The value 64 in every lane; presumably BLAKE3_BLOCK_LEN.
2069 .long 64, 64, 64, 64
# Only the top bit of every lane set; presumably CMP_MSB_MASK.
2071 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000