4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
28 #if defined(HAVE_SSE4_1)
31 #include <sys/asm_linkage.h>
33 .intel_syntax noprefix
/*
 * zfs_blake3_hash_many_sse41
 *
 * Hashes several inputs in parallel with SSE4.1 (Intel syntax).
 *
 * NOTE(review): this view of the routine is incomplete -- the prologue,
 * labels, branches and many of the round instructions are not visible.
 * The comments below describe only what the visible instructions do.
 *
 * Register / memory roles that can be read off the visible code:
 *   rdi         base of an array of input pointers ([rdi], [rdi+0x8], ...)
 *   rdx         byte offset into every input; blocks are read at rdx-0x40
 *   rcx         32 bytes loaded as the initial eight state words
 *               (presumably the key / chaining value -- confirm)
 *   rbx         output pointer, loaded from [rbp+0x50]
 *   [rbp+0x38], [rbp+0x40], [rbp+0x48]
 *               byte-sized stack arguments (presumably flags, flags_start
 *               and flags_end -- TODO confirm against the C prototype)
 *   [rsp+0x000..0x0F0]  sixteen transposed message vectors (4-way path)
 *   [rsp+0x100]         spill slot for xmm8
 *   [rsp+0x110/0x120]   two vectors updated after every batch (presumably
 *                       the low / high counter words per lane)
 *   [rsp+0x130]         broadcast mask, used later by blendvps
 *   [rsp+0x150]         per-batch increment added to [rsp+0x110]
 */
37 ENTRY_ALIGN(zfs_blake3_hash_many_sse41, 64)
/* Round rsp down to a 64-byte boundary; the movdqa stack accesses below
 * require aligned slots. */
47 and rsp, 0xFFFFFFFFFFFFFFC0
/* Broadcast dword 0 of xmm0 to all lanes and keep it at [rsp+0x130]. */
50 pshufd xmm0, xmm0, 0x00
51 movdqa xmmword ptr [rsp+0x130], xmm0
/* Mask with the ADD0 / ADD1 tables (definitions not visible here); the
 * ADD1 result becomes the increment stored at [rsp+0x150]. */
53 pand xmm1, xmmword ptr [ADD0+rip]
54 pand xmm0, xmmword ptr [ADD1+rip]
55 movdqa xmmword ptr [rsp+0x150], xmm0
57 pshufd xmm0, xmm0, 0x00
59 movdqa xmmword ptr [rsp+0x110], xmm0
/* XOR with CMP_MSB_MASK flips the top bit of each dword -- presumably so
 * a signed compare can act as an unsigned one (compare not visible). */
60 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
61 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
65 pshufd xmm2, xmm2, 0x00
67 movdqa xmmword ptr [rsp+0x120], xmm2
/* Fetch the stack-passed arguments relative to rbp. */
68 mov rbx, qword ptr [rbp+0x50]
71 movzx r13d, byte ptr [rbp+0x38]
72 movzx r12d, byte ptr [rbp+0x48]
/* Load 32 bytes from rcx and broadcast each of the eight dwords into its
 * own register (xmm0..xmm7): one state word per register, replicated
 * across the four lanes. */
76 movdqu xmm3, xmmword ptr [rcx]
77 pshufd xmm0, xmm3, 0x00
78 pshufd xmm1, xmm3, 0x55
79 pshufd xmm2, xmm3, 0xAA
80 pshufd xmm3, xmm3, 0xFF
81 movdqu xmm7, xmmword ptr [rcx+0x10]
82 pshufd xmm4, xmm7, 0x00
83 pshufd xmm5, xmm7, 0x55
84 pshufd xmm6, xmm7, 0xAA
85 pshufd xmm7, xmm7, 0xFF
/* Four input pointers, one per lane. */
86 mov r8, qword ptr [rdi]
87 mov r9, qword ptr [rdi+0x8]
88 mov r10, qword ptr [rdi+0x10]
89 mov r11, qword ptr [rdi+0x18]
90 movzx eax, byte ptr [rbp+0x40]
/* Load bytes 0x00..0x0F of the current 64-byte block from each of the
 * four inputs and interleave them with punpck* so that each stored vector
 * holds the same message word from all four inputs.  (The register copies
 * that belong to this transpose are not visible here.) */
99 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
100 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
101 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
102 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
105 punpckhdq xmm12, xmm9
107 punpckldq xmm10, xmm11
108 punpckhdq xmm14, xmm11
110 punpcklqdq xmm8, xmm10
111 punpckhqdq xmm9, xmm10
113 punpcklqdq xmm12, xmm14
114 punpckhqdq xmm13, xmm14
115 movdqa xmmword ptr [rsp], xmm8
116 movdqa xmmword ptr [rsp+0x10], xmm9
117 movdqa xmmword ptr [rsp+0x20], xmm12
118 movdqa xmmword ptr [rsp+0x30], xmm13
/* Same transpose for block bytes 0x10..0x1F -> [rsp+0x40..0x70]. */
119 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
120 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
121 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
122 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
125 punpckhdq xmm12, xmm9
127 punpckldq xmm10, xmm11
128 punpckhdq xmm14, xmm11
130 punpcklqdq xmm8, xmm10
131 punpckhqdq xmm9, xmm10
133 punpcklqdq xmm12, xmm14
134 punpckhqdq xmm13, xmm14
135 movdqa xmmword ptr [rsp+0x40], xmm8
136 movdqa xmmword ptr [rsp+0x50], xmm9
137 movdqa xmmword ptr [rsp+0x60], xmm12
138 movdqa xmmword ptr [rsp+0x70], xmm13
/* Block bytes 0x20..0x2F -> [rsp+0x80..0xB0]. */
139 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
140 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
141 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
142 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
145 punpckhdq xmm12, xmm9
147 punpckldq xmm10, xmm11
148 punpckhdq xmm14, xmm11
150 punpcklqdq xmm8, xmm10
151 punpckhqdq xmm9, xmm10
153 punpcklqdq xmm12, xmm14
154 punpckhqdq xmm13, xmm14
155 movdqa xmmword ptr [rsp+0x80], xmm8
156 movdqa xmmword ptr [rsp+0x90], xmm9
157 movdqa xmmword ptr [rsp+0xA0], xmm12
158 movdqa xmmword ptr [rsp+0xB0], xmm13
/* Block bytes 0x30..0x3F -> [rsp+0xC0..0xF0]. */
159 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
160 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
161 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
162 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
165 punpckhdq xmm12, xmm9
167 punpckldq xmm10, xmm11
168 punpckhdq xmm14, xmm11
170 punpcklqdq xmm8, xmm10
171 punpckhqdq xmm9, xmm10
173 punpcklqdq xmm12, xmm14
174 punpckhqdq xmm13, xmm14
175 movdqa xmmword ptr [rsp+0xC0], xmm8
176 movdqa xmmword ptr [rsp+0xD0], xmm9
177 movdqa xmmword ptr [rsp+0xE0], xmm12
178 movdqa xmmword ptr [rsp+0xF0], xmm13
/* Fill the remaining state registers: IV constants in xmm9..xmm11, the
 * two [rsp+0x110]/[rsp+0x120] vectors in xmm12/xmm13, the block length in
 * xmm14, and a broadcast of xmm15's low dword (its source is not visible
 * here). */
179 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
180 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
181 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
182 movdqa xmm12, xmmword ptr [rsp+0x110]
183 movdqa xmm13, xmmword ptr [rsp+0x120]
184 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
186 pshufd xmm15, xmm15, 0x00
/* Prefetch 0x80 bytes past the current offset of each input. */
187 prefetcht0 [r8+rdx+0x80]
188 prefetcht0 [r9+rdx+0x80]
189 prefetcht0 [r10+rdx+0x80]
190 prefetcht0 [r11+rdx+0x80]
/*
 * Round computation.  What is visible is 28 groups of four paddd that add
 * message vectors from the stack into xmm0..xmm3, which is consistent with
 * BLAKE3's seven rounds at four groups per round; the slot offsets change
 * from round to round (message schedule).  xmm8 is shared: it is loaded
 * with the ROT16 / ROT8 constant (presumably a byte-shuffle mask; the
 * shuffle itself is not visible) and the state row it otherwise holds is
 * parked at [rsp+0x100] in between.
 */
/* Round 1. */
191 paddd xmm0, xmmword ptr [rsp]
192 paddd xmm1, xmmword ptr [rsp+0x20]
193 paddd xmm2, xmmword ptr [rsp+0x40]
194 paddd xmm3, xmmword ptr [rsp+0x60]
203 movdqa xmm8, xmmword ptr [ROT16+rip]
/* First use of the xmm8 state row: it starts out as BLAKE3_IV_0. */
208 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
217 movdqa xmmword ptr [rsp+0x100], xmm8
234 paddd xmm0, xmmword ptr [rsp+0x10]
235 paddd xmm1, xmmword ptr [rsp+0x30]
236 paddd xmm2, xmmword ptr [rsp+0x50]
237 paddd xmm3, xmmword ptr [rsp+0x70]
246 movdqa xmm8, xmmword ptr [ROT8+rip]
251 movdqa xmm8, xmmword ptr [rsp+0x100]
260 movdqa xmmword ptr [rsp+0x100], xmm8
277 paddd xmm0, xmmword ptr [rsp+0x80]
278 paddd xmm1, xmmword ptr [rsp+0xA0]
279 paddd xmm2, xmmword ptr [rsp+0xC0]
280 paddd xmm3, xmmword ptr [rsp+0xE0]
289 movdqa xmm8, xmmword ptr [ROT16+rip]
296 movdqa xmm8, xmmword ptr [rsp+0x100]
303 movdqa xmmword ptr [rsp+0x100], xmm8
320 paddd xmm0, xmmword ptr [rsp+0x90]
321 paddd xmm1, xmmword ptr [rsp+0xB0]
322 paddd xmm2, xmmword ptr [rsp+0xD0]
323 paddd xmm3, xmmword ptr [rsp+0xF0]
332 movdqa xmm8, xmmword ptr [ROT8+rip]
339 movdqa xmm8, xmmword ptr [rsp+0x100]
346 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 2. */
363 paddd xmm0, xmmword ptr [rsp+0x20]
364 paddd xmm1, xmmword ptr [rsp+0x30]
365 paddd xmm2, xmmword ptr [rsp+0x70]
366 paddd xmm3, xmmword ptr [rsp+0x40]
375 movdqa xmm8, xmmword ptr [ROT16+rip]
380 movdqa xmm8, xmmword ptr [rsp+0x100]
389 movdqa xmmword ptr [rsp+0x100], xmm8
406 paddd xmm0, xmmword ptr [rsp+0x60]
407 paddd xmm1, xmmword ptr [rsp+0xA0]
408 paddd xmm2, xmmword ptr [rsp]
409 paddd xmm3, xmmword ptr [rsp+0xD0]
418 movdqa xmm8, xmmword ptr [ROT8+rip]
423 movdqa xmm8, xmmword ptr [rsp+0x100]
432 movdqa xmmword ptr [rsp+0x100], xmm8
449 paddd xmm0, xmmword ptr [rsp+0x10]
450 paddd xmm1, xmmword ptr [rsp+0xC0]
451 paddd xmm2, xmmword ptr [rsp+0x90]
452 paddd xmm3, xmmword ptr [rsp+0xF0]
461 movdqa xmm8, xmmword ptr [ROT16+rip]
468 movdqa xmm8, xmmword ptr [rsp+0x100]
475 movdqa xmmword ptr [rsp+0x100], xmm8
492 paddd xmm0, xmmword ptr [rsp+0xB0]
493 paddd xmm1, xmmword ptr [rsp+0x50]
494 paddd xmm2, xmmword ptr [rsp+0xE0]
495 paddd xmm3, xmmword ptr [rsp+0x80]
504 movdqa xmm8, xmmword ptr [ROT8+rip]
511 movdqa xmm8, xmmword ptr [rsp+0x100]
518 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 3. */
535 paddd xmm0, xmmword ptr [rsp+0x30]
536 paddd xmm1, xmmword ptr [rsp+0xA0]
537 paddd xmm2, xmmword ptr [rsp+0xD0]
538 paddd xmm3, xmmword ptr [rsp+0x70]
547 movdqa xmm8, xmmword ptr [ROT16+rip]
552 movdqa xmm8, xmmword ptr [rsp+0x100]
561 movdqa xmmword ptr [rsp+0x100], xmm8
578 paddd xmm0, xmmword ptr [rsp+0x40]
579 paddd xmm1, xmmword ptr [rsp+0xC0]
580 paddd xmm2, xmmword ptr [rsp+0x20]
581 paddd xmm3, xmmword ptr [rsp+0xE0]
590 movdqa xmm8, xmmword ptr [ROT8+rip]
595 movdqa xmm8, xmmword ptr [rsp+0x100]
604 movdqa xmmword ptr [rsp+0x100], xmm8
621 paddd xmm0, xmmword ptr [rsp+0x60]
622 paddd xmm1, xmmword ptr [rsp+0x90]
623 paddd xmm2, xmmword ptr [rsp+0xB0]
624 paddd xmm3, xmmword ptr [rsp+0x80]
633 movdqa xmm8, xmmword ptr [ROT16+rip]
640 movdqa xmm8, xmmword ptr [rsp+0x100]
647 movdqa xmmword ptr [rsp+0x100], xmm8
664 paddd xmm0, xmmword ptr [rsp+0x50]
665 paddd xmm1, xmmword ptr [rsp]
666 paddd xmm2, xmmword ptr [rsp+0xF0]
667 paddd xmm3, xmmword ptr [rsp+0x10]
676 movdqa xmm8, xmmword ptr [ROT8+rip]
683 movdqa xmm8, xmmword ptr [rsp+0x100]
690 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 4. */
707 paddd xmm0, xmmword ptr [rsp+0xA0]
708 paddd xmm1, xmmword ptr [rsp+0xC0]
709 paddd xmm2, xmmword ptr [rsp+0xE0]
710 paddd xmm3, xmmword ptr [rsp+0xD0]
719 movdqa xmm8, xmmword ptr [ROT16+rip]
724 movdqa xmm8, xmmword ptr [rsp+0x100]
733 movdqa xmmword ptr [rsp+0x100], xmm8
750 paddd xmm0, xmmword ptr [rsp+0x70]
751 paddd xmm1, xmmword ptr [rsp+0x90]
752 paddd xmm2, xmmword ptr [rsp+0x30]
753 paddd xmm3, xmmword ptr [rsp+0xF0]
762 movdqa xmm8, xmmword ptr [ROT8+rip]
767 movdqa xmm8, xmmword ptr [rsp+0x100]
776 movdqa xmmword ptr [rsp+0x100], xmm8
793 paddd xmm0, xmmword ptr [rsp+0x40]
794 paddd xmm1, xmmword ptr [rsp+0xB0]
795 paddd xmm2, xmmword ptr [rsp+0x50]
796 paddd xmm3, xmmword ptr [rsp+0x10]
805 movdqa xmm8, xmmword ptr [ROT16+rip]
812 movdqa xmm8, xmmword ptr [rsp+0x100]
819 movdqa xmmword ptr [rsp+0x100], xmm8
836 paddd xmm0, xmmword ptr [rsp]
837 paddd xmm1, xmmword ptr [rsp+0x20]
838 paddd xmm2, xmmword ptr [rsp+0x80]
839 paddd xmm3, xmmword ptr [rsp+0x60]
848 movdqa xmm8, xmmword ptr [ROT8+rip]
855 movdqa xmm8, xmmword ptr [rsp+0x100]
862 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 5. */
879 paddd xmm0, xmmword ptr [rsp+0xC0]
880 paddd xmm1, xmmword ptr [rsp+0x90]
881 paddd xmm2, xmmword ptr [rsp+0xF0]
882 paddd xmm3, xmmword ptr [rsp+0xE0]
891 movdqa xmm8, xmmword ptr [ROT16+rip]
896 movdqa xmm8, xmmword ptr [rsp+0x100]
905 movdqa xmmword ptr [rsp+0x100], xmm8
922 paddd xmm0, xmmword ptr [rsp+0xD0]
923 paddd xmm1, xmmword ptr [rsp+0xB0]
924 paddd xmm2, xmmword ptr [rsp+0xA0]
925 paddd xmm3, xmmword ptr [rsp+0x80]
934 movdqa xmm8, xmmword ptr [ROT8+rip]
939 movdqa xmm8, xmmword ptr [rsp+0x100]
948 movdqa xmmword ptr [rsp+0x100], xmm8
965 paddd xmm0, xmmword ptr [rsp+0x70]
966 paddd xmm1, xmmword ptr [rsp+0x50]
967 paddd xmm2, xmmword ptr [rsp]
968 paddd xmm3, xmmword ptr [rsp+0x60]
977 movdqa xmm8, xmmword ptr [ROT16+rip]
984 movdqa xmm8, xmmword ptr [rsp+0x100]
991 movdqa xmmword ptr [rsp+0x100], xmm8
1008 paddd xmm0, xmmword ptr [rsp+0x20]
1009 paddd xmm1, xmmword ptr [rsp+0x30]
1010 paddd xmm2, xmmword ptr [rsp+0x10]
1011 paddd xmm3, xmmword ptr [rsp+0x40]
1020 movdqa xmm8, xmmword ptr [ROT8+rip]
1027 movdqa xmm8, xmmword ptr [rsp+0x100]
1034 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 6. */
1051 paddd xmm0, xmmword ptr [rsp+0x90]
1052 paddd xmm1, xmmword ptr [rsp+0xB0]
1053 paddd xmm2, xmmword ptr [rsp+0x80]
1054 paddd xmm3, xmmword ptr [rsp+0xF0]
1063 movdqa xmm8, xmmword ptr [ROT16+rip]
1068 movdqa xmm8, xmmword ptr [rsp+0x100]
1077 movdqa xmmword ptr [rsp+0x100], xmm8
1094 paddd xmm0, xmmword ptr [rsp+0xE0]
1095 paddd xmm1, xmmword ptr [rsp+0x50]
1096 paddd xmm2, xmmword ptr [rsp+0xC0]
1097 paddd xmm3, xmmword ptr [rsp+0x10]
1106 movdqa xmm8, xmmword ptr [ROT8+rip]
1111 movdqa xmm8, xmmword ptr [rsp+0x100]
1120 movdqa xmmword ptr [rsp+0x100], xmm8
1137 paddd xmm0, xmmword ptr [rsp+0xD0]
1138 paddd xmm1, xmmword ptr [rsp]
1139 paddd xmm2, xmmword ptr [rsp+0x20]
1140 paddd xmm3, xmmword ptr [rsp+0x40]
1149 movdqa xmm8, xmmword ptr [ROT16+rip]
1156 movdqa xmm8, xmmword ptr [rsp+0x100]
1163 movdqa xmmword ptr [rsp+0x100], xmm8
1180 paddd xmm0, xmmword ptr [rsp+0x30]
1181 paddd xmm1, xmmword ptr [rsp+0xA0]
1182 paddd xmm2, xmmword ptr [rsp+0x60]
1183 paddd xmm3, xmmword ptr [rsp+0x70]
1192 movdqa xmm8, xmmword ptr [ROT8+rip]
1199 movdqa xmm8, xmmword ptr [rsp+0x100]
1206 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 7 (last group of message adds visible here). */
1223 paddd xmm0, xmmword ptr [rsp+0xB0]
1224 paddd xmm1, xmmword ptr [rsp+0x50]
1225 paddd xmm2, xmmword ptr [rsp+0x10]
1226 paddd xmm3, xmmword ptr [rsp+0x80]
1235 movdqa xmm8, xmmword ptr [ROT16+rip]
1240 movdqa xmm8, xmmword ptr [rsp+0x100]
1249 movdqa xmmword ptr [rsp+0x100], xmm8
1266 paddd xmm0, xmmword ptr [rsp+0xF0]
1267 paddd xmm1, xmmword ptr [rsp]
1268 paddd xmm2, xmmword ptr [rsp+0x90]
1269 paddd xmm3, xmmword ptr [rsp+0x60]
1278 movdqa xmm8, xmmword ptr [ROT8+rip]
1283 movdqa xmm8, xmmword ptr [rsp+0x100]
1292 movdqa xmmword ptr [rsp+0x100], xmm8
1309 paddd xmm0, xmmword ptr [rsp+0xE0]
1310 paddd xmm1, xmmword ptr [rsp+0x20]
1311 paddd xmm2, xmmword ptr [rsp+0x30]
1312 paddd xmm3, xmmword ptr [rsp+0x70]
1321 movdqa xmm8, xmmword ptr [ROT16+rip]
1328 movdqa xmm8, xmmword ptr [rsp+0x100]
1335 movdqa xmmword ptr [rsp+0x100], xmm8
1352 paddd xmm0, xmmword ptr [rsp+0xA0]
1353 paddd xmm1, xmmword ptr [rsp+0xC0]
1354 paddd xmm2, xmmword ptr [rsp+0x40]
1355 paddd xmm3, xmmword ptr [rsp+0xD0]
1364 movdqa xmm8, xmmword ptr [ROT8+rip]
1371 movdqa xmm8, xmmword ptr [rsp+0x100]
/* Transpose xmm0..xmm3 back to per-input order and write 16 bytes to each
 * of four 32-byte output slots at rbx, rbx+0x20, rbx+0x40, rbx+0x60. */
1405 punpckldq xmm0, xmm1
1406 punpckhdq xmm9, xmm1
1408 punpckldq xmm2, xmm3
1409 punpckhdq xmm11, xmm3
1411 punpcklqdq xmm0, xmm2
1412 punpckhqdq xmm1, xmm2
1414 punpcklqdq xmm9, xmm11
1415 punpckhqdq xmm3, xmm11
1416 movdqu xmmword ptr [rbx], xmm0
1417 movdqu xmmword ptr [rbx+0x20], xmm1
1418 movdqu xmmword ptr [rbx+0x40], xmm9
1419 movdqu xmmword ptr [rbx+0x60], xmm3
/* Same for xmm4..xmm7, filling the upper 16 bytes of each output slot. */
1421 punpckldq xmm4, xmm5
1422 punpckhdq xmm9, xmm5
1424 punpckldq xmm6, xmm7
1425 punpckhdq xmm11, xmm7
1427 punpcklqdq xmm4, xmm6
1428 punpckhqdq xmm5, xmm6
1430 punpcklqdq xmm9, xmm11
1431 punpckhqdq xmm7, xmm11
1432 movdqu xmmword ptr [rbx+0x10], xmm4
1433 movdqu xmmword ptr [rbx+0x30], xmm5
1434 movdqu xmmword ptr [rbx+0x50], xmm9
1435 movdqu xmmword ptr [rbx+0x70], xmm7
/* Advance the [rsp+0x110] vector by the increment at [rsp+0x150].  The
 * MSB flips presumably feed an unsigned-overflow compare whose result
 * updates [rsp+0x120] (compare / adjust instructions not visible). */
1436 movdqa xmm1, xmmword ptr [rsp+0x110]
1438 paddd xmm1, xmmword ptr [rsp+0x150]
1439 movdqa xmmword ptr [rsp+0x110], xmm1
1440 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1441 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1443 movdqa xmm1, xmmword ptr [rsp+0x120]
1445 movdqa xmmword ptr [rsp+0x120], xmm1
/*
 * Two-input path (only r8 and r9 are loaded; 0x40 bytes are written).
 * NOTE(review): the label and branch that lead here are not visible.
 * xmm0/xmm1 hold the 32 bytes from rcx; a per-input row is assembled from
 * lane 0 (then lane 1) of [rsp+0x110]/[rsp+0x120] plus the block length
 * and parked at [rsp] / [rsp+0x10].
 */
1466 movups xmm0, xmmword ptr [rcx]
1467 movups xmm1, xmmword ptr [rcx+0x10]
1470 movd xmm13, dword ptr [rsp+0x110]
1471 pinsrd xmm13, dword ptr [rsp+0x120], 1
1472 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1473 movaps xmmword ptr [rsp], xmm13
1474 movd xmm14, dword ptr [rsp+0x114]
1475 pinsrd xmm14, dword ptr [rsp+0x124], 1
1476 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1477 movaps xmmword ptr [rsp+0x10], xmm14
1478 mov r8, qword ptr [rdi]
1479 mov r9, qword ptr [rdi+0x8]
1480 movzx eax, byte ptr [rbp+0x40]
1489 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Load the 64-byte block of the first input.  shufps 136 (0x88) gathers
 * the even-indexed dwords of a register pair, 221 (0xDD) the odd ones. */
1491 movups xmm4, xmmword ptr [r8+rdx-0x40]
1492 movups xmm5, xmmword ptr [r8+rdx-0x30]
1494 shufps xmm4, xmm5, 136
1495 shufps xmm3, xmm5, 221
1497 movups xmm6, xmmword ptr [r8+rdx-0x20]
1498 movups xmm7, xmmword ptr [r8+rdx-0x10]
1500 shufps xmm6, xmm7, 136
1501 pshufd xmm6, xmm6, 0x93
1502 shufps xmm3, xmm7, 221
1503 pshufd xmm7, xmm3, 0x93
/* Same layout for the second input, in xmm12..xmm15. */
1504 movups xmm12, xmmword ptr [r9+rdx-0x40]
1505 movups xmm13, xmmword ptr [r9+rdx-0x30]
1507 shufps xmm12, xmm13, 136
1508 shufps xmm11, xmm13, 221
1510 movups xmm14, xmmword ptr [r9+rdx-0x20]
1511 movups xmm15, xmmword ptr [r9+rdx-0x10]
1513 shufps xmm14, xmm15, 136
1514 pshufd xmm14, xmm14, 0x93
1515 shufps xmm11, xmm15, 221
1516 pshufd xmm15, xmm11, 0x93
/* Reload the rows prepared at [rsp] / [rsp+0x10]; eax goes into lane 3. */
1517 movaps xmm3, xmmword ptr [rsp]
1518 movaps xmm11, xmmword ptr [rsp+0x10]
1520 pinsrd xmm11, eax, 3
/* Spill message vectors so xmm12 / xmm13 can hold the ROT16 / ROT8
 * constants. */
1525 movaps xmmword ptr [rsp+0x20], xmm4
1526 movaps xmmword ptr [rsp+0x30], xmm12
1531 movaps xmm12, xmmword ptr [ROT16+rip]
1548 movaps xmmword ptr [rsp+0x40], xmm5
1549 movaps xmmword ptr [rsp+0x50], xmm13
1554 movaps xmm13, xmmword ptr [ROT8+rip]
/* Rotate the dword lanes of three rows of both states (0x93: up by one,
 * 0x4E: swap halves, 0x39: down by one). */
1569 pshufd xmm0, xmm0, 0x93
1570 pshufd xmm8, xmm8, 0x93
1571 pshufd xmm3, xmm3, 0x4E
1572 pshufd xmm11, xmm11, 0x4E
1573 pshufd xmm2, xmm2, 0x39
1574 pshufd xmm10, xmm10, 0x39
/* Inverse lane rotations, restoring the original row alignment. */
1615 pshufd xmm0, xmm0, 0x39
1616 pshufd xmm8, xmm8, 0x39
1617 pshufd xmm3, xmm3, 0x4E
1618 pshufd xmm11, xmm11, 0x4E
1619 pshufd xmm2, xmm2, 0x93
1620 pshufd xmm10, xmm10, 0x93
/* Re-shuffle the first input's message vectors (presumably the message
 * permutation for the next round); results go back to [rsp+0x20] and
 * [rsp+0x40]. */
1623 movdqa xmm12, xmmword ptr [rsp+0x20]
1624 movdqa xmm5, xmmword ptr [rsp+0x40]
1625 pshufd xmm13, xmm12, 0x0F
1626 shufps xmm12, xmm5, 214
1627 pshufd xmm4, xmm12, 0x39
1629 shufps xmm12, xmm7, 250
1630 pblendw xmm13, xmm12, 0xCC
1632 punpcklqdq xmm12, xmm5
1633 pblendw xmm12, xmm6, 0xC0
1634 pshufd xmm12, xmm12, 0x78
1635 punpckhdq xmm5, xmm7
1636 punpckldq xmm6, xmm5
1637 pshufd xmm7, xmm6, 0x1E
1638 movdqa xmmword ptr [rsp+0x20], xmm13
1639 movdqa xmmword ptr [rsp+0x40], xmm12
/* Same re-shuffle for the second input's message vectors. */
1640 movdqa xmm5, xmmword ptr [rsp+0x30]
1641 movdqa xmm13, xmmword ptr [rsp+0x50]
1642 pshufd xmm6, xmm5, 0x0F
1643 shufps xmm5, xmm13, 214
1644 pshufd xmm12, xmm5, 0x39
1646 shufps xmm5, xmm15, 250
1647 pblendw xmm6, xmm5, 0xCC
1649 punpcklqdq xmm5, xmm13
1650 pblendw xmm5, xmm14, 0xC0
1651 pshufd xmm5, xmm5, 0x78
1652 punpckhdq xmm13, xmm15
1653 punpckldq xmm14, xmm13
1654 pshufd xmm15, xmm14, 0x1E
1657 movdqa xmm5, xmmword ptr [rsp+0x20]
1658 movdqa xmm6, xmmword ptr [rsp+0x40]
/* Write both 32-byte results. */
1668 movups xmmword ptr [rbx], xmm0
1669 movups xmmword ptr [rbx+0x10], xmm1
1670 movups xmmword ptr [rbx+0x20], xmm8
1671 movups xmmword ptr [rbx+0x30], xmm9
/* Under the mask saved at [rsp+0x130], replace each of the two vectors
 * with a copy read 8 bytes further on (i.e. starting at lane 2), so the
 * next inputs find their values in the low lanes. */
1672 movdqa xmm0, xmmword ptr [rsp+0x130]
1673 movdqa xmm1, xmmword ptr [rsp+0x110]
1674 movdqa xmm2, xmmword ptr [rsp+0x120]
1675 movdqu xmm3, xmmword ptr [rsp+0x118]
1676 movdqu xmm4, xmmword ptr [rsp+0x128]
1677 blendvps xmm1, xmm3, xmm0
1678 blendvps xmm2, xmm4, xmm0
1679 movdqa xmmword ptr [rsp+0x110], xmm1
1680 movdqa xmmword ptr [rsp+0x120], xmm2
/*
 * Single-input path (only r8 is loaded; 0x20 bytes are written).
 * NOTE(review): the label and branch that lead here are not visible.
 */
1687 movups xmm0, xmmword ptr [rcx]
1688 movups xmm1, xmmword ptr [rcx+0x10]
1689 movd xmm13, dword ptr [rsp+0x110]
1690 pinsrd xmm13, dword ptr [rsp+0x120], 1
1691 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1692 movaps xmm14, xmmword ptr [ROT8+rip]
1693 movaps xmm15, xmmword ptr [ROT16+rip]
1694 mov r8, qword ptr [rdi]
1695 movzx eax, byte ptr [rbp+0x40]
1704 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Load the 64-byte block and split it into even / odd dwords. */
1707 movups xmm4, xmmword ptr [r8+rdx-0x40]
1708 movups xmm5, xmmword ptr [r8+rdx-0x30]
1710 shufps xmm4, xmm5, 136
1711 shufps xmm8, xmm5, 221
1713 movups xmm6, xmmword ptr [r8+rdx-0x20]
1714 movups xmm7, xmmword ptr [r8+rdx-0x10]
1716 shufps xmm6, xmm7, 136
1717 pshufd xmm6, xmm6, 0x93
1718 shufps xmm8, xmm7, 221
1719 pshufd xmm7, xmm8, 0x93
/* Lane rotations of three state rows ... */
1742 pshufd xmm0, xmm0, 0x93
1743 pshufd xmm3, xmm3, 0x4E
1744 pshufd xmm2, xmm2, 0x39
/* ... and their inverses. */
1765 pshufd xmm0, xmm0, 0x39
1766 pshufd xmm3, xmm3, 0x4E
1767 pshufd xmm2, xmm2, 0x93
/* Re-shuffle the message vectors xmm4..xmm7 (presumably the message
 * permutation for the next round). */
1771 shufps xmm8, xmm5, 214
1772 pshufd xmm9, xmm4, 0x0F
1773 pshufd xmm4, xmm8, 0x39
1775 shufps xmm8, xmm7, 250
1776 pblendw xmm9, xmm8, 0xCC
1778 punpcklqdq xmm8, xmm5
1779 pblendw xmm8, xmm6, 0xC0
1780 pshufd xmm8, xmm8, 0x78
1781 punpckhdq xmm5, xmm7
1782 punpckldq xmm6, xmm5
1783 pshufd xmm7, xmm6, 0x1E
/* Write the 32-byte result. */
1793 movups xmmword ptr [rbx], xmm0
1794 movups xmmword ptr [rbx+0x10], xmm1
1796 SET_SIZE(zfs_blake3_hash_many_sse41)
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Compresses one 64-byte block and writes the 32-byte result back over the
 * state it was read from.
 *
 * As visible here:
 *   rdi  32-byte state, read at entry and overwritten at exit
 *   rsi  64-byte message block
 * NOTE(review): the remaining arguments, the construction of xmm3/xmm4
 * before the punpcklqdq and the round arithmetic are not visible in this
 * view -- confirm against the C prototype.
 */
1798 ENTRY_ALIGN(zfs_blake3_compress_in_place_sse41, 64)
/* State rows: xmm0/xmm1 from the caller's buffer, xmm2 from BLAKE3_IV. */
1800 movups xmm0, xmmword ptr [rdi]
1801 movups xmm1, xmmword ptr [rdi+0x10]
1802 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Join the low qwords of xmm3 and xmm4 into the fourth row. */
1807 punpcklqdq xmm3, xmm4
/* Load the message block; shufps 136 (0x88) gathers even-indexed dwords,
 * 221 (0xDD) the odd-indexed ones. */
1808 movups xmm4, xmmword ptr [rsi]
1809 movups xmm5, xmmword ptr [rsi+0x10]
1811 shufps xmm4, xmm5, 136
1812 shufps xmm8, xmm5, 221
1814 movups xmm6, xmmword ptr [rsi+0x20]
1815 movups xmm7, xmmword ptr [rsi+0x30]
1817 shufps xmm6, xmm7, 136
1818 pshufd xmm6, xmm6, 0x93
1819 shufps xmm8, xmm7, 221
1820 pshufd xmm7, xmm8, 0x93
/* Rotation constants (presumably byte-shuffle masks; their use is not
 * visible here). */
1821 movaps xmm14, xmmword ptr [ROT8+rip]
1822 movaps xmm15, xmmword ptr [ROT16+rip]
/* Rotate the dword lanes of rows 0, 3 and 2 (0x93: up by one, 0x4E: swap
 * halves, 0x39: down by one) ... */
1845 pshufd xmm0, xmm0, 0x93
1846 pshufd xmm3, xmm3, 0x4E
1847 pshufd xmm2, xmm2, 0x39
/* ... and rotate them back. */
1868 pshufd xmm0, xmm0, 0x39
1869 pshufd xmm3, xmm3, 0x4E
1870 pshufd xmm2, xmm2, 0x93
/* Re-shuffle the message vectors xmm4..xmm7 (presumably the message
 * permutation for the next round). */
1874 shufps xmm8, xmm5, 214
1875 pshufd xmm9, xmm4, 0x0F
1876 pshufd xmm4, xmm8, 0x39
1878 shufps xmm8, xmm7, 250
1879 pblendw xmm9, xmm8, 0xCC
1881 punpcklqdq xmm8, xmm5
1882 pblendw xmm8, xmm6, 0xC0
1883 pshufd xmm8, xmm8, 0x78
1884 punpckhdq xmm5, xmm7
1885 punpckldq xmm6, xmm5
1886 pshufd xmm7, xmm6, 0x1E
/* Store the new 32-byte state over the input. */
1893 movups xmmword ptr [rdi], xmm0
1894 movups xmmword ptr [rdi+0x10], xmm1
1896 SET_SIZE(zfs_blake3_compress_in_place_sse41)
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Compresses one 64-byte block and writes all four state rows (64 bytes)
 * to a separate output buffer; the input state is not written.
 *
 * As visible here:
 *   rdi  32-byte input state (read twice, never stored to)
 *   rsi  64-byte message block
 *   r9   64-byte output buffer
 * NOTE(review): the remaining arguments, the construction of xmm3/xmm4
 * before the punpcklqdq and the round arithmetic are not visible in this
 * view -- confirm against the C prototype.
 */
1898 ENTRY_ALIGN(zfs_blake3_compress_xof_sse41, 64)
/* State rows: xmm0/xmm1 from the caller's buffer, xmm2 from BLAKE3_IV. */
1900 movups xmm0, xmmword ptr [rdi]
1901 movups xmm1, xmmword ptr [rdi+0x10]
1902 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Join the low qwords of xmm3 and xmm4 into the fourth row. */
1909 punpcklqdq xmm3, xmm4
/* Load the message block; shufps 136 (0x88) gathers even-indexed dwords,
 * 221 (0xDD) the odd-indexed ones. */
1910 movups xmm4, xmmword ptr [rsi]
1911 movups xmm5, xmmword ptr [rsi+0x10]
1913 shufps xmm4, xmm5, 136
1914 shufps xmm8, xmm5, 221
1916 movups xmm6, xmmword ptr [rsi+0x20]
1917 movups xmm7, xmmword ptr [rsi+0x30]
1919 shufps xmm6, xmm7, 136
1920 pshufd xmm6, xmm6, 0x93
1921 shufps xmm8, xmm7, 221
1922 pshufd xmm7, xmm8, 0x93
/* Rotation constants (presumably byte-shuffle masks; their use is not
 * visible here). */
1923 movaps xmm14, xmmword ptr [ROT8+rip]
1924 movaps xmm15, xmmword ptr [ROT16+rip]
/* Rotate the dword lanes of rows 0, 3 and 2 (0x93: up by one, 0x4E: swap
 * halves, 0x39: down by one) ... */
1947 pshufd xmm0, xmm0, 0x93
1948 pshufd xmm3, xmm3, 0x4E
1949 pshufd xmm2, xmm2, 0x39
/* ... and rotate them back. */
1970 pshufd xmm0, xmm0, 0x39
1971 pshufd xmm3, xmm3, 0x4E
1972 pshufd xmm2, xmm2, 0x93
/* Re-shuffle the message vectors xmm4..xmm7 (presumably the message
 * permutation for the next round). */
1976 shufps xmm8, xmm5, 214
1977 pshufd xmm9, xmm4, 0x0F
1978 pshufd xmm4, xmm8, 0x39
1980 shufps xmm8, xmm7, 250
1981 pblendw xmm9, xmm8, 0xCC
1983 punpcklqdq xmm8, xmm5
1984 pblendw xmm8, xmm6, 0xC0
1985 pshufd xmm8, xmm8, 0x78
1986 punpckhdq xmm5, xmm7
1987 punpckldq xmm6, xmm5
1988 pshufd xmm7, xmm6, 0x1E
/* Reload the original input state into xmm4/xmm5 (presumably to be
 * combined with rows 2 and 3 -- the combining instructions are not
 * visible here). */
1993 movdqu xmm4, xmmword ptr [rdi]
1994 movdqu xmm5, xmmword ptr [rdi+0x10]
/* Emit all four rows: 64 bytes of output at r9. */
1999 movups xmmword ptr [r9], xmm0
2000 movups xmmword ptr [r9+0x10], xmm1
2001 movups xmmword ptr [r9+0x20], xmm2
2002 movups xmmword ptr [r9+0x30], xmm3
2004 SET_SIZE(zfs_blake3_compress_xof_sse41)
/*
 * Constant tables used by the routines above.
 * NOTE(review): the labels (and any section / alignment directives) are
 * not visible in this view; the symbol names given below are inferred
 * from the values and from the names the code references -- confirm.
 */
/* Four 32-bit constants, 16 bytes in total (presumably BLAKE3_IV). */
2010 .long 0x6A09E667, 0xBB67AE85
2011 .long 0x3C6EF372, 0xA54FF53A
/* Byte-index table: within each dword the two 16-bit halves are swapped
 * (presumably ROT16). */
2013 .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* Byte-index table: within each dword the bytes move by one position
 * (presumably ROT8). */
2015 .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* The four constants above, each replicated across four lanes
 * (presumably BLAKE3_IV_0 .. BLAKE3_IV_3). */
2021 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2023 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2025 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2027 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* 64 in every lane (presumably BLAKE3_BLOCK_LEN). */
2029 .long 64, 64, 64, 64
/* Top bit set in every lane (presumably CMP_MSB_MASK). */
2031 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
2033 #endif /* HAVE_SSE4_1 */
2036 .section .note.GNU-stack,"",%progbits