4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
28 #if defined(HAVE_SSE2)
31 #include <sys/asm_linkage.h>
33 .intel_syntax noprefix
/*
 * zfs_blake3_hash_many_sse2
 *
 * BLAKE3 (see the attribution in the file header) "hash many" routine
 * written with SSE2 instructions.  Three processing paths are visible:
 *   - four inputs at a time: pointers read from [rdi], [rdi+0x8],
 *     [rdi+0x10], [rdi+0x18]; 0x80 bytes are stored at rbx;
 *   - two inputs at a time: pointers from [rdi], [rdi+0x8]; 0x40 bytes
 *     are stored at rbx;
 *   - one input: pointer from [rdi]; 0x20 bytes are stored at rbx.
 *
 * Register roles that the visible lines establish:
 *   rdi  array of input pointers
 *   rcx  pointer to 32 bytes of state words (loaded as two 16-byte halves)
 *   rdx  byte offset into every input; a 64-byte block is read from
 *        [ptr+rdx-0x40] .. [ptr+rdx-0x01]
 *   rbx  output pointer, loaded from [rbp+0x50]
 *   bytes at [rbp+0x38], [rbp+0x40], [rbp+0x48] are loaded into r13d,
 *   eax and r12d -- presumably flag arguments passed on the stack;
 *   TODO confirm against the C prototype.
 *
 * Stack scratch, relative to the 64-byte aligned rsp (4-input path):
 *   [rsp+0x000 .. +0x0FF]  sixteen message vectors (slot i at 0x10*i)
 *   [rsp+0x100]            save slot for xmm8
 *   [rsp+0x110], [+0x120]  per-lane vectors that are advanced after each
 *                          batch -- presumably the low/high halves of the
 *                          chunk counter; TODO confirm
 *   [rsp+0x130]            broadcast value, read back as a dword later
 *   [rsp+0x150]            per-batch increment added to [rsp+0x110]
 */
37 ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
/* Clear the low six bits of rsp: the scratch area is 64-byte aligned, so
 * the movdqa/movaps accesses to it below are aligned. */
47 and rsp, 0xFFFFFFFFFFFFFFC0
/* Broadcast dword 0 of xmm0 to all four lanes and keep it at [rsp+0x130]. */
50 pshufd xmm0, xmm0, 0x00
51 movdqa xmmword ptr [rsp+0x130], xmm0
/* Mask the broadcast value with the ADD0 / ADD1 constants; the ADD1 result
 * becomes the per-batch increment stored at [rsp+0x150]. */
53 pand xmm1, xmmword ptr [ADD0+rip]
54 pand xmm0, xmmword ptr [ADD1+rip]
55 movdqa xmmword ptr [rsp+0x150], xmm0
/* Broadcast and store the vector kept at [rsp+0x110]. */
57 pshufd xmm0, xmm0, 0x00
59 movdqa xmmword ptr [rsp+0x110], xmm0
/* XOR both operands with CMP_MSB_MASK -- presumably to flip the sign bits
 * so that a signed compare behaves as an unsigned one; the compare itself
 * is not among the lines shown here.  TODO confirm. */
60 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
61 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
/* Broadcast and store the vector kept at [rsp+0x120]. */
65 pshufd xmm2, xmm2, 0x00
67 movdqa xmmword ptr [rsp+0x120], xmm2
/* rbx = output pointer; r13d / r12d = bytes read from the caller's frame. */
68 mov rbx, qword ptr [rbp+0x50]
71 movzx r13d, byte ptr [rbp+0x38]
72 movzx r12d, byte ptr [rbp+0x48]
/* Load the 32 bytes at rcx (unaligned) and broadcast each of the eight
 * dwords into its own register: xmm0..xmm3 from the first 16 bytes,
 * xmm4..xmm7 from the second.  Each lane then belongs to one input. */
76 movdqu xmm3, xmmword ptr [rcx]
77 pshufd xmm0, xmm3, 0x00
78 pshufd xmm1, xmm3, 0x55
79 pshufd xmm2, xmm3, 0xAA
80 pshufd xmm3, xmm3, 0xFF
81 movdqu xmm7, xmmword ptr [rcx+0x10]
82 pshufd xmm4, xmm7, 0x00
83 pshufd xmm5, xmm7, 0x55
84 pshufd xmm6, xmm7, 0xAA
85 pshufd xmm7, xmm7, 0xFF
/* r8..r11 = the four input pointers; eax = byte from [rbp+0x40]. */
86 mov r8, qword ptr [rdi]
87 mov r9, qword ptr [rdi+0x8]
88 mov r10, qword ptr [rdi+0x10]
89 mov r11, qword ptr [rdi+0x18]
90 movzx eax, byte ptr [rbp+0x40]
/* Bytes 0x00-0x0F of the current block of each input.  The dword and qword
 * interleaves regroup the data across inputs (4x4 transpose pattern; the
 * register copies belonging to it are not shown here), and the results go
 * to message slots 0-3. */
99 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
100 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
101 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
102 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
105 punpckhdq xmm12, xmm9
107 punpckldq xmm10, xmm11
108 punpckhdq xmm14, xmm11
110 punpcklqdq xmm8, xmm10
111 punpckhqdq xmm9, xmm10
113 punpcklqdq xmm12, xmm14
114 punpckhqdq xmm13, xmm14
115 movdqa xmmword ptr [rsp], xmm8
116 movdqa xmmword ptr [rsp+0x10], xmm9
117 movdqa xmmword ptr [rsp+0x20], xmm12
118 movdqa xmmword ptr [rsp+0x30], xmm13
/* Bytes 0x10-0x1F of each block -> message slots 4-7. */
119 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
120 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
121 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
122 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
125 punpckhdq xmm12, xmm9
127 punpckldq xmm10, xmm11
128 punpckhdq xmm14, xmm11
130 punpcklqdq xmm8, xmm10
131 punpckhqdq xmm9, xmm10
133 punpcklqdq xmm12, xmm14
134 punpckhqdq xmm13, xmm14
135 movdqa xmmword ptr [rsp+0x40], xmm8
136 movdqa xmmword ptr [rsp+0x50], xmm9
137 movdqa xmmword ptr [rsp+0x60], xmm12
138 movdqa xmmword ptr [rsp+0x70], xmm13
/* Bytes 0x20-0x2F of each block -> message slots 8-11. */
139 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
140 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
141 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
142 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
145 punpckhdq xmm12, xmm9
147 punpckldq xmm10, xmm11
148 punpckhdq xmm14, xmm11
150 punpcklqdq xmm8, xmm10
151 punpckhqdq xmm9, xmm10
153 punpcklqdq xmm12, xmm14
154 punpckhqdq xmm13, xmm14
155 movdqa xmmword ptr [rsp+0x80], xmm8
156 movdqa xmmword ptr [rsp+0x90], xmm9
157 movdqa xmmword ptr [rsp+0xA0], xmm12
158 movdqa xmmword ptr [rsp+0xB0], xmm13
/* Bytes 0x30-0x3F of each block -> message slots 12-15. */
159 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
160 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
161 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
162 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
165 punpckhdq xmm12, xmm9
167 punpckldq xmm10, xmm11
168 punpckhdq xmm14, xmm11
170 punpcklqdq xmm8, xmm10
171 punpckhqdq xmm9, xmm10
173 punpcklqdq xmm12, xmm14
174 punpckhqdq xmm13, xmm14
175 movdqa xmmword ptr [rsp+0xC0], xmm8
176 movdqa xmmword ptr [rsp+0xD0], xmm9
177 movdqa xmmword ptr [rsp+0xE0], xmm12
178 movdqa xmmword ptr [rsp+0xF0], xmm13
/* Fill the remaining state registers: xmm9..xmm11 from the broadcast IV
 * constants, xmm12 / xmm13 from the vectors at [rsp+0x110] / [rsp+0x120],
 * xmm14 from BLAKE3_BLOCK_LEN, and xmm15 as a broadcast of its own dword 0
 * (set up by a line not shown here). */
179 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
180 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
181 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
182 movdqa xmm12, xmmword ptr [rsp+0x110]
183 movdqa xmm13, xmmword ptr [rsp+0x120]
184 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
186 pshufd xmm15, xmm15, 0x00
/* Prefetch each input 0x80 bytes past the current offset. */
187 prefetcht0 [r8+rdx+0x80]
188 prefetcht0 [r9+rdx+0x80]
189 prefetcht0 [r10+rdx+0x80]
190 prefetcht0 [r11+rdx+0x80]
/*
 * Round 1 of 7 (the visible code contains 7 x 4 groups of message adds;
 * the add/xor/rotate steps between them are largely not shown here).
 * Message slots (offset / 0x10) added to xmm0..xmm3 per group:
 *   0,2,4,6   1,3,5,7   8,10,12,14   9,11,13,15
 */
191 paddd xmm0, xmmword ptr [rsp]
192 paddd xmm1, xmmword ptr [rsp+0x20]
193 paddd xmm2, xmmword ptr [rsp+0x40]
194 paddd xmm3, xmmword ptr [rsp+0x60]
/* pshuflw + pshufhw with 0xB1 swap the two 16-bit halves of every 32-bit
 * lane, i.e. rotate each lane by 16 bits.  This idiom recurs below. */
203 pshuflw xmm12, xmm12, 0xB1
204 pshufhw xmm12, xmm12, 0xB1
205 pshuflw xmm13, xmm13, 0xB1
206 pshufhw xmm13, xmm13, 0xB1
207 pshuflw xmm14, xmm14, 0xB1
208 pshufhw xmm14, xmm14, 0xB1
209 pshuflw xmm15, xmm15, 0xB1
210 pshufhw xmm15, xmm15, 0xB1
211 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
/* xmm8 is saved to [rsp+0x100] and reloaded from it repeatedly below --
 * presumably because xmm8 doubles as a temporary in the steps between
 * (not shown here).  TODO confirm. */
220 movdqa xmmword ptr [rsp+0x100], xmm8
237 paddd xmm0, xmmword ptr [rsp+0x10]
238 paddd xmm1, xmmword ptr [rsp+0x30]
239 paddd xmm2, xmmword ptr [rsp+0x50]
240 paddd xmm3, xmmword ptr [rsp+0x70]
265 movdqa xmm8, xmmword ptr [rsp+0x100]
274 movdqa xmmword ptr [rsp+0x100], xmm8
291 paddd xmm0, xmmword ptr [rsp+0x80]
292 paddd xmm1, xmmword ptr [rsp+0xA0]
293 paddd xmm2, xmmword ptr [rsp+0xC0]
294 paddd xmm3, xmmword ptr [rsp+0xE0]
/* Second half of the round: same 16-bit rotate, but the registers are
 * visited in the rotated order xmm15, xmm12, xmm13, xmm14. */
303 pshuflw xmm15, xmm15, 0xB1
304 pshufhw xmm15, xmm15, 0xB1
305 pshuflw xmm12, xmm12, 0xB1
306 pshufhw xmm12, xmm12, 0xB1
307 pshuflw xmm13, xmm13, 0xB1
308 pshufhw xmm13, xmm13, 0xB1
309 pshuflw xmm14, xmm14, 0xB1
310 pshufhw xmm14, xmm14, 0xB1
313 movdqa xmm8, xmmword ptr [rsp+0x100]
320 movdqa xmmword ptr [rsp+0x100], xmm8
337 paddd xmm0, xmmword ptr [rsp+0x90]
338 paddd xmm1, xmmword ptr [rsp+0xB0]
339 paddd xmm2, xmmword ptr [rsp+0xD0]
340 paddd xmm3, xmmword ptr [rsp+0xF0]
367 movdqa xmm8, xmmword ptr [rsp+0x100]
374 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 2.  Message slots per group:
 *   2,3,7,4   6,10,0,13   1,12,9,15   11,5,14,8
 */
391 paddd xmm0, xmmword ptr [rsp+0x20]
392 paddd xmm1, xmmword ptr [rsp+0x30]
393 paddd xmm2, xmmword ptr [rsp+0x70]
394 paddd xmm3, xmmword ptr [rsp+0x40]
403 pshuflw xmm12, xmm12, 0xB1
404 pshufhw xmm12, xmm12, 0xB1
405 pshuflw xmm13, xmm13, 0xB1
406 pshufhw xmm13, xmm13, 0xB1
407 pshuflw xmm14, xmm14, 0xB1
408 pshufhw xmm14, xmm14, 0xB1
409 pshuflw xmm15, xmm15, 0xB1
410 pshufhw xmm15, xmm15, 0xB1
411 movdqa xmm8, xmmword ptr [rsp+0x100]
420 movdqa xmmword ptr [rsp+0x100], xmm8
437 paddd xmm0, xmmword ptr [rsp+0x60]
438 paddd xmm1, xmmword ptr [rsp+0xA0]
439 paddd xmm2, xmmword ptr [rsp]
440 paddd xmm3, xmmword ptr [rsp+0xD0]
465 movdqa xmm8, xmmword ptr [rsp+0x100]
474 movdqa xmmword ptr [rsp+0x100], xmm8
491 paddd xmm0, xmmword ptr [rsp+0x10]
492 paddd xmm1, xmmword ptr [rsp+0xC0]
493 paddd xmm2, xmmword ptr [rsp+0x90]
494 paddd xmm3, xmmword ptr [rsp+0xF0]
503 pshuflw xmm15, xmm15, 0xB1
504 pshufhw xmm15, xmm15, 0xB1
505 pshuflw xmm12, xmm12, 0xB1
506 pshufhw xmm12, xmm12, 0xB1
507 pshuflw xmm13, xmm13, 0xB1
508 pshufhw xmm13, xmm13, 0xB1
509 pshuflw xmm14, xmm14, 0xB1
510 pshufhw xmm14, xmm14, 0xB1
513 movdqa xmm8, xmmword ptr [rsp+0x100]
520 movdqa xmmword ptr [rsp+0x100], xmm8
537 paddd xmm0, xmmword ptr [rsp+0xB0]
538 paddd xmm1, xmmword ptr [rsp+0x50]
539 paddd xmm2, xmmword ptr [rsp+0xE0]
540 paddd xmm3, xmmword ptr [rsp+0x80]
567 movdqa xmm8, xmmword ptr [rsp+0x100]
574 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 3.  Message slots per group:
 *   3,10,13,7   4,12,2,14   6,9,11,8   5,0,15,1
 */
591 paddd xmm0, xmmword ptr [rsp+0x30]
592 paddd xmm1, xmmword ptr [rsp+0xA0]
593 paddd xmm2, xmmword ptr [rsp+0xD0]
594 paddd xmm3, xmmword ptr [rsp+0x70]
603 pshuflw xmm12, xmm12, 0xB1
604 pshufhw xmm12, xmm12, 0xB1
605 pshuflw xmm13, xmm13, 0xB1
606 pshufhw xmm13, xmm13, 0xB1
607 pshuflw xmm14, xmm14, 0xB1
608 pshufhw xmm14, xmm14, 0xB1
609 pshuflw xmm15, xmm15, 0xB1
610 pshufhw xmm15, xmm15, 0xB1
611 movdqa xmm8, xmmword ptr [rsp+0x100]
620 movdqa xmmword ptr [rsp+0x100], xmm8
637 paddd xmm0, xmmword ptr [rsp+0x40]
638 paddd xmm1, xmmword ptr [rsp+0xC0]
639 paddd xmm2, xmmword ptr [rsp+0x20]
640 paddd xmm3, xmmword ptr [rsp+0xE0]
665 movdqa xmm8, xmmword ptr [rsp+0x100]
674 movdqa xmmword ptr [rsp+0x100], xmm8
691 paddd xmm0, xmmword ptr [rsp+0x60]
692 paddd xmm1, xmmword ptr [rsp+0x90]
693 paddd xmm2, xmmword ptr [rsp+0xB0]
694 paddd xmm3, xmmword ptr [rsp+0x80]
703 pshuflw xmm15, xmm15, 0xB1
704 pshufhw xmm15, xmm15, 0xB1
705 pshuflw xmm12, xmm12, 0xB1
706 pshufhw xmm12, xmm12, 0xB1
707 pshuflw xmm13, xmm13, 0xB1
708 pshufhw xmm13, xmm13, 0xB1
709 pshuflw xmm14, xmm14, 0xB1
710 pshufhw xmm14, xmm14, 0xB1
713 movdqa xmm8, xmmword ptr [rsp+0x100]
720 movdqa xmmword ptr [rsp+0x100], xmm8
737 paddd xmm0, xmmword ptr [rsp+0x50]
738 paddd xmm1, xmmword ptr [rsp]
739 paddd xmm2, xmmword ptr [rsp+0xF0]
740 paddd xmm3, xmmword ptr [rsp+0x10]
767 movdqa xmm8, xmmword ptr [rsp+0x100]
774 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 4.  Message slots per group:
 *   10,12,14,13   7,9,3,15   4,11,5,1   0,2,8,6
 */
791 paddd xmm0, xmmword ptr [rsp+0xA0]
792 paddd xmm1, xmmword ptr [rsp+0xC0]
793 paddd xmm2, xmmword ptr [rsp+0xE0]
794 paddd xmm3, xmmword ptr [rsp+0xD0]
803 pshuflw xmm12, xmm12, 0xB1
804 pshufhw xmm12, xmm12, 0xB1
805 pshuflw xmm13, xmm13, 0xB1
806 pshufhw xmm13, xmm13, 0xB1
807 pshuflw xmm14, xmm14, 0xB1
808 pshufhw xmm14, xmm14, 0xB1
809 pshuflw xmm15, xmm15, 0xB1
810 pshufhw xmm15, xmm15, 0xB1
811 movdqa xmm8, xmmword ptr [rsp+0x100]
820 movdqa xmmword ptr [rsp+0x100], xmm8
837 paddd xmm0, xmmword ptr [rsp+0x70]
838 paddd xmm1, xmmword ptr [rsp+0x90]
839 paddd xmm2, xmmword ptr [rsp+0x30]
840 paddd xmm3, xmmword ptr [rsp+0xF0]
865 movdqa xmm8, xmmword ptr [rsp+0x100]
874 movdqa xmmword ptr [rsp+0x100], xmm8
891 paddd xmm0, xmmword ptr [rsp+0x40]
892 paddd xmm1, xmmword ptr [rsp+0xB0]
893 paddd xmm2, xmmword ptr [rsp+0x50]
894 paddd xmm3, xmmword ptr [rsp+0x10]
903 pshuflw xmm15, xmm15, 0xB1
904 pshufhw xmm15, xmm15, 0xB1
905 pshuflw xmm12, xmm12, 0xB1
906 pshufhw xmm12, xmm12, 0xB1
907 pshuflw xmm13, xmm13, 0xB1
908 pshufhw xmm13, xmm13, 0xB1
909 pshuflw xmm14, xmm14, 0xB1
910 pshufhw xmm14, xmm14, 0xB1
913 movdqa xmm8, xmmword ptr [rsp+0x100]
920 movdqa xmmword ptr [rsp+0x100], xmm8
937 paddd xmm0, xmmword ptr [rsp]
938 paddd xmm1, xmmword ptr [rsp+0x20]
939 paddd xmm2, xmmword ptr [rsp+0x80]
940 paddd xmm3, xmmword ptr [rsp+0x60]
967 movdqa xmm8, xmmword ptr [rsp+0x100]
974 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 5.  Message slots per group:
 *   12,9,15,14   13,11,10,8   7,5,0,6   2,3,1,4
 */
991 paddd xmm0, xmmword ptr [rsp+0xC0]
992 paddd xmm1, xmmword ptr [rsp+0x90]
993 paddd xmm2, xmmword ptr [rsp+0xF0]
994 paddd xmm3, xmmword ptr [rsp+0xE0]
1003 pshuflw xmm12, xmm12, 0xB1
1004 pshufhw xmm12, xmm12, 0xB1
1005 pshuflw xmm13, xmm13, 0xB1
1006 pshufhw xmm13, xmm13, 0xB1
1007 pshuflw xmm14, xmm14, 0xB1
1008 pshufhw xmm14, xmm14, 0xB1
1009 pshuflw xmm15, xmm15, 0xB1
1010 pshufhw xmm15, xmm15, 0xB1
1011 movdqa xmm8, xmmword ptr [rsp+0x100]
1020 movdqa xmmword ptr [rsp+0x100], xmm8
1037 paddd xmm0, xmmword ptr [rsp+0xD0]
1038 paddd xmm1, xmmword ptr [rsp+0xB0]
1039 paddd xmm2, xmmword ptr [rsp+0xA0]
1040 paddd xmm3, xmmword ptr [rsp+0x80]
1065 movdqa xmm8, xmmword ptr [rsp+0x100]
1074 movdqa xmmword ptr [rsp+0x100], xmm8
1091 paddd xmm0, xmmword ptr [rsp+0x70]
1092 paddd xmm1, xmmword ptr [rsp+0x50]
1093 paddd xmm2, xmmword ptr [rsp]
1094 paddd xmm3, xmmword ptr [rsp+0x60]
1103 pshuflw xmm15, xmm15, 0xB1
1104 pshufhw xmm15, xmm15, 0xB1
1105 pshuflw xmm12, xmm12, 0xB1
1106 pshufhw xmm12, xmm12, 0xB1
1107 pshuflw xmm13, xmm13, 0xB1
1108 pshufhw xmm13, xmm13, 0xB1
1109 pshuflw xmm14, xmm14, 0xB1
1110 pshufhw xmm14, xmm14, 0xB1
1113 movdqa xmm8, xmmword ptr [rsp+0x100]
1120 movdqa xmmword ptr [rsp+0x100], xmm8
1137 paddd xmm0, xmmword ptr [rsp+0x20]
1138 paddd xmm1, xmmword ptr [rsp+0x30]
1139 paddd xmm2, xmmword ptr [rsp+0x10]
1140 paddd xmm3, xmmword ptr [rsp+0x40]
1167 movdqa xmm8, xmmword ptr [rsp+0x100]
1174 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 6.  Message slots per group:
 *   9,11,8,15   14,5,12,1   13,0,2,4   3,10,6,7
 */
1191 paddd xmm0, xmmword ptr [rsp+0x90]
1192 paddd xmm1, xmmword ptr [rsp+0xB0]
1193 paddd xmm2, xmmword ptr [rsp+0x80]
1194 paddd xmm3, xmmword ptr [rsp+0xF0]
1203 pshuflw xmm12, xmm12, 0xB1
1204 pshufhw xmm12, xmm12, 0xB1
1205 pshuflw xmm13, xmm13, 0xB1
1206 pshufhw xmm13, xmm13, 0xB1
1207 pshuflw xmm14, xmm14, 0xB1
1208 pshufhw xmm14, xmm14, 0xB1
1209 pshuflw xmm15, xmm15, 0xB1
1210 pshufhw xmm15, xmm15, 0xB1
1211 movdqa xmm8, xmmword ptr [rsp+0x100]
1220 movdqa xmmword ptr [rsp+0x100], xmm8
1237 paddd xmm0, xmmword ptr [rsp+0xE0]
1238 paddd xmm1, xmmword ptr [rsp+0x50]
1239 paddd xmm2, xmmword ptr [rsp+0xC0]
1240 paddd xmm3, xmmword ptr [rsp+0x10]
1265 movdqa xmm8, xmmword ptr [rsp+0x100]
1274 movdqa xmmword ptr [rsp+0x100], xmm8
1291 paddd xmm0, xmmword ptr [rsp+0xD0]
1292 paddd xmm1, xmmword ptr [rsp]
1293 paddd xmm2, xmmword ptr [rsp+0x20]
1294 paddd xmm3, xmmword ptr [rsp+0x40]
1303 pshuflw xmm15, xmm15, 0xB1
1304 pshufhw xmm15, xmm15, 0xB1
1305 pshuflw xmm12, xmm12, 0xB1
1306 pshufhw xmm12, xmm12, 0xB1
1307 pshuflw xmm13, xmm13, 0xB1
1308 pshufhw xmm13, xmm13, 0xB1
1309 pshuflw xmm14, xmm14, 0xB1
1310 pshufhw xmm14, xmm14, 0xB1
1313 movdqa xmm8, xmmword ptr [rsp+0x100]
1320 movdqa xmmword ptr [rsp+0x100], xmm8
1337 paddd xmm0, xmmword ptr [rsp+0x30]
1338 paddd xmm1, xmmword ptr [rsp+0xA0]
1339 paddd xmm2, xmmword ptr [rsp+0x60]
1340 paddd xmm3, xmmword ptr [rsp+0x70]
1367 movdqa xmm8, xmmword ptr [rsp+0x100]
1374 movdqa xmmword ptr [rsp+0x100], xmm8
/*
 * Round 7.  Message slots per group:
 *   11,5,1,8   15,0,9,6   14,2,3,7   10,12,4,13
 */
1391 paddd xmm0, xmmword ptr [rsp+0xB0]
1392 paddd xmm1, xmmword ptr [rsp+0x50]
1393 paddd xmm2, xmmword ptr [rsp+0x10]
1394 paddd xmm3, xmmword ptr [rsp+0x80]
1403 pshuflw xmm12, xmm12, 0xB1
1404 pshufhw xmm12, xmm12, 0xB1
1405 pshuflw xmm13, xmm13, 0xB1
1406 pshufhw xmm13, xmm13, 0xB1
1407 pshuflw xmm14, xmm14, 0xB1
1408 pshufhw xmm14, xmm14, 0xB1
1409 pshuflw xmm15, xmm15, 0xB1
1410 pshufhw xmm15, xmm15, 0xB1
1411 movdqa xmm8, xmmword ptr [rsp+0x100]
1420 movdqa xmmword ptr [rsp+0x100], xmm8
1437 paddd xmm0, xmmword ptr [rsp+0xF0]
1438 paddd xmm1, xmmword ptr [rsp]
1439 paddd xmm2, xmmword ptr [rsp+0x90]
1440 paddd xmm3, xmmword ptr [rsp+0x60]
1465 movdqa xmm8, xmmword ptr [rsp+0x100]
1474 movdqa xmmword ptr [rsp+0x100], xmm8
1491 paddd xmm0, xmmword ptr [rsp+0xE0]
1492 paddd xmm1, xmmword ptr [rsp+0x20]
1493 paddd xmm2, xmmword ptr [rsp+0x30]
1494 paddd xmm3, xmmword ptr [rsp+0x70]
1503 pshuflw xmm15, xmm15, 0xB1
1504 pshufhw xmm15, xmm15, 0xB1
1505 pshuflw xmm12, xmm12, 0xB1
1506 pshufhw xmm12, xmm12, 0xB1
1507 pshuflw xmm13, xmm13, 0xB1
1508 pshufhw xmm13, xmm13, 0xB1
1509 pshuflw xmm14, xmm14, 0xB1
1510 pshufhw xmm14, xmm14, 0xB1
1513 movdqa xmm8, xmmword ptr [rsp+0x100]
1520 movdqa xmmword ptr [rsp+0x100], xmm8
1537 paddd xmm0, xmmword ptr [rsp+0xA0]
1538 paddd xmm1, xmmword ptr [rsp+0xC0]
1539 paddd xmm2, xmmword ptr [rsp+0x40]
1540 paddd xmm3, xmmword ptr [rsp+0xD0]
1567 movdqa xmm8, xmmword ptr [rsp+0x100]
/* Regroup xmm0..xmm3 with dword/qword interleaves (the reverse of the
 * input regrouping) and store them as the first 16 bytes of each of the
 * four 32-byte outputs: rbx+0x00, +0x20, +0x40, +0x60. */
1601 punpckldq xmm0, xmm1
1602 punpckhdq xmm9, xmm1
1604 punpckldq xmm2, xmm3
1605 punpckhdq xmm11, xmm3
1607 punpcklqdq xmm0, xmm2
1608 punpckhqdq xmm1, xmm2
1610 punpcklqdq xmm9, xmm11
1611 punpckhqdq xmm3, xmm11
1612 movdqu xmmword ptr [rbx], xmm0
1613 movdqu xmmword ptr [rbx+0x20], xmm1
1614 movdqu xmmword ptr [rbx+0x40], xmm9
1615 movdqu xmmword ptr [rbx+0x60], xmm3
/* Same for xmm4..xmm7, stored as the second 16 bytes of each output:
 * rbx+0x10, +0x30, +0x50, +0x70. */
1617 punpckldq xmm4, xmm5
1618 punpckhdq xmm9, xmm5
1620 punpckldq xmm6, xmm7
1621 punpckhdq xmm11, xmm7
1623 punpcklqdq xmm4, xmm6
1624 punpckhqdq xmm5, xmm6
1626 punpcklqdq xmm9, xmm11
1627 punpckhqdq xmm7, xmm11
1628 movdqu xmmword ptr [rbx+0x10], xmm4
1629 movdqu xmmword ptr [rbx+0x30], xmm5
1630 movdqu xmmword ptr [rbx+0x50], xmm9
1631 movdqu xmmword ptr [rbx+0x70], xmm7
/* Advance the vector at [rsp+0x110] by the increment saved at [rsp+0x150].
 * The CMP_MSB_MASK XORs and the rewrite of [rsp+0x120] presumably detect a
 * 32-bit wrap and carry it into the second vector; the compare and
 * subtract steps are not shown here.  TODO confirm. */
1632 movdqa xmm1, xmmword ptr [rsp+0x110]
1634 paddd xmm1, xmmword ptr [rsp+0x150]
1635 movdqa xmmword ptr [rsp+0x110], xmm1
1636 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1637 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1639 movdqa xmm1, xmmword ptr [rsp+0x120]
1641 movdqa xmmword ptr [rsp+0x120], xmm1
/*
 * Two-input path.  Each input keeps its state in four row registers
 * (xmm0..xmm3 for the first input, xmm8..xmm11 for the second).
 * Start from the 32 state bytes at rcx.
 */
1662 movups xmm0, xmmword ptr [rcx]
1663 movups xmm1, xmmword ptr [rcx+0x10]
/* Pair lane 0 of [rsp+0x110] with lane 0 of [rsp+0x120] and park the pair
 * at [rsp]; do the same for lane 1 and park it at [rsp+0x10]. */
1666 movd xmm13, dword ptr [rsp+0x110]
1667 movd xmm14, dword ptr [rsp+0x120]
1668 punpckldq xmm13, xmm14
1669 movaps xmmword ptr [rsp], xmm13
1670 movd xmm14, dword ptr [rsp+0x114]
1671 movd xmm13, dword ptr [rsp+0x124]
1672 punpckldq xmm14, xmm13
1673 movaps xmmword ptr [rsp+0x10], xmm14
/* r8 / r9 = the two input pointers; eax = byte from [rbp+0x40]. */
1674 mov r8, qword ptr [rdi]
1675 mov r9, qword ptr [rdi+0x8]
1676 movzx eax, byte ptr [rbp+0x40]
1685 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Load the 64-byte block of the first input.  shufps 136 (0x88) gathers
 * the even-indexed dwords of its two operands, 221 (0xDD) the odd-indexed
 * ones; pshufd 0x93 rotates the four lanes by one position. */
1687 movups xmm4, xmmword ptr [r8+rdx-0x40]
1688 movups xmm5, xmmword ptr [r8+rdx-0x30]
1690 shufps xmm4, xmm5, 136
1691 shufps xmm3, xmm5, 221
1693 movups xmm6, xmmword ptr [r8+rdx-0x20]
1694 movups xmm7, xmmword ptr [r8+rdx-0x10]
1696 shufps xmm6, xmm7, 136
1697 pshufd xmm6, xmm6, 0x93
1698 shufps xmm3, xmm7, 221
1699 pshufd xmm7, xmm3, 0x93
/* Same split for the block of the second input, into xmm12..xmm15. */
1700 movups xmm12, xmmword ptr [r9+rdx-0x40]
1701 movups xmm13, xmmword ptr [r9+rdx-0x30]
1703 shufps xmm12, xmm13, 136
1704 shufps xmm11, xmm13, 221
1706 movups xmm14, xmmword ptr [r9+rdx-0x20]
1707 movups xmm15, xmmword ptr [r9+rdx-0x10]
1709 shufps xmm14, xmm15, 136
1710 pshufd xmm14, xmm14, 0x93
1711 shufps xmm11, xmm15, 221
1712 pshufd xmm15, xmm11, 0x93
/* Build the last state row of both inputs: the low qword is the pair
 * parked at [rsp] / [rsp+0x10], the high qword is the low qword of the
 * xmm3 value saved at [rsp+0x20]. */
1716 movdqa xmmword ptr [rsp+0x20], xmm3
1717 movaps xmm3, xmmword ptr [rsp]
1718 movaps xmm11, xmmword ptr [rsp+0x10]
1719 punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1720 punpcklqdq xmm11, xmmword ptr [rsp+0x20]
/* Park message vectors on the stack; they are reloaded from these slots
 * further down. */
1725 movaps xmmword ptr [rsp+0x20], xmm4
1726 movaps xmmword ptr [rsp+0x30], xmm12
/* Rotate every 32-bit lane of xmm3 / xmm11 by 16 bits. */
1731 pshuflw xmm3, xmm3, 0xB1
1732 pshufhw xmm3, xmm3, 0xB1
1733 pshuflw xmm11, xmm11, 0xB1
1734 pshufhw xmm11, xmm11, 0xB1
1749 movaps xmmword ptr [rsp+0x40], xmm5
1750 movaps xmmword ptr [rsp+0x50], xmm13
/* Rotate the lanes of three state rows of each input by one (0x93), two
 * (0x4E) and three (0x39) positions. */
1775 pshufd xmm0, xmm0, 0x93
1776 pshufd xmm8, xmm8, 0x93
1777 pshufd xmm3, xmm3, 0x4E
1778 pshufd xmm11, xmm11, 0x4E
1779 pshufd xmm2, xmm2, 0x39
1780 pshufd xmm10, xmm10, 0x39
1787 pshuflw xmm3, xmm3, 0xB1
1788 pshufhw xmm3, xmm3, 0xB1
1789 pshuflw xmm11, xmm11, 0xB1
1790 pshufhw xmm11, xmm11, 0xB1
/* Inverse lane rotations (0x39 / 0x4E / 0x93), restoring the row layout. */
1829 pshufd xmm0, xmm0, 0x39
1830 pshufd xmm8, xmm8, 0x39
1831 pshufd xmm3, xmm3, 0x4E
1832 pshufd xmm11, xmm11, 0x4E
1833 pshufd xmm2, xmm2, 0x93
1834 pshufd xmm10, xmm10, 0x93
/* Reshuffle the first input's message vectors (xmm4..xmm7 and the copies
 * parked at [rsp+0x20] / [rsp+0x40]) -- presumably the message permutation
 * for the next round.  The pand pairs with complementary PBLENDW_* masks
 * presumably emulate the SSE4.1 pblendw instruction; the combining steps
 * are not shown here.  TODO confirm. */
1837 movdqa xmm12, xmmword ptr [rsp+0x20]
1838 movdqa xmm5, xmmword ptr [rsp+0x40]
1839 pshufd xmm13, xmm12, 0x0F
1840 shufps xmm12, xmm5, 214
1841 pshufd xmm4, xmm12, 0x39
1843 shufps xmm12, xmm7, 250
1844 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1845 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1847 movdqa xmmword ptr [rsp+0x20], xmm13
1849 punpcklqdq xmm12, xmm5
1851 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1852 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1854 pshufd xmm12, xmm12, 0x78
1855 punpckhdq xmm5, xmm7
1856 punpckldq xmm6, xmm5
1857 pshufd xmm7, xmm6, 0x1E
1858 movdqa xmmword ptr [rsp+0x40], xmm12
/* Same reshuffle for the second input (copies at [rsp+0x30] / [rsp+0x50],
 * registers xmm12..xmm15).  xmm2 is saved to [rsp+0x30] and restored
 * around its use as a scratch register. */
1859 movdqa xmm5, xmmword ptr [rsp+0x30]
1860 movdqa xmm13, xmmword ptr [rsp+0x50]
1861 pshufd xmm6, xmm5, 0x0F
1862 shufps xmm5, xmm13, 214
1863 pshufd xmm12, xmm5, 0x39
1865 shufps xmm5, xmm15, 250
1866 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1867 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1870 punpcklqdq xmm5, xmm13
1871 movdqa xmmword ptr [rsp+0x30], xmm2
1873 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1874 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1876 movdqa xmm2, xmmword ptr [rsp+0x30]
1877 pshufd xmm5, xmm5, 0x78
1878 punpckhdq xmm13, xmm15
1879 punpckldq xmm14, xmm13
1880 pshufd xmm15, xmm14, 0x1E
1883 movdqa xmm5, xmmword ptr [rsp+0x20]
1884 movdqa xmm6, xmmword ptr [rsp+0x40]
/* Store 32 bytes per input: xmm0/xmm1 for the first, xmm8/xmm9 for the
 * second. */
1894 movups xmmword ptr [rbx], xmm0
1895 movups xmmword ptr [rbx+0x10], xmm1
1896 movups xmmword ptr [rbx+0x20], xmm8
1897 movups xmmword ptr [rbx+0x30], xmm9
/* Reload the dword saved at [rsp+0x130] and use eax as a scaled index
 * (8*rax) to pick the dwords of [rsp+0x110] / [rsp+0x120] that become the
 * new lane 0 of those vectors. */
1898 mov eax, dword ptr [rsp+0x130]
1900 mov r10d, dword ptr [rsp+0x110+8*rax]
1901 mov r11d, dword ptr [rsp+0x120+8*rax]
1902 mov dword ptr [rsp+0x110], r10d
1903 mov dword ptr [rsp+0x120], r11d
/*
 * Single-input path: state rows in xmm0..xmm3, message in xmm4..xmm7.
 */
1910 movups xmm0, xmmword ptr [rcx]
1911 movups xmm1, xmmword ptr [rcx+0x10]
/* Pair lane 0 of [rsp+0x110] with lane 0 of [rsp+0x120]. */
1912 movd xmm13, dword ptr [rsp+0x110]
1913 movd xmm14, dword ptr [rsp+0x120]
1914 punpckldq xmm13, xmm14
1915 mov r8, qword ptr [rdi]
1916 movzx eax, byte ptr [rbp+0x40]
1925 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1930 punpcklqdq xmm3, xmm12
/* Load the 64-byte block and split it into even-indexed (shufps 136) and
 * odd-indexed (shufps 221) dwords; pshufd 0x93 rotates lanes by one. */
1931 movups xmm4, xmmword ptr [r8+rdx-0x40]
1932 movups xmm5, xmmword ptr [r8+rdx-0x30]
1934 shufps xmm4, xmm5, 136
1935 shufps xmm8, xmm5, 221
1937 movups xmm6, xmmword ptr [r8+rdx-0x20]
1938 movups xmm7, xmmword ptr [r8+rdx-0x10]
1940 shufps xmm6, xmm7, 136
1941 pshufd xmm6, xmm6, 0x93
1942 shufps xmm8, xmm7, 221
1943 pshufd xmm7, xmm8, 0x93
/* Rotate every 32-bit lane of xmm3 by 16 bits. */
1949 pshuflw xmm3, xmm3, 0xB1
1950 pshufhw xmm3, xmm3, 0xB1
/* Rotate the lanes of three state rows by one, two and three positions. */
1970 pshufd xmm0, xmm0, 0x93
1971 pshufd xmm3, xmm3, 0x4E
1972 pshufd xmm2, xmm2, 0x39
1976 pshuflw xmm3, xmm3, 0xB1
1977 pshufhw xmm3, xmm3, 0xB1
/* Inverse lane rotations, restoring the row layout. */
1997 pshufd xmm0, xmm0, 0x39
1998 pshufd xmm3, xmm3, 0x4E
1999 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message vectors xmm4..xmm7 -- presumably the message
 * permutation for the next round; pand with complementary PBLENDW_* masks
 * presumably emulates pblendw.  TODO confirm. */
2003 shufps xmm8, xmm5, 214
2004 pshufd xmm9, xmm4, 0x0F
2005 pshufd xmm4, xmm8, 0x39
2007 shufps xmm8, xmm7, 250
2008 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2009 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2012 punpcklqdq xmm8, xmm5
2014 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2015 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2017 pshufd xmm8, xmm8, 0x78
2018 punpckhdq xmm5, xmm7
2019 punpckldq xmm6, xmm5
2020 pshufd xmm7, xmm6, 0x1E
/* Store the 32-byte result for the single input. */
2030 movups xmmword ptr [rbx], xmm0
2031 movups xmmword ptr [rbx+0x10], xmm1
2033 SET_SIZE(zfs_blake3_hash_many_sse2)
/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Compresses one 64-byte block into a 32-byte state, in place.
 *   rdi  pointer to the 32-byte state; it is read at entry and the result
 *        is written back to the same location
 *   rsi  pointer to the 64-byte block (read with unaligned loads)
 * Further arguments feed xmm3 / xmm4 through lines not shown here --
 * presumably block length, counter and flags; TODO confirm against the
 * C prototype.
 * State rows live in xmm0..xmm3, message vectors in xmm4..xmm7, and
 * xmm8..xmm10 are scratch.
 */
2035 ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
/* Rows 0 and 1 come from the caller's state, row 2 from BLAKE3_IV. */
2037 movups xmm0, xmmword ptr [rdi]
2038 movups xmm1, xmmword ptr [rdi+0x10]
2039 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Row 3: low qword of xmm3 joined with the low qword of xmm4. */
2044 punpcklqdq xmm3, xmm4
/* Load the block and split it into even-indexed (shufps 136) and
 * odd-indexed (shufps 221) dwords; pshufd 0x93 rotates lanes by one. */
2045 movups xmm4, xmmword ptr [rsi]
2046 movups xmm5, xmmword ptr [rsi+0x10]
2048 shufps xmm4, xmm5, 136
2049 shufps xmm8, xmm5, 221
2051 movups xmm6, xmmword ptr [rsi+0x20]
2052 movups xmm7, xmmword ptr [rsi+0x30]
2054 shufps xmm6, xmm7, 136
2055 pshufd xmm6, xmm6, 0x93
2056 shufps xmm8, xmm7, 221
2057 pshufd xmm7, xmm8, 0x93
/* pshuflw + pshufhw with 0xB1 swap the 16-bit halves of every 32-bit lane,
 * i.e. rotate each lane of xmm3 by 16 bits. */
2063 pshuflw xmm3, xmm3, 0xB1
2064 pshufhw xmm3, xmm3, 0xB1
/* Rotate the lanes of rows 0, 3 and 2 by one, two and three positions. */
2084 pshufd xmm0, xmm0, 0x93
2085 pshufd xmm3, xmm3, 0x4E
2086 pshufd xmm2, xmm2, 0x39
2090 pshuflw xmm3, xmm3, 0xB1
2091 pshufhw xmm3, xmm3, 0xB1
/* Inverse lane rotations, restoring the row layout. */
2111 pshufd xmm0, xmm0, 0x39
2112 pshufd xmm3, xmm3, 0x4E
2113 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message vectors xmm4..xmm7 -- presumably the message
 * permutation for the next round; pand with complementary PBLENDW_* masks
 * presumably emulates the SSE4.1 pblendw instruction.  TODO confirm. */
2117 shufps xmm8, xmm5, 214
2118 pshufd xmm9, xmm4, 0x0F
2119 pshufd xmm4, xmm8, 0x39
2121 shufps xmm8, xmm7, 250
2122 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2123 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2126 punpcklqdq xmm8, xmm5
2128 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2129 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2131 pshufd xmm8, xmm8, 0x78
2132 punpckhdq xmm5, xmm7
2133 punpckldq xmm6, xmm5
2134 pshufd xmm7, xmm6, 0x1E
/* Write the new 32-byte state back over the caller's state. */
2141 movups xmmword ptr [rdi], xmm0
2142 movups xmmword ptr [rdi+0x10], xmm1
2144 SET_SIZE(zfs_blake3_compress_in_place_sse2)
/*
 * zfs_blake3_compress_xof_sse2
 *
 * Compresses one 64-byte block and writes a full 64-byte result.
 *   rdi  pointer to the 32-byte input state (read only; it is read again
 *        near the end into xmm4 / xmm5)
 *   rsi  pointer to the 64-byte block (read with unaligned loads)
 *   r9   pointer to the 64-byte output buffer
 * Further arguments feed xmm3 / xmm4 through lines not shown here --
 * presumably block length, counter and flags; TODO confirm against the
 * C prototype.
 * State rows live in xmm0..xmm3, message vectors in xmm4..xmm7, and
 * xmm8..xmm10 are scratch.
 */
2146 ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
/* Rows 0 and 1 come from the caller's state, row 2 from BLAKE3_IV. */
2148 movups xmm0, xmmword ptr [rdi]
2149 movups xmm1, xmmword ptr [rdi+0x10]
2150 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Row 3: low qword of xmm3 joined with the low qword of xmm4. */
2157 punpcklqdq xmm3, xmm4
/* Load the block and split it into even-indexed (shufps 136) and
 * odd-indexed (shufps 221) dwords; pshufd 0x93 rotates lanes by one. */
2158 movups xmm4, xmmword ptr [rsi]
2159 movups xmm5, xmmword ptr [rsi+0x10]
2161 shufps xmm4, xmm5, 136
2162 shufps xmm8, xmm5, 221
2164 movups xmm6, xmmword ptr [rsi+0x20]
2165 movups xmm7, xmmword ptr [rsi+0x30]
2167 shufps xmm6, xmm7, 136
2168 pshufd xmm6, xmm6, 0x93
2169 shufps xmm8, xmm7, 221
2170 pshufd xmm7, xmm8, 0x93
/* pshuflw + pshufhw with 0xB1 swap the 16-bit halves of every 32-bit lane,
 * i.e. rotate each lane of xmm3 by 16 bits. */
2176 pshuflw xmm3, xmm3, 0xB1
2177 pshufhw xmm3, xmm3, 0xB1
/* Rotate the lanes of rows 0, 3 and 2 by one, two and three positions. */
2197 pshufd xmm0, xmm0, 0x93
2198 pshufd xmm3, xmm3, 0x4E
2199 pshufd xmm2, xmm2, 0x39
2203 pshuflw xmm3, xmm3, 0xB1
2204 pshufhw xmm3, xmm3, 0xB1
/* Inverse lane rotations, restoring the row layout. */
2224 pshufd xmm0, xmm0, 0x39
2225 pshufd xmm3, xmm3, 0x4E
2226 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message vectors xmm4..xmm7 -- presumably the message
 * permutation for the next round; pand with complementary PBLENDW_* masks
 * presumably emulates the SSE4.1 pblendw instruction.  TODO confirm. */
2230 shufps xmm8, xmm5, 214
2231 pshufd xmm9, xmm4, 0x0F
2232 pshufd xmm4, xmm8, 0x39
2234 shufps xmm8, xmm7, 250
2235 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2236 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2239 punpcklqdq xmm8, xmm5
2241 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2242 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2244 pshufd xmm8, xmm8, 0x78
2245 punpckhdq xmm5, xmm7
2246 punpckldq xmm6, xmm5
2247 pshufd xmm7, xmm6, 0x1E
/* Re-read the caller's input state into xmm4 / xmm5 -- presumably to be
 * combined into rows 2 and 3 before the store; the combining instructions
 * are not shown here.  TODO confirm. */
2252 movdqu xmm4, xmmword ptr [rdi]
2253 movdqu xmm5, xmmword ptr [rdi+0x10]
/* Store all four state rows (64 bytes) to the output buffer at r9. */
2258 movups xmmword ptr [r9], xmm0
2259 movups xmmword ptr [r9+0x10], xmm1
2260 movups xmmword ptr [r9+0x20], xmm2
2261 movups xmmword ptr [r9+0x30], xmm3
2263 SET_SIZE(zfs_blake3_compress_xof_sse2)
/*
 * Constant rows used by the routines above.  The labels that name these
 * rows are not visible here; the names given below are inferred from the
 * values and from the symbols the code references -- TODO confirm.
 */
/* Four distinct 32-bit words in one 16-byte vector -- presumably BLAKE3_IV. */
2268 .long 0x6A09E667, 0xBB67AE85
2269 .long 0x3C6EF372, 0xA54FF53A
/* The same four words, each replicated into all four lanes -- presumably
 * BLAKE3_IV_0 .. BLAKE3_IV_3. */
2275 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2277 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2279 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2281 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* 64 in every lane -- presumably BLAKE3_BLOCK_LEN. */
2283 .long 64, 64, 64, 64
/* Only the most significant bit of every lane set -- presumably
 * CMP_MSB_MASK. */
2285 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/* Lane-select masks.  Read as eight 16-bit words they select words
 * {0,1,4,5}, {2,3,6,7}, {0..5} and {6,7}, which matches the immediates
 * 0x33, 0xCC, 0x3F and 0xC0 in the PBLENDW_*_MASK names. */
2287 .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2289 .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2291 .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2293 .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2295 #endif /* HAVE_SSE2 */
2298 .section .note.GNU-stack,"",%progbits