1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
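; As an illustrative sketch only (not part of the generated checks), a source
; loop of roughly this shape is what the vectorizer turns into one of these
; stride-8 interleaved store groups; the array names and types here are
; assumptions, not taken from this test:
;
;   for (long i = 0; i < n; ++i) {
;     out[8*i + 0] = a[i];  out[8*i + 1] = b[i];
;     out[8*i + 2] = c[i];  out[8*i + 3] = d[i];
;     out[8*i + 4] = e[i];  out[8*i + 5] = f[i];
;     out[8*i + 6] = g[i];  out[8*i + 7] = h[i];
;   }
;
; Each function below models one vectorization factor: eight small <N x i8>
; loads are concatenated and then shuffled into the 8*N-byte interleaved
; vector that is stored to %out.vec.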
18 define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i8_stride8_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
24 ; SSE-NEXT: movdqa (%rdi), %xmm0
25 ; SSE-NEXT: movdqa (%rdx), %xmm1
26 ; SSE-NEXT: movdqa (%r8), %xmm2
27 ; SSE-NEXT: movdqa (%r11), %xmm3
28 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
29 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
30 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
31 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
32 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
33 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
34 ; SSE-NEXT: pxor %xmm1, %xmm1
35 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
36 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
37 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
38 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
39 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
40 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
41 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
42 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
43 ; SSE-NEXT: packuswb %xmm2, %xmm3
44 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
45 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
46 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
47 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
48 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
49 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
50 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
51 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
52 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
53 ; SSE-NEXT: packuswb %xmm0, %xmm1
54 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
55 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
56 ; SSE-NEXT: movdqa %xmm0, (%rax)
57 ; SSE-NEXT: retq
58 ;
59 ; AVX-LABEL: store_i8_stride8_vf2:
60 ; AVX: # %bb.0:
61 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
62 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
63 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
64 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
65 ; AVX-NEXT: vmovdqa (%rdx), %xmm1
66 ; AVX-NEXT: vmovdqa (%r8), %xmm2
67 ; AVX-NEXT: vmovdqa (%r11), %xmm3
68 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
69 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
70 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
71 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
72 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
73 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
74 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
75 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
76 ; AVX-NEXT: vmovdqa %xmm0, (%rax)
77 ; AVX-NEXT: retq
78 ;
79 ; AVX2-LABEL: store_i8_stride8_vf2:
80 ; AVX2: # %bb.0:
81 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
82 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
83 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
84 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
85 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1
86 ; AVX2-NEXT: vmovdqa (%r8), %xmm2
87 ; AVX2-NEXT: vmovdqa (%r11), %xmm3
88 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
89 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
90 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
91 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
92 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
93 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
94 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
95 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
96 ; AVX2-NEXT: vmovdqa %xmm0, (%rax)
97 ; AVX2-NEXT: retq
98 ;
99 ; AVX2-FP-LABEL: store_i8_stride8_vf2:
100 ; AVX2-FP: # %bb.0:
101 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
102 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
103 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
104 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
105 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
106 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
107 ; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3
108 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
109 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
110 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
111 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
112 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
113 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
114 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
115 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
116 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax)
117 ; AVX2-FP-NEXT: retq
118 ;
119 ; AVX2-FCP-LABEL: store_i8_stride8_vf2:
120 ; AVX2-FCP: # %bb.0:
121 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
122 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
123 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
124 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
125 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
126 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
127 ; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3
128 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
129 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
130 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
131 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
132 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
133 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
134 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
135 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
136 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax)
137 ; AVX2-FCP-NEXT: retq
139 ; AVX512-LABEL: store_i8_stride8_vf2:
140 ; AVX512: # %bb.0:
141 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
142 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
143 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
144 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
145 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
146 ; AVX512-NEXT: vmovdqa (%r8), %xmm2
147 ; AVX512-NEXT: vmovdqa (%r11), %xmm3
148 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
149 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
150 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
151 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
152 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
153 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
154 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
155 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
156 ; AVX512-NEXT: vmovdqa %xmm0, (%rax)
157 ; AVX512-NEXT: retq
158 ;
159 ; AVX512-FCP-LABEL: store_i8_stride8_vf2:
160 ; AVX512-FCP: # %bb.0:
161 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
162 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
163 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
164 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
166 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
167 ; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3
168 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
169 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
170 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
171 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
172 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
173 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
174 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
175 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
176 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
177 ; AVX512-FCP-NEXT: retq
179 ; AVX512DQ-LABEL: store_i8_stride8_vf2:
180 ; AVX512DQ: # %bb.0:
181 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
182 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
183 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
184 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
185 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
186 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
187 ; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
188 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
189 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
190 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
191 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
192 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
193 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
194 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
195 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
196 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax)
197 ; AVX512DQ-NEXT: retq
199 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf2:
200 ; AVX512DQ-FCP: # %bb.0:
201 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
202 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
203 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
204 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
205 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
206 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
207 ; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
208 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
209 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
210 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
211 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
212 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
213 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
214 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
215 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
216 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
217 ; AVX512DQ-FCP-NEXT: retq
219 ; AVX512BW-LABEL: store_i8_stride8_vf2:
220 ; AVX512BW: # %bb.0:
221 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
222 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
223 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
224 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
225 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
226 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
227 ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
228 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
229 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
230 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
231 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
232 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
233 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
234 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
235 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
236 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rax)
237 ; AVX512BW-NEXT: retq
239 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf2:
240 ; AVX512BW-FCP: # %bb.0:
241 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
242 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
243 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
244 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
245 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
246 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
247 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
248 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
249 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
250 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
251 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
252 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
253 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
254 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
255 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
256 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
257 ; AVX512BW-FCP-NEXT: retq
259 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf2:
260 ; AVX512DQ-BW: # %bb.0:
261 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
262 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
263 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
264 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
265 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
266 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
267 ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
268 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
269 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
270 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
271 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
272 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
273 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
274 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
275 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
276 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax)
277 ; AVX512DQ-BW-NEXT: retq
279 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf2:
280 ; AVX512DQ-BW-FCP: # %bb.0:
281 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
282 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
283 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
284 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
285 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
286 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
287 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
288 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
289 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
290 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
291 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
292 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
293 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
294 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
295 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
296 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
297 ; AVX512DQ-BW-FCP-NEXT: retq
298 %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
299 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
300 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64
301 %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64
302 %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64
303 %in.vec5 = load <2 x i8>, ptr %in.vecptr5, align 64
304 %in.vec6 = load <2 x i8>, ptr %in.vecptr6, align 64
305 %in.vec7 = load <2 x i8>, ptr %in.vecptr7, align 64
306 %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
307 %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
308 %3 = shufflevector <2 x i8> %in.vec4, <2 x i8> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
309 %4 = shufflevector <2 x i8> %in.vec6, <2 x i8> %in.vec7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
310 %5 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
311 %6 = shufflevector <4 x i8> %3, <4 x i8> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
312 %7 = shufflevector <8 x i8> %5, <8 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
313 %interleaved.vec = shufflevector <16 x i8> %7, <16 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
314 store <16 x i8> %interleaved.vec, ptr %out.vec, align 64
315 ret void
316 }
318 define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
319 ; SSE-LABEL: store_i8_stride8_vf4:
320 ; SSE: # %bb.0:
321 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
322 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
323 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
324 ; SSE-NEXT: movdqa (%rdi), %xmm0
325 ; SSE-NEXT: movdqa (%rdx), %xmm1
326 ; SSE-NEXT: movdqa (%r8), %xmm2
327 ; SSE-NEXT: movdqa (%r11), %xmm3
328 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
329 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
330 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
331 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
332 ; SSE-NEXT: pxor %xmm6, %xmm6
333 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
334 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0]
335 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,5]
336 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,6,4]
337 ; SSE-NEXT: packuswb %xmm5, %xmm7
338 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,0]
339 ; SSE-NEXT: movdqa %xmm4, %xmm5
340 ; SSE-NEXT: pandn %xmm7, %xmm5
341 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
342 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,2,0]
343 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,7,5,6,7]
344 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
345 ; SSE-NEXT: packuswb %xmm8, %xmm7
346 ; SSE-NEXT: pand %xmm4, %xmm7
347 ; SSE-NEXT: por %xmm5, %xmm7
348 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
349 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
350 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
351 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7]
352 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,0,2,4,5,6,7]
353 ; SSE-NEXT: packuswb %xmm8, %xmm9
354 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535]
355 ; SSE-NEXT: movdqa %xmm5, %xmm8
356 ; SSE-NEXT: pandn %xmm9, %xmm8
357 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
358 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
359 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7]
360 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
361 ; SSE-NEXT: packuswb %xmm9, %xmm6
362 ; SSE-NEXT: pand %xmm5, %xmm6
363 ; SSE-NEXT: por %xmm8, %xmm6
364 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
365 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
366 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
367 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7]
368 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
369 ; SSE-NEXT: packuswb %xmm7, %xmm3
370 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
371 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,7,6,7]
372 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
373 ; SSE-NEXT: packuswb %xmm7, %xmm2
374 ; SSE-NEXT: pand %xmm4, %xmm2
375 ; SSE-NEXT: pandn %xmm3, %xmm4
376 ; SSE-NEXT: por %xmm2, %xmm4
377 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
378 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
379 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,1,4,5,6,7]
380 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
381 ; SSE-NEXT: packuswb %xmm3, %xmm1
382 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
383 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,1,2,3,4,5,6,7]
384 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
385 ; SSE-NEXT: packuswb %xmm3, %xmm0
386 ; SSE-NEXT: pand %xmm5, %xmm0
387 ; SSE-NEXT: pandn %xmm1, %xmm5
388 ; SSE-NEXT: por %xmm0, %xmm5
389 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
390 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
391 ; SSE-NEXT: movdqa %xmm0, 16(%rax)
392 ; SSE-NEXT: movdqa %xmm6, (%rax)
393 ; SSE-NEXT: retq
394 ;
395 ; AVX-LABEL: store_i8_stride8_vf4:
396 ; AVX: # %bb.0:
397 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
398 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
399 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
400 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
401 ; AVX-NEXT: vmovdqa (%rdx), %xmm1
402 ; AVX-NEXT: vmovdqa (%r8), %xmm2
403 ; AVX-NEXT: vmovdqa (%r11), %xmm3
404 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
405 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
406 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
407 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
408 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
409 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
410 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0]
411 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
412 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
413 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
414 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0]
415 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
416 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
417 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
418 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
419 ; AVX-NEXT: vmovaps %ymm0, (%rax)
420 ; AVX-NEXT: vzeroupper
421 ; AVX-NEXT: retq
422 ;
423 ; AVX2-LABEL: store_i8_stride8_vf4:
424 ; AVX2: # %bb.0:
425 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
426 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
427 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
428 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
429 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1
430 ; AVX2-NEXT: vmovdqa (%rdx), %xmm2
431 ; AVX2-NEXT: vmovdqa (%rcx), %xmm3
432 ; AVX2-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
433 ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
434 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
435 ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
436 ; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
437 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
438 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
439 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
440 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
441 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
442 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
443 ; AVX2-NEXT: vzeroupper
444 ; AVX2-NEXT: retq
445 ;
446 ; AVX2-FP-LABEL: store_i8_stride8_vf4:
447 ; AVX2-FP: # %bb.0:
448 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
449 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
450 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
451 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
452 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
453 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
454 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
455 ; AVX2-FP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
456 ; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
457 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
458 ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
459 ; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
460 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
461 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
462 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
463 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
464 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0
465 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
466 ; AVX2-FP-NEXT: vzeroupper
467 ; AVX2-FP-NEXT: retq
468 ;
469 ; AVX2-FCP-LABEL: store_i8_stride8_vf4:
470 ; AVX2-FCP: # %bb.0:
471 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
472 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
473 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
474 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
475 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
476 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
477 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
478 ; AVX2-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
479 ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
480 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
481 ; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
482 ; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
483 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
484 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
485 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
486 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
487 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
488 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
489 ; AVX2-FCP-NEXT: vzeroupper
490 ; AVX2-FCP-NEXT: retq
492 ; AVX512-LABEL: store_i8_stride8_vf4:
493 ; AVX512: # %bb.0:
494 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
495 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
496 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
497 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
498 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
499 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
500 ; AVX512-NEXT: vmovdqa (%rcx), %xmm3
501 ; AVX512-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
502 ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
503 ; AVX512-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
504 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
505 ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
506 ; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
507 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
508 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
509 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
510 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
511 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
512 ; AVX512-NEXT: vzeroupper
513 ; AVX512-NEXT: retq
514 ;
515 ; AVX512-FCP-LABEL: store_i8_stride8_vf4:
516 ; AVX512-FCP: # %bb.0:
517 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
518 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
519 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
520 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
521 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
522 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
523 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
524 ; AVX512-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
525 ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
526 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
527 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
528 ; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
529 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
530 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
531 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
532 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
533 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
534 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
535 ; AVX512-FCP-NEXT: vzeroupper
536 ; AVX512-FCP-NEXT: retq
538 ; AVX512DQ-LABEL: store_i8_stride8_vf4:
539 ; AVX512DQ: # %bb.0:
540 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
541 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
542 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
543 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
544 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
545 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
546 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3
547 ; AVX512DQ-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
548 ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
549 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
550 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
551 ; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
552 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
553 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
554 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
555 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
556 ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
557 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
558 ; AVX512DQ-NEXT: vzeroupper
559 ; AVX512DQ-NEXT: retq
561 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf4:
562 ; AVX512DQ-FCP: # %bb.0:
563 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
564 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
565 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
566 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
567 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
568 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
569 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
570 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
571 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
572 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
573 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
574 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
575 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
576 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
577 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
578 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
579 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
580 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
581 ; AVX512DQ-FCP-NEXT: vzeroupper
582 ; AVX512DQ-FCP-NEXT: retq
584 ; AVX512BW-LABEL: store_i8_stride8_vf4:
585 ; AVX512BW: # %bb.0:
586 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
587 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
588 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
589 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
590 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
591 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
592 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
593 ; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
594 ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
595 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
596 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
597 ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
598 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
599 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
600 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
601 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
602 ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
603 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
604 ; AVX512BW-NEXT: vzeroupper
605 ; AVX512BW-NEXT: retq
607 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf4:
608 ; AVX512BW-FCP: # %bb.0:
609 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
610 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
611 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
612 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
613 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
614 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
615 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
616 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
617 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
618 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
619 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
620 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
621 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
622 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
623 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
624 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
625 ; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
626 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
627 ; AVX512BW-FCP-NEXT: vzeroupper
628 ; AVX512BW-FCP-NEXT: retq
630 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf4:
631 ; AVX512DQ-BW: # %bb.0:
632 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
633 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
634 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
635 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
636 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
637 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
638 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
639 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
640 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
641 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
642 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
643 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
644 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
645 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
646 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
647 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
648 ; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
649 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
650 ; AVX512DQ-BW-NEXT: vzeroupper
651 ; AVX512DQ-BW-NEXT: retq
653 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf4:
654 ; AVX512DQ-BW-FCP: # %bb.0:
655 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
656 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
657 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
658 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
659 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
660 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
661 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
662 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3
663 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2
664 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
665 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
666 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
667 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
668 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
669 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
670 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
671 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
672 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
673 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
674 ; AVX512DQ-BW-FCP-NEXT: retq
675 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
676 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
677 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64
678 %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64
679 %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64
680 %in.vec5 = load <4 x i8>, ptr %in.vecptr5, align 64
681 %in.vec6 = load <4 x i8>, ptr %in.vecptr6, align 64
682 %in.vec7 = load <4 x i8>, ptr %in.vecptr7, align 64
683 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
684 %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
685 %3 = shufflevector <4 x i8> %in.vec4, <4 x i8> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
686 %4 = shufflevector <4 x i8> %in.vec6, <4 x i8> %in.vec7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
687 %5 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
688 %6 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
689 %7 = shufflevector <16 x i8> %5, <16 x i8> %6, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
690 %interleaved.vec = shufflevector <32 x i8> %7, <32 x i8> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
691 store <32 x i8> %interleaved.vec, ptr %out.vec, align 64
692 ret void
693 }
695 define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
696 ; SSE-LABEL: store_i8_stride8_vf8:
697 ; SSE: # %bb.0:
698 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
699 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
700 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
701 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
702 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
703 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
704 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
705 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
706 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
707 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
708 ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
709 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
710 ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
711 ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
712 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
713 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,1,4,5,6,7]
714 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,1]
715 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,0]
716 ; SSE-NEXT: movdqa %xmm3, %xmm6
717 ; SSE-NEXT: pandn %xmm4, %xmm6
718 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7]
719 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
720 ; SSE-NEXT: pand %xmm3, %xmm4
721 ; SSE-NEXT: por %xmm6, %xmm4
722 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
723 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7]
724 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,1,3]
725 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535]
726 ; SSE-NEXT: movdqa %xmm4, %xmm8
727 ; SSE-NEXT: pandn %xmm6, %xmm8
728 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
729 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
730 ; SSE-NEXT: pand %xmm4, %xmm6
731 ; SSE-NEXT: por %xmm8, %xmm6
732 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
733 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
734 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
735 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
736 ; SSE-NEXT: movdqa %xmm3, %xmm8
737 ; SSE-NEXT: pandn %xmm7, %xmm8
738 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
739 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
740 ; SSE-NEXT: pand %xmm3, %xmm7
741 ; SSE-NEXT: por %xmm8, %xmm7
742 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,3,2,3]
743 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7]
744 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
745 ; SSE-NEXT: movdqa %xmm4, %xmm9
746 ; SSE-NEXT: pandn %xmm7, %xmm9
747 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
748 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
749 ; SSE-NEXT: pand %xmm4, %xmm7
750 ; SSE-NEXT: por %xmm9, %xmm7
751 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
752 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
753 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5]
754 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
755 ; SSE-NEXT: movdqa %xmm3, %xmm9
756 ; SSE-NEXT: pandn %xmm8, %xmm9
757 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,7]
758 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
759 ; SSE-NEXT: pand %xmm3, %xmm8
760 ; SSE-NEXT: por %xmm9, %xmm8
761 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,3,2,3]
762 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5]
763 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3]
764 ; SSE-NEXT: movdqa %xmm4, %xmm10
765 ; SSE-NEXT: pandn %xmm9, %xmm10
766 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2]
767 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
768 ; SSE-NEXT: pand %xmm4, %xmm9
769 ; SSE-NEXT: por %xmm10, %xmm9
770 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
771 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
772 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
773 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
774 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
775 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
776 ; SSE-NEXT: pand %xmm3, %xmm2
777 ; SSE-NEXT: pandn %xmm5, %xmm3
778 ; SSE-NEXT: por %xmm2, %xmm3
779 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
780 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
781 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
782 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
783 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
784 ; SSE-NEXT: pand %xmm4, %xmm0
785 ; SSE-NEXT: pandn %xmm1, %xmm4
786 ; SSE-NEXT: por %xmm0, %xmm4
787 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
788 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
789 ; SSE-NEXT: movdqa %xmm0, 48(%rax)
790 ; SSE-NEXT: movdqa %xmm9, 32(%rax)
791 ; SSE-NEXT: movdqa %xmm7, 16(%rax)
792 ; SSE-NEXT: movdqa %xmm6, (%rax)
793 ; SSE-NEXT: retq
794 ;
795 ; AVX-LABEL: store_i8_stride8_vf8:
796 ; AVX: # %bb.0:
797 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
798 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
799 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
800 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
801 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
802 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
803 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
804 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
805 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
806 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
807 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
808 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
809 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
810 ; AVX-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
811 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
812 ; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0]
813 ; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
814 ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
815 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
816 ; AVX-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
817 ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
818 ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
819 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
820 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
821 ; AVX-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0]
822 ; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6
823 ; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5
824 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
825 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
826 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7
827 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6
828 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
829 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
830 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
831 ; AVX-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0]
832 ; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6
833 ; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5
834 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
835 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
836 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7
837 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6
838 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
839 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
840 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0]
841 ; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3
842 ; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2
843 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
844 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
845 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
846 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
847 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
848 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
849 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
850 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
851 ; AVX-NEXT: vmovaps %ymm4, (%rax)
852 ; AVX-NEXT: vzeroupper
853 ; AVX-NEXT: retq
854 ;
855 ; AVX2-LABEL: store_i8_stride8_vf8:
856 ; AVX2: # %bb.0:
857 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
858 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
859 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
860 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
861 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
862 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
863 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
864 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
865 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
866 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
867 ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
868 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
869 ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
870 ; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
871 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
872 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
873 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
874 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
875 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
876 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
877 ; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2
878 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
879 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
880 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
881 ; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4
882 ; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
883 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
884 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
885 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
886 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
887 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
888 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
889 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
890 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
891 ; AVX2-NEXT: vmovdqa %ymm2, (%rax)
892 ; AVX2-NEXT: vzeroupper
893 ; AVX2-NEXT: retq
894 ;
895 ; AVX2-FP-LABEL: store_i8_stride8_vf8:
896 ; AVX2-FP: # %bb.0:
897 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
898 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
899 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
900 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
901 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
902 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
903 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
904 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
905 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
906 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
907 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
908 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
909 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
910 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
911 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
912 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
913 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
914 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
915 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
916 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
917 ; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2
918 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
919 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
920 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
921 ; AVX2-FP-NEXT: vpor %ymm6, %ymm4, %ymm4
922 ; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2
923 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
924 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
925 ; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1
926 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
927 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
928 ; AVX2-FP-NEXT: vpor %ymm3, %ymm0, %ymm0
929 ; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0
930 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
931 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
932 ; AVX2-FP-NEXT: vzeroupper
933 ; AVX2-FP-NEXT: retq
934 ;
935 ; AVX2-FCP-LABEL: store_i8_stride8_vf8:
936 ; AVX2-FCP: # %bb.0:
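; The fast-variable-shuffle (FCP) lowerings gather the even and odd dwords of each 32-byte half with vpermd and finish with fixed byte shuffles;
; the vpmovsxdq constants 201851904,218694913,235537922,252380931 encode the byte-index rows [0,4,8,12], [1,5,9,13], [2,6,10,14] and [3,7,11,15].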
937 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
938 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
939 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
940 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
941 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
942 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
943 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
944 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
945 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
946 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
947 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
948 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
949 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
950 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
951 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
952 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
953 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
954 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
955 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
956 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3
957 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15]
958 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
959 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2
960 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931]
961 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
962 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
963 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
964 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
965 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
966 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
967 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
968 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
969 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
970 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax)
971 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax)
972 ; AVX2-FCP-NEXT: vzeroupper
973 ; AVX2-FCP-NEXT: retq
974 ;
975 ; AVX512-LABEL: store_i8_stride8_vf8:
976 ; AVX512: # %bb.0:
977 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
978 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
979 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
980 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
981 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
982 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
983 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
984 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
985 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
986 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
987 ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
988 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
989 ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
990 ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
991 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
992 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
993 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
994 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
995 ; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm3
996 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2
997 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
998 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
999 ; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4
1000 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
1001 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
1002 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
1003 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
1004 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
1005 ; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4
1006 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
1007 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
1008 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
1009 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
1010 ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1011 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1012 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1013 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1014 ; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm0
1015 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
1016 ; AVX512-NEXT: vzeroupper
1017 ; AVX512-NEXT: retq
1018 ;
1019 ; AVX512-FCP-LABEL: store_i8_stride8_vf8:
1020 ; AVX512-FCP: # %bb.0:
1021 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1022 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1023 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1024 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1025 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1026 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1027 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1028 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1029 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1030 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1031 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1032 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1033 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1034 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1035 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1036 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1037 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
1038 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7]
1039 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
1040 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3
1041 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15]
1042 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1043 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2
1044 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931]
1045 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
1046 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
1047 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
1048 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1049 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
1050 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1051 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
1052 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
1053 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1054 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1055 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
1056 ; AVX512-FCP-NEXT: vzeroupper
1057 ; AVX512-FCP-NEXT: retq
1058 ;
1059 ; AVX512DQ-LABEL: store_i8_stride8_vf8:
1060 ; AVX512DQ: # %bb.0:
1061 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1062 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1063 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
1064 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1065 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1066 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1067 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1068 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1069 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1070 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1071 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1072 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1073 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1074 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1075 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1076 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1077 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
1078 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
1079 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm3
1080 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2
1081 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
1082 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
1083 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4
1084 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
1085 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
1086 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
1087 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
1088 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
1089 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4
1090 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
1091 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
1092 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
1093 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
1094 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1095 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1096 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1097 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1098 ; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm0
1099 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
1100 ; AVX512DQ-NEXT: vzeroupper
1101 ; AVX512DQ-NEXT: retq
1102 ;
1103 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf8:
1104 ; AVX512DQ-FCP: # %bb.0:
1105 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1106 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1107 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1108 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1109 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1110 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1111 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1112 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1113 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1114 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1115 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1116 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1117 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1118 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1119 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1120 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1121 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
1122 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7]
1123 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
1124 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3
1125 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15]
1126 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1127 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2
1128 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931]
1129 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
1130 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
1131 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
1132 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1133 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
1134 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1135 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
1136 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
1137 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1138 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1139 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
1140 ; AVX512DQ-FCP-NEXT: vzeroupper
1141 ; AVX512DQ-FCP-NEXT: retq
1142 ;
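; The AVX512BW lowerings that follow merge the shuffled halves with k-masked moves instead of vpor chains;
; 0x11221122 and 0x44884488 are word-granularity masks for the vmovdqu16 merges, and 0xAAAA keeps every other dword in the final vmovdqa32 merge.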
1143 ; AVX512BW-LABEL: store_i8_stride8_vf8:
1144 ; AVX512BW: # %bb.0:
1145 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1146 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1147 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1148 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1149 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1150 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1151 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1152 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1153 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1154 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1155 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1156 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1157 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1158 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1159 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1160 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1161 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1162 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
1163 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1164 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
1165 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
1166 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
1167 ; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
1168 ; AVX512BW-NEXT: kmovd %ecx, %k1
1169 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
1170 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
1171 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
1172 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
1173 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
1174 ; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
1175 ; AVX512BW-NEXT: kmovd %ecx, %k1
1176 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1177 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
1178 ; AVX512BW-NEXT: kmovd %ecx, %k1
1179 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
1180 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
1181 ; AVX512BW-NEXT: vzeroupper
1182 ; AVX512BW-NEXT: retq
1183 ;
1184 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf8:
1185 ; AVX512BW-FCP: # %bb.0:
1186 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1187 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1188 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1189 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1190 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1191 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1192 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1193 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1194 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1195 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1196 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1197 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1198 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1199 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1200 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1201 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1202 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
1203 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
1204 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
1205 ; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1
1206 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
1207 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
1208 ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
1209 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
1210 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
1211 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
1212 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1213 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
1214 ; AVX512BW-FCP-NEXT: vzeroupper
1215 ; AVX512BW-FCP-NEXT: retq
1216 ;
1217 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf8:
1218 ; AVX512DQ-BW: # %bb.0:
1219 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1220 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1221 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1222 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1223 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1224 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1225 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1226 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1227 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1228 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1229 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1230 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1231 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1232 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1233 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1234 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1235 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1236 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
1237 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1238 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
1239 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
1240 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
1241 ; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
1242 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
1243 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
1244 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
1245 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
1246 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
1247 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
1248 ; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
1249 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
1250 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1251 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
1252 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
1253 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
1254 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
1255 ; AVX512DQ-BW-NEXT: vzeroupper
1256 ; AVX512DQ-BW-NEXT: retq
1257 ;
1258 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf8:
1259 ; AVX512DQ-BW-FCP: # %bb.0:
1260 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1261 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1262 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1263 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1264 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1265 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1266 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1267 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1268 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1269 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1270 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1271 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1272 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1273 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
1274 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1275 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1276 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
1277 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
1278 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
1279 ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1
1280 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
1281 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
1282 ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
1283 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
1284 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
1285 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
1286 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1287 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
1288 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1289 ; AVX512DQ-BW-FCP-NEXT: retq
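; The IR below concatenates the eight <8 x i8> inputs pairwise into <16 x i8>, <32 x i8> and finally one <64 x i8> value,
; then applies a stride-8 interleave mask (0,8,16,...,56, 1,9,...) before the single 64-byte store.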
1290 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
1291 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
1292 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64
1293 %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64
1294 %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64
1295 %in.vec5 = load <8 x i8>, ptr %in.vecptr5, align 64
1296 %in.vec6 = load <8 x i8>, ptr %in.vecptr6, align 64
1297 %in.vec7 = load <8 x i8>, ptr %in.vecptr7, align 64
1298 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1299 %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1300 %3 = shufflevector <8 x i8> %in.vec4, <8 x i8> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1301 %4 = shufflevector <8 x i8> %in.vec6, <8 x i8> %in.vec7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1302 %5 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1303 %6 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1304 %7 = shufflevector <32 x i8> %5, <32 x i8> %6, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1305 %interleaved.vec = shufflevector <64 x i8> %7, <64 x i8> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
1306 store <64 x i8> %interleaved.vec, ptr %out.vec, align 64
1307 ret void
1308 }
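; For the vf16 case each source is a full 16-byte vector; the SSE and AVX lowerings interleave it with byte unpacks and word/dword shuffles,
; while the AVX2 and AVX-512 lowerings concatenate pairs of inputs into 256-bit and 512-bit registers and finish with byte shuffles and blends.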
1310 define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
1311 ; SSE-LABEL: store_i8_stride8_vf16:
1312 ; SSE: # %bb.0:
1313 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1314 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
1315 ; SSE-NEXT: movdqa (%rdi), %xmm10
1316 ; SSE-NEXT: movdqa (%rsi), %xmm9
1317 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1318 ; SSE-NEXT: movdqa (%rdx), %xmm1
1319 ; SSE-NEXT: movdqa (%rcx), %xmm6
1320 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1321 ; SSE-NEXT: movdqa (%r8), %xmm3
1322 ; SSE-NEXT: movdqa (%r9), %xmm11
1323 ; SSE-NEXT: movdqa (%r10), %xmm4
1324 ; SSE-NEXT: movdqa (%rax), %xmm13
1325 ; SSE-NEXT: movdqa %xmm4, %xmm12
1326 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
1327 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7]
1328 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1329 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,0]
1330 ; SSE-NEXT: movdqa %xmm2, %xmm5
1331 ; SSE-NEXT: pandn %xmm0, %xmm5
1332 ; SSE-NEXT: movdqa %xmm3, %xmm14
1333 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1334 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,3,4,5,6,7]
1335 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,1]
1336 ; SSE-NEXT: pand %xmm2, %xmm7
1337 ; SSE-NEXT: por %xmm5, %xmm7
1338 ; SSE-NEXT: movdqa %xmm1, %xmm15
1339 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
1340 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7]
1341 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
1342 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
1343 ; SSE-NEXT: movdqa %xmm0, %xmm8
1344 ; SSE-NEXT: pandn %xmm5, %xmm8
1345 ; SSE-NEXT: movdqa %xmm10, %xmm6
1346 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
1347 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0]
1348 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
1349 ; SSE-NEXT: pand %xmm0, %xmm9
1350 ; SSE-NEXT: por %xmm8, %xmm9
1351 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
1352 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
1353 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
1354 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1355 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7]
1356 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
1357 ; SSE-NEXT: movdqa %xmm2, %xmm8
1358 ; SSE-NEXT: pandn %xmm7, %xmm8
1359 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[2,1,3,3,4,5,6,7]
1360 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
1361 ; SSE-NEXT: pand %xmm2, %xmm7
1362 ; SSE-NEXT: por %xmm8, %xmm7
1363 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,2,2,3,4,5,6,7]
1364 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3]
1365 ; SSE-NEXT: movdqa %xmm0, %xmm9
1366 ; SSE-NEXT: pandn %xmm8, %xmm9
1367 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,1,1]
1368 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
1369 ; SSE-NEXT: pand %xmm0, %xmm8
1370 ; SSE-NEXT: por %xmm9, %xmm8
1371 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3]
1372 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3]
1373 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
1374 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7]
1375 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
1376 ; SSE-NEXT: movdqa %xmm2, %xmm9
1377 ; SSE-NEXT: pandn %xmm8, %xmm9
1378 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,6,5,7,7]
1379 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
1380 ; SSE-NEXT: pand %xmm2, %xmm8
1381 ; SSE-NEXT: por %xmm9, %xmm8
1382 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7]
1383 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3]
1384 ; SSE-NEXT: movdqa %xmm0, %xmm5
1385 ; SSE-NEXT: pandn %xmm9, %xmm5
1386 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3]
1387 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
1388 ; SSE-NEXT: pand %xmm0, %xmm9
1389 ; SSE-NEXT: por %xmm5, %xmm9
1390 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,3,2,3]
1391 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
1392 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1393 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,4,6,5]
1394 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1395 ; SSE-NEXT: movdqa %xmm2, %xmm9
1396 ; SSE-NEXT: pandn %xmm5, %xmm9
1397 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,5,7]
1398 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1399 ; SSE-NEXT: pand %xmm2, %xmm5
1400 ; SSE-NEXT: por %xmm9, %xmm5
1401 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5]
1402 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3]
1403 ; SSE-NEXT: movdqa %xmm0, %xmm12
1404 ; SSE-NEXT: pandn %xmm9, %xmm12
1405 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
1406 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
1407 ; SSE-NEXT: pand %xmm0, %xmm6
1408 ; SSE-NEXT: por %xmm12, %xmm6
1409 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1410 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3]
1411 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
1412 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
1413 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7]
1414 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
1415 ; SSE-NEXT: movdqa %xmm2, %xmm6
1416 ; SSE-NEXT: pandn %xmm5, %xmm6
1417 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
1418 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
1419 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
1420 ; SSE-NEXT: pand %xmm2, %xmm5
1421 ; SSE-NEXT: por %xmm6, %xmm5
1422 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1423 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1424 ; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
1425 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7]
1426 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3]
1427 ; SSE-NEXT: movdqa %xmm0, %xmm9
1428 ; SSE-NEXT: pandn %xmm6, %xmm9
1429 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1430 ; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15]
1431 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,0,0]
1432 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
1433 ; SSE-NEXT: pand %xmm0, %xmm6
1434 ; SSE-NEXT: por %xmm9, %xmm6
1435 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3]
1436 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
1437 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
1438 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
1439 ; SSE-NEXT: movdqa %xmm2, %xmm6
1440 ; SSE-NEXT: pandn %xmm5, %xmm6
1441 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7]
1442 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
1443 ; SSE-NEXT: pand %xmm2, %xmm5
1444 ; SSE-NEXT: por %xmm6, %xmm5
1445 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3]
1446 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
1447 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3]
1448 ; SSE-NEXT: movdqa %xmm0, %xmm11
1449 ; SSE-NEXT: pandn %xmm5, %xmm11
1450 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
1451 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
1452 ; SSE-NEXT: pand %xmm0, %xmm5
1453 ; SSE-NEXT: por %xmm11, %xmm5
1454 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1455 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
1456 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7]
1457 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1458 ; SSE-NEXT: movdqa %xmm2, %xmm11
1459 ; SSE-NEXT: pandn %xmm6, %xmm11
1460 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7]
1461 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1462 ; SSE-NEXT: pand %xmm2, %xmm6
1463 ; SSE-NEXT: por %xmm11, %xmm6
1464 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7]
1465 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3]
1466 ; SSE-NEXT: movdqa %xmm0, %xmm13
1467 ; SSE-NEXT: pandn %xmm11, %xmm13
1468 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[3,3,3,3]
1469 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
1470 ; SSE-NEXT: pand %xmm0, %xmm11
1471 ; SSE-NEXT: por %xmm13, %xmm11
1472 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
1473 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
1474 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
1475 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,5]
1476 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1477 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
1478 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1479 ; SSE-NEXT: pand %xmm2, %xmm3
1480 ; SSE-NEXT: pandn %xmm4, %xmm2
1481 ; SSE-NEXT: por %xmm3, %xmm2
1482 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
1483 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1484 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
1485 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
1486 ; SSE-NEXT: pand %xmm0, %xmm3
1487 ; SSE-NEXT: pandn %xmm1, %xmm0
1488 ; SSE-NEXT: por %xmm3, %xmm0
1489 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
1490 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1491 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1492 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1493 ; SSE-NEXT: movdqa %xmm0, 96(%rax)
1494 ; SSE-NEXT: movdqa %xmm11, 112(%rax)
1495 ; SSE-NEXT: movdqa %xmm5, 80(%rax)
1496 ; SSE-NEXT: movdqa %xmm9, 64(%rax)
1497 ; SSE-NEXT: movdqa %xmm12, 32(%rax)
1498 ; SSE-NEXT: movdqa %xmm8, 48(%rax)
1499 ; SSE-NEXT: movdqa %xmm7, 16(%rax)
1500 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1501 ; SSE-NEXT: movaps %xmm0, (%rax)
1502 ; SSE-NEXT: retq
1503 ;
1504 ; AVX-LABEL: store_i8_stride8_vf16:
1505 ; AVX: # %bb.0:
1506 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1507 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
1508 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
1509 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
1510 ; AVX-NEXT: vmovdqa (%rsi), %xmm2
1511 ; AVX-NEXT: vmovdqa (%rdx), %xmm3
1512 ; AVX-NEXT: vmovdqa (%rcx), %xmm4
1513 ; AVX-NEXT: vmovdqa (%r8), %xmm5
1514 ; AVX-NEXT: vmovdqa (%r9), %xmm6
1515 ; AVX-NEXT: vmovdqa (%r11), %xmm8
1516 ; AVX-NEXT: vmovdqa (%r10), %xmm9
1517 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1518 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7]
1519 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1520 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
1521 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7]
1522 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1]
1523 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5,6],xmm0[7]
1524 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,0,2,1,4,5,6,7]
1525 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1]
1526 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7]
1527 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,2,1]
1528 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6],xmm11[7]
1529 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0
1530 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1531 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7]
1532 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero
1533 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1534 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
1535 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
1536 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4],xmm12[5],xmm14[6,7]
1537 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
1538 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7]
1539 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
1540 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4],xmm15[5],xmm14[6,7]
1541 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
1542 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3],ymm12[4],ymm0[5],ymm12[6],ymm0[7]
1543 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,6,6,7]
1544 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
1545 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,6,5,7,7]
1546 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
1547 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3],xmm14[4,5,6],xmm12[7]
1548 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,6,5]
1549 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1550 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7]
1551 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
1552 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5,6],xmm7[7]
1553 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
1554 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
1555 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,3,3]
1556 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
1557 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
1558 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4],xmm10[5],xmm12[6,7]
1559 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,5]
1560 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,3,3]
1561 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3]
1562 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
1563 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4],xmm11[5],xmm12[6,7]
1564 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
1565 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7]
1566 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
1567 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7]
1568 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1]
1569 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
1570 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7]
1571 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
1572 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6],xmm9[7]
1573 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7]
1574 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1]
1575 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7]
1576 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
1577 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5,6],xmm9[7]
1578 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6
1579 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1580 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
1581 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1582 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1583 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1584 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1585 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7]
1586 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1587 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,0,2,1,4,5,6,7]
1588 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
1589 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4],xmm9[5],xmm4[6,7]
1590 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
1591 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4],ymm6[5],ymm2[6],ymm6[7]
1592 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7]
1593 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1594 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,5,7,7]
1595 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1596 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5,6],xmm4[7]
1597 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,4,6,5]
1598 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1599 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7]
1600 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1601 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6],xmm6[7]
1602 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
1603 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
1604 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
1605 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
1606 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
1607 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7]
1608 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5]
1609 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
1610 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1611 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1612 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4],xmm3[5],xmm1[6,7]
1613 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1614 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
1615 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
1616 ; AVX-NEXT: vmovaps %ymm2, 64(%rax)
1617 ; AVX-NEXT: vmovaps %ymm7, 32(%rax)
1618 ; AVX-NEXT: vmovaps %ymm0, (%rax)
1619 ; AVX-NEXT: vzeroupper
1620 ; AVX-NEXT: retq
1621 ;
1622 ; AVX2-LABEL: store_i8_stride8_vf16:
1623 ; AVX2: # %bb.0:
1624 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1625 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
1626 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
1627 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1628 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1
1629 ; AVX2-NEXT: vmovdqa (%r8), %xmm2
1630 ; AVX2-NEXT: vmovdqa (%r11), %xmm3
1631 ; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1632 ; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1633 ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4
1634 ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1635 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
1636 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1637 ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2
1638 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
1639 ; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819]
1640 ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1641 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
1642 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1643 ; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1644 ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1645 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1646 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819]
1647 ; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1648 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1649 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7]
1650 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1651 ; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5
1652 ; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847]
1653 ; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1654 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15]
1655 ; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1656 ; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1657 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847]
1658 ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1659 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1660 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
1661 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1662 ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm6
1663 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
1664 ; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm8
1665 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15]
1666 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1667 ; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm8
1668 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1669 ; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm9
1670 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15]
1671 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
1672 ; AVX2-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1673 ; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm4
1674 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
1675 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1676 ; AVX2-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1677 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1678 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
1679 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rax)
1680 ; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
1681 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rax)
1682 ; AVX2-NEXT: vmovdqa %ymm2, (%rax)
1683 ; AVX2-NEXT: vzeroupper
1684 ; AVX2-NEXT: retq
1685 ;
1686 ; AVX2-FP-LABEL: store_i8_stride8_vf16:
1687 ; AVX2-FP: # %bb.0:
1688 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1689 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1690 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1691 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
1692 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
1693 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
1694 ; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3
1695 ; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1696 ; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1697 ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4
1698 ; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1699 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
1700 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1701 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2
1702 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
1703 ; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819]
1704 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1705 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
1706 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1707 ; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1708 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1709 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1710 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819]
1711 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1712 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1713 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7]
1714 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1715 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
1716 ; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847]
1717 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1718 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15]
1719 ; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1720 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1721 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847]
1722 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1723 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1724 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
1725 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1726 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm6
1727 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
1728 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm8
1729 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15]
1730 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1731 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm8
1732 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1733 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm9
1734 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15]
1735 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
1736 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1737 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm4
1738 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
1739 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1740 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1741 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1742 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
1743 ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax)
1744 ; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax)
1745 ; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax)
1746 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
1747 ; AVX2-FP-NEXT: vzeroupper
1748 ; AVX2-FP-NEXT: retq
1749 ;
1750 ; AVX2-FCP-LABEL: store_i8_stride8_vf16:
1751 ; AVX2-FCP: # %bb.0:
1752 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1753 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1754 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1755 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
1756 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
1757 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
1758 ; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3
1759 ; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1760 ; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1761 ; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4
1762 ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1763 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
1764 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1765 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2
1766 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
1767 ; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819]
1768 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1769 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
1770 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1771 ; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1772 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1773 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1774 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819]
1775 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1776 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1777 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7]
1778 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1779 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
1780 ; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847]
1781 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1782 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15]
1783 ; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1784 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1785 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847]
1786 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1787 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1788 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
1789 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1790 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6
1791 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
1792 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm8
1793 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15]
1794 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1795 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm8
1796 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1797 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm9
1798 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15]
1799 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
1800 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1801 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4
1802 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
1803 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1804 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1805 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1806 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
1807 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
1808 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax)
1809 ; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax)
1810 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax)
1811 ; AVX2-FCP-NEXT: vzeroupper
1812 ; AVX2-FCP-NEXT: retq
1814 ; AVX512-LABEL: store_i8_stride8_vf16:
1815 ; AVX512: # %bb.0:
1816 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1817 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1818 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
1819 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
1820 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
1821 ; AVX512-NEXT: vmovdqa (%r8), %xmm2
1822 ; AVX512-NEXT: vmovdqa (%r11), %xmm3
1823 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1824 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1825 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
1826 ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1827 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
1828 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1829 ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6
1830 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
1831 ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847]
1832 ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1833 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
1834 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1835 ; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1836 ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1837 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1838 ; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847]
1839 ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1840 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1841 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
1842 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1843 ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1844 ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819]
1845 ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1846 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
1847 ; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1848 ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1849 ; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819]
1850 ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1851 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1852 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
1853 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
1854 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1855 ; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1856 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
1857 ; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6
1858 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
1859 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1860 ; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6
1861 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1862 ; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm8
1863 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
1864 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
1865 ; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1866 ; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
1867 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
1868 ; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1869 ; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1870 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1871 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
1872 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
1873 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
1874 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
1875 ; AVX512-NEXT: vzeroupper
1876 ; AVX512-NEXT: retq
1878 ; AVX512-FCP-LABEL: store_i8_stride8_vf16:
1879 ; AVX512-FCP: # %bb.0:
1880 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1881 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1882 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1883 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
1884 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
1885 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
1886 ; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3
1887 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1888 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1889 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
1890 ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1891 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
1892 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1893 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
1894 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
1895 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847]
1896 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1897 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
1898 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1899 ; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1900 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1901 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1902 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847]
1903 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1904 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1905 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
1906 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1907 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1908 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819]
1909 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1910 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
1911 ; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1912 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1913 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819]
1914 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1915 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1916 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
1917 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
1918 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1919 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1920 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
1921 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
1922 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
1923 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1924 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6
1925 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1926 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8
1927 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
1928 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
1929 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1930 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
1931 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
1932 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1933 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1934 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1935 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
1936 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
1937 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
1938 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
1939 ; AVX512-FCP-NEXT: vzeroupper
1940 ; AVX512-FCP-NEXT: retq
1942 ; AVX512DQ-LABEL: store_i8_stride8_vf16:
1943 ; AVX512DQ: # %bb.0:
1944 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1945 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1946 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
1947 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
1948 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
1949 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
1950 ; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
1951 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1952 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1953 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
1954 ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
1955 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
1956 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
1957 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6
1958 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
1959 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847]
1960 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1961 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
1962 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
1963 ; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
1964 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11
1965 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
1966 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847]
1967 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14
1968 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
1969 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
1970 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
1971 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1972 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819]
1973 ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7
1974 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
1975 ; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
1976 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9
1977 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819]
1978 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12
1979 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
1980 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
1981 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
1982 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
1983 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1984 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
1985 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm6
1986 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
1987 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1988 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm6
1989 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1990 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm8
1991 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
1992 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
1993 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1994 ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
1995 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
1996 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1997 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm0, %ymm0
1998 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
1999 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
2000 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
2001 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
2002 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
2003 ; AVX512DQ-NEXT: vzeroupper
2004 ; AVX512DQ-NEXT: retq
2006 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf16:
2007 ; AVX512DQ-FCP: # %bb.0:
2008 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2009 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2010 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2011 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
2012 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
2013 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
2014 ; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
2015 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
2016 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
2017 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
2018 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
2019 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
2020 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15]
2021 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
2022 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
2023 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847]
2024 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
2025 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
2026 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
2027 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
2028 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
2029 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
2030 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847]
2031 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
2032 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
2033 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
2034 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11]
2035 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
2036 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819]
2037 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
2038 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
2039 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7
2040 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
2041 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819]
2042 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
2043 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
2044 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
2045 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
2046 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
2047 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
2048 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
2049 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
2050 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
2051 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
2052 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6
2053 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
2054 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8
2055 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
2056 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
2057 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
2058 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
2059 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
2060 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
2061 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
2062 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
2063 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
2064 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
2065 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
2066 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
2067 ; AVX512DQ-FCP-NEXT: vzeroupper
2068 ; AVX512DQ-FCP-NEXT: retq
2070 ; AVX512BW-LABEL: store_i8_stride8_vf16:
2071 ; AVX512BW: # %bb.0:
2072 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2073 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2074 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
2075 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2076 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
2077 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
2078 ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3
2079 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
2080 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
2081 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
2082 ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
2083 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
2084 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
2085 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
2086 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6]
2087 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2088 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
2089 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6]
2090 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2091 ; AVX512BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
2092 ; AVX512BW-NEXT: kmovd %ecx, %k1
2093 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1}
2094 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2095 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6]
2096 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2097 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
2098 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6]
2099 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2100 ; AVX512BW-NEXT: movl $572662306, %ecx # imm = 0x22222222
2101 ; AVX512BW-NEXT: kmovd %ecx, %k2
2102 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2}
2103 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
2104 ; AVX512BW-NEXT: kmovd %ecx, %k3
2105 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3}
2106 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
2107 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2108 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
2109 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2110 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
2111 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
2112 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2113 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
2114 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2115 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
2116 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
2117 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
2118 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
2119 ; AVX512BW-NEXT: vzeroupper
2120 ; AVX512BW-NEXT: retq
2122 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf16:
2123 ; AVX512BW-FCP: # %bb.0:
2124 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2125 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2126 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2127 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
2128 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
2129 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
2130 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3
2131 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
2132 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
2133 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
2134 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
2135 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
2136 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
2137 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
2138 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
2139 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2140 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
2141 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
2142 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2143 ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
2144 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
2145 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
2146 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
2147 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2148 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2149 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
2150 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2151 ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
2152 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
2153 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
2154 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
2155 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
2156 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
2157 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
2158 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2159 ; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
2160 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2161 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
2162 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2163 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
2164 ; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
2165 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2166 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
2167 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2168 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
2169 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
2170 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
2171 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
2172 ; AVX512BW-FCP-NEXT: vzeroupper
2173 ; AVX512BW-FCP-NEXT: retq
2175 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf16:
2176 ; AVX512DQ-BW: # %bb.0:
2177 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2178 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2179 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
2180 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
2181 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
2182 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
2183 ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3
2184 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
2185 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
2186 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
2187 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
2188 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
2189 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
2190 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
2191 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6]
2192 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2193 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
2194 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm3[0,2,0,2,4,6,4,6]
2195 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2196 ; AVX512DQ-BW-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
2197 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
2198 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm4 {%k1}
2199 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2200 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm0[0,2,0,2,4,6,4,6]
2201 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2202 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
2203 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6]
2204 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2205 ; AVX512DQ-BW-NEXT: movl $572662306, %ecx # imm = 0x22222222
2206 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
2207 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2}
2208 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
2209 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
2210 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k3}
2211 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
2212 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2213 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
2214 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2215 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
2216 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
2217 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2218 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
2219 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2220 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
2221 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
2222 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
2223 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax)
2224 ; AVX512DQ-BW-NEXT: vzeroupper
2225 ; AVX512DQ-BW-NEXT: retq
2227 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf16:
2228 ; AVX512DQ-BW-FCP: # %bb.0:
2229 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2230 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2231 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2232 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
2233 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
2234 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
2235 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3
2236 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
2237 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
2238 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
2239 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
2240 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
2241 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
2242 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
2243 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
2244 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2245 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
2246 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
2247 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2248 ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
2249 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
2250 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
2251 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
2252 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2253 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2254 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
2255 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2256 ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
2257 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
2258 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
2259 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
2260 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
2261 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
2262 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
2263 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2264 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
2265 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
2266 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
2267 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
2268 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
2269 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
2270 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
2271 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
2272 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
2273 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
2274 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
2275 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
2276 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
2277 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2278 ; AVX512DQ-BW-FCP-NEXT: retq
2279 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
2280 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
2281 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64
2282 %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64
2283 %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64
2284 %in.vec5 = load <16 x i8>, ptr %in.vecptr5, align 64
2285 %in.vec6 = load <16 x i8>, ptr %in.vecptr6, align 64
2286 %in.vec7 = load <16 x i8>, ptr %in.vecptr7, align 64
2287 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2288 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2289 %3 = shufflevector <16 x i8> %in.vec4, <16 x i8> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2290 %4 = shufflevector <16 x i8> %in.vec6, <16 x i8> %in.vec7, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2291 %5 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2292 %6 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2293 %7 = shufflevector <64 x i8> %5, <64 x i8> %6, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
2294 %interleaved.vec = shufflevector <128 x i8> %7, <128 x i8> poison, <128 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 112, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 113, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 114, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 115, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 116, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 117, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 118, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 119, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 120, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 121, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 122, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 123, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 124, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 125, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 126, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111, i32 127>
2295 store <128 x i8> %interleaved.vec, ptr %out.vec, align 64
2296 ret void
2297 }
2299 define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
2300 ; SSE-LABEL: store_i8_stride8_vf32:
2301 ; SSE: # %bb.0:
2302 ; SSE-NEXT: subq $232, %rsp
2303 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2304 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
2305 ; SSE-NEXT: movdqa (%rdi), %xmm5
2306 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2307 ; SSE-NEXT: movdqa (%rsi), %xmm4
2308 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2309 ; SSE-NEXT: movdqa (%rdx), %xmm1
2310 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2311 ; SSE-NEXT: movdqa (%rcx), %xmm8
2312 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2313 ; SSE-NEXT: movdqa (%r8), %xmm13
2314 ; SSE-NEXT: movdqa (%r9), %xmm12
2315 ; SSE-NEXT: movdqa (%r10), %xmm14
2316 ; SSE-NEXT: movdqa (%rax), %xmm11
2317 ; SSE-NEXT: movdqa %xmm14, %xmm2
2318 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
2319 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
2320 ; SSE-NEXT: movdqa %xmm2, %xmm15
2321 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2322 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1]
2323 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0]
2324 ; SSE-NEXT: movdqa %xmm9, %xmm6
2325 ; SSE-NEXT: pandn %xmm2, %xmm6
2326 ; SSE-NEXT: movdqa %xmm13, %xmm3
2327 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
2328 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
2329 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2330 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,1]
2331 ; SSE-NEXT: pand %xmm9, %xmm7
2332 ; SSE-NEXT: por %xmm6, %xmm7
2333 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
2334 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
2335 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2336 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,1,3]
2337 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535]
2338 ; SSE-NEXT: movdqa %xmm8, %xmm10
2339 ; SSE-NEXT: pandn %xmm6, %xmm10
2340 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2341 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
2342 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2343 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
2344 ; SSE-NEXT: pand %xmm8, %xmm6
2345 ; SSE-NEXT: por %xmm10, %xmm6
2346 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
2347 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
2348 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2349 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2350 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,0,2,1,4,5,6,7]
2351 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
2352 ; SSE-NEXT: movdqa %xmm9, %xmm7
2353 ; SSE-NEXT: pandn %xmm6, %xmm7
2354 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7]
2355 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
2356 ; SSE-NEXT: pand %xmm9, %xmm6
2357 ; SSE-NEXT: por %xmm7, %xmm6
2358 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7]
2359 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
2360 ; SSE-NEXT: movdqa %xmm8, %xmm10
2361 ; SSE-NEXT: pandn %xmm7, %xmm10
2362 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0]
2363 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
2364 ; SSE-NEXT: pand %xmm8, %xmm7
2365 ; SSE-NEXT: por %xmm10, %xmm7
2366 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
2367 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
2368 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
2369 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2370 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
2371 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7]
2372 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2373 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
2374 ; SSE-NEXT: movdqa %xmm9, %xmm6
2375 ; SSE-NEXT: pandn %xmm5, %xmm6
2376 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
2377 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7]
2378 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2379 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
2380 ; SSE-NEXT: pand %xmm9, %xmm4
2381 ; SSE-NEXT: por %xmm6, %xmm4
2382 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
2383 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2384 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2385 ; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15]
2386 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7]
2387 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2388 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
2389 ; SSE-NEXT: movdqa %xmm8, %xmm5
2390 ; SSE-NEXT: pandn %xmm3, %xmm5
2391 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2392 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2393 ; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15]
2394 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
2395 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2396 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2397 ; SSE-NEXT: pand %xmm8, %xmm1
2398 ; SSE-NEXT: por %xmm5, %xmm1
2399 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
2400 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2401 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2402 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7]
2403 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
2404 ; SSE-NEXT: movdqa %xmm9, %xmm3
2405 ; SSE-NEXT: pandn %xmm1, %xmm3
2406 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7]
2407 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
2408 ; SSE-NEXT: pand %xmm9, %xmm1
2409 ; SSE-NEXT: por %xmm3, %xmm1
2410 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7]
2411 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
2412 ; SSE-NEXT: movdqa %xmm8, %xmm4
2413 ; SSE-NEXT: pandn %xmm3, %xmm4
2414 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0]
2415 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
2416 ; SSE-NEXT: pand %xmm8, %xmm3
2417 ; SSE-NEXT: por %xmm4, %xmm3
2418 ; SSE-NEXT: movdqa 16(%r10), %xmm10
2419 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
2420 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2421 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2422 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2423 ; SSE-NEXT: movdqa 16(%rax), %xmm1
2424 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2425 ; SSE-NEXT: movdqa %xmm10, %xmm5
2426 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
2427 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
2428 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2429 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
2430 ; SSE-NEXT: movdqa %xmm9, %xmm3
2431 ; SSE-NEXT: pandn %xmm1, %xmm3
2432 ; SSE-NEXT: movdqa 16(%r8), %xmm12
2433 ; SSE-NEXT: movdqa 16(%r9), %xmm11
2434 ; SSE-NEXT: movdqa %xmm12, %xmm4
2435 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
2436 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7]
2437 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
2438 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1]
2439 ; SSE-NEXT: pand %xmm9, %xmm0
2440 ; SSE-NEXT: por %xmm3, %xmm0
2441 ; SSE-NEXT: movdqa 16(%rdx), %xmm13
2442 ; SSE-NEXT: movdqa 16(%rcx), %xmm7
2443 ; SSE-NEXT: movdqa %xmm13, %xmm3
2444 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
2445 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
2446 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2447 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
2448 ; SSE-NEXT: movdqa %xmm8, %xmm2
2449 ; SSE-NEXT: pandn %xmm1, %xmm2
2450 ; SSE-NEXT: movdqa 16(%rdi), %xmm14
2451 ; SSE-NEXT: movdqa 16(%rsi), %xmm6
2452 ; SSE-NEXT: movdqa %xmm14, %xmm1
2453 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2454 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1]
2455 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2456 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5]
2457 ; SSE-NEXT: pand %xmm8, %xmm15
2458 ; SSE-NEXT: por %xmm2, %xmm15
2459 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2460 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3]
2461 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2462 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2463 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7]
2464 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2465 ; SSE-NEXT: movdqa %xmm9, %xmm2
2466 ; SSE-NEXT: pandn %xmm0, %xmm2
2467 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7]
2468 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2469 ; SSE-NEXT: pand %xmm9, %xmm0
2470 ; SSE-NEXT: por %xmm2, %xmm0
2471 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
2472 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
2473 ; SSE-NEXT: movdqa %xmm8, %xmm15
2474 ; SSE-NEXT: pandn %xmm2, %xmm15
2475 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
2476 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2477 ; SSE-NEXT: pand %xmm8, %xmm2
2478 ; SSE-NEXT: por %xmm15, %xmm2
2479 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2480 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2481 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2482 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2483 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2484 ; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15]
2485 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7]
2486 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2487 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2488 ; SSE-NEXT: movdqa %xmm9, %xmm2
2489 ; SSE-NEXT: pandn %xmm0, %xmm2
2490 ; SSE-NEXT: movdqa %xmm12, %xmm15
2491 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15]
2492 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7]
2493 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2494 ; SSE-NEXT: pand %xmm9, %xmm0
2495 ; SSE-NEXT: por %xmm2, %xmm0
2496 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15]
2497 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7]
2498 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
2499 ; SSE-NEXT: movdqa %xmm8, %xmm3
2500 ; SSE-NEXT: pandn %xmm2, %xmm3
2501 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15]
2502 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
2503 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2504 ; SSE-NEXT: pand %xmm8, %xmm1
2505 ; SSE-NEXT: por %xmm3, %xmm1
2506 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2507 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2508 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2509 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2510 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7]
2511 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2512 ; SSE-NEXT: movdqa %xmm9, %xmm1
2513 ; SSE-NEXT: pandn %xmm0, %xmm1
2514 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7]
2515 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2516 ; SSE-NEXT: pand %xmm9, %xmm0
2517 ; SSE-NEXT: por %xmm1, %xmm0
2518 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7]
2519 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
2520 ; SSE-NEXT: movdqa %xmm8, %xmm2
2521 ; SSE-NEXT: pandn %xmm1, %xmm2
2522 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0]
2523 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2524 ; SSE-NEXT: pand %xmm8, %xmm1
2525 ; SSE-NEXT: por %xmm2, %xmm1
2526 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2527 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2528 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2529 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2530 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2531 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7]
2532 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2533 ; SSE-NEXT: movdqa %xmm9, %xmm1
2534 ; SSE-NEXT: pandn %xmm0, %xmm1
2535 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2536 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7]
2537 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2538 ; SSE-NEXT: pand %xmm9, %xmm0
2539 ; SSE-NEXT: por %xmm1, %xmm0
2540 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2541 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7]
2542 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
2543 ; SSE-NEXT: movdqa %xmm8, %xmm2
2544 ; SSE-NEXT: pandn %xmm1, %xmm2
2545 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2546 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
2547 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2548 ; SSE-NEXT: pand %xmm8, %xmm1
2549 ; SSE-NEXT: por %xmm2, %xmm1
2550 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2551 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2552 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2553 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2554 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
2555 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2556 ; SSE-NEXT: movdqa %xmm9, %xmm1
2557 ; SSE-NEXT: pandn %xmm0, %xmm1
2558 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7]
2559 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2560 ; SSE-NEXT: pand %xmm9, %xmm0
2561 ; SSE-NEXT: por %xmm1, %xmm0
2562 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5]
2563 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
2564 ; SSE-NEXT: movdqa %xmm8, %xmm2
2565 ; SSE-NEXT: pandn %xmm1, %xmm2
2566 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2]
2567 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2568 ; SSE-NEXT: pand %xmm8, %xmm1
2569 ; SSE-NEXT: por %xmm2, %xmm1
2570 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2571 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2572 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2573 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2574 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7]
2575 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2576 ; SSE-NEXT: movdqa %xmm9, %xmm2
2577 ; SSE-NEXT: pandn %xmm0, %xmm2
2578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2579 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7]
2580 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2581 ; SSE-NEXT: pand %xmm9, %xmm0
2582 ; SSE-NEXT: por %xmm2, %xmm0
2583 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2584 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7]
2585 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
2586 ; SSE-NEXT: movdqa %xmm8, %xmm3
2587 ; SSE-NEXT: pandn %xmm2, %xmm3
2588 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2589 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
2590 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2591 ; SSE-NEXT: pand %xmm8, %xmm2
2592 ; SSE-NEXT: por %xmm3, %xmm2
2593 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2594 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
2595 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2596 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5]
2597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2598 ; SSE-NEXT: movdqa %xmm9, %xmm2
2599 ; SSE-NEXT: pandn %xmm0, %xmm2
2600 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7]
2601 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2602 ; SSE-NEXT: pand %xmm9, %xmm0
2603 ; SSE-NEXT: por %xmm2, %xmm0
2604 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5]
2605 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
2606 ; SSE-NEXT: movdqa %xmm8, %xmm4
2607 ; SSE-NEXT: pandn %xmm2, %xmm4
2608 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2]
2609 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2610 ; SSE-NEXT: pand %xmm8, %xmm2
2611 ; SSE-NEXT: por %xmm4, %xmm2
2612 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
2613 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2614 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2615 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2616 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7]
2617 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2618 ; SSE-NEXT: movdqa %xmm9, %xmm4
2619 ; SSE-NEXT: pandn %xmm2, %xmm4
2620 ; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
2621 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7]
2622 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2623 ; SSE-NEXT: pand %xmm9, %xmm2
2624 ; SSE-NEXT: por %xmm4, %xmm2
2625 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2626 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7]
2627 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
2628 ; SSE-NEXT: movdqa %xmm8, %xmm5
2629 ; SSE-NEXT: pandn %xmm4, %xmm5
2630 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2631 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3]
2632 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
2633 ; SSE-NEXT: pand %xmm8, %xmm4
2634 ; SSE-NEXT: por %xmm5, %xmm4
2635 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
2636 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2637 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
2638 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5]
2639 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2640 ; SSE-NEXT: movdqa %xmm9, %xmm5
2641 ; SSE-NEXT: pandn %xmm4, %xmm5
2642 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7]
2643 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2644 ; SSE-NEXT: pand %xmm9, %xmm4
2645 ; SSE-NEXT: por %xmm5, %xmm4
2646 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5]
2647 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
2648 ; SSE-NEXT: movdqa %xmm8, %xmm10
2649 ; SSE-NEXT: pandn %xmm5, %xmm10
2650 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,2,2]
2651 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
2652 ; SSE-NEXT: pand %xmm8, %xmm5
2653 ; SSE-NEXT: por %xmm10, %xmm5
2654 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3]
2655 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2656 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
2657 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2658 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7]
2659 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
2660 ; SSE-NEXT: movdqa %xmm9, %xmm10
2661 ; SSE-NEXT: pandn %xmm5, %xmm10
2662 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,6,5,7,7]
2663 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
2664 ; SSE-NEXT: pand %xmm9, %xmm5
2665 ; SSE-NEXT: por %xmm10, %xmm5
2666 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,6,6,7]
2667 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3]
2668 ; SSE-NEXT: movdqa %xmm8, %xmm12
2669 ; SSE-NEXT: pandn %xmm10, %xmm12
2670 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[3,3,3,3]
2671 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
2672 ; SSE-NEXT: pand %xmm8, %xmm10
2673 ; SSE-NEXT: por %xmm12, %xmm10
2674 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
2675 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
2676 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
2677 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,4,6,5]
2678 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
2679 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,5,7]
2680 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
2681 ; SSE-NEXT: pand %xmm9, %xmm11
2682 ; SSE-NEXT: pandn %xmm5, %xmm9
2683 ; SSE-NEXT: por %xmm11, %xmm9
2684 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,4,6,5]
2685 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
2686 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2]
2687 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
2688 ; SSE-NEXT: pand %xmm8, %xmm6
2689 ; SSE-NEXT: pandn %xmm5, %xmm8
2690 ; SSE-NEXT: por %xmm6, %xmm8
2691 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,3,2,3]
2692 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3]
2693 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2694 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2695 ; SSE-NEXT: movdqa %xmm6, 224(%rax)
2696 ; SSE-NEXT: movdqa %xmm10, 240(%rax)
2697 ; SSE-NEXT: movdqa %xmm4, 160(%rax)
2698 ; SSE-NEXT: movdqa %xmm2, 176(%rax)
2699 ; SSE-NEXT: movdqa %xmm0, 96(%rax)
2700 ; SSE-NEXT: movdqa %xmm3, 112(%rax)
2701 ; SSE-NEXT: movdqa %xmm1, 32(%rax)
2702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2703 ; SSE-NEXT: movaps %xmm0, 48(%rax)
2704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2705 ; SSE-NEXT: movaps %xmm0, 192(%rax)
2706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2707 ; SSE-NEXT: movaps %xmm0, 208(%rax)
2708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2709 ; SSE-NEXT: movaps %xmm0, 128(%rax)
2710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2711 ; SSE-NEXT: movaps %xmm0, 144(%rax)
2712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2713 ; SSE-NEXT: movaps %xmm0, 64(%rax)
2714 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2715 ; SSE-NEXT: movaps %xmm0, 80(%rax)
2716 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2717 ; SSE-NEXT: movaps %xmm0, (%rax)
2718 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2719 ; SSE-NEXT: movaps %xmm0, 16(%rax)
2720 ; SSE-NEXT: addq $232, %rsp
2721 ; SSE-NEXT: retq
2722 ;
2723 ; AVX-LABEL: store_i8_stride8_vf32:
2724 ; AVX: # %bb.0:
2725 ; AVX-NEXT: subq $56, %rsp
2726 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2727 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
2728 ; AVX-NEXT: vmovdqa (%r10), %xmm0
2729 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2730 ; AVX-NEXT: vmovdqa (%rax), %xmm2
2731 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2732 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
2733 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
2734 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
2735 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
2736 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
2737 ; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1
2738 ; AVX-NEXT: vmovdqa (%r9), %xmm5
2739 ; AVX-NEXT: vmovdqa (%r8), %xmm7
2740 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
2741 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7]
2742 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7]
2743 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2744 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
2745 ; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
2746 ; AVX-NEXT: vorps %ymm1, %ymm3, %ymm8
2747 ; AVX-NEXT: vmovdqa (%rsi), %xmm3
2748 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2749 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
2750 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2751 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
2752 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1]
2753 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
2754 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm13
2755 ; AVX-NEXT: vmovdqa (%rcx), %xmm10
2756 ; AVX-NEXT: vmovdqa (%rdx), %xmm11
2757 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2758 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7]
2759 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
2760 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7]
2761 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
2762 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm14
2763 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
2764 ; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13
2765 ; AVX-NEXT: vandnps %ymm14, %ymm6, %ymm14
2766 ; AVX-NEXT: vorps %ymm14, %ymm13, %ymm13
2767 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7]
2768 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2769 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5]
2770 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2771 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
2772 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7]
2773 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7]
2774 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8
2775 ; AVX-NEXT: vmovdqa 16(%r10), %xmm4
2776 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2777 ; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
2778 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
2779 ; AVX-NEXT: vandps %ymm9, %ymm8, %ymm8
2780 ; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
2781 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5]
2782 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7]
2783 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm12
2784 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
2785 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
2786 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
2787 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
2788 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3
2789 ; AVX-NEXT: vmovdqa 16(%rax), %xmm8
2790 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2791 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
2792 ; AVX-NEXT: vandnps %ymm12, %ymm6, %ymm12
2793 ; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
2794 ; AVX-NEXT: vorps %ymm3, %ymm12, %ymm3
2795 ; AVX-NEXT: vmovdqa 16(%r9), %xmm13
2796 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
2797 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2798 ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2799 ; AVX-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
2800 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2801 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5]
2802 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
2803 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2804 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
2805 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2806 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7]
2807 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
2808 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2809 ; AVX-NEXT: vmovdqa 16(%r8), %xmm7
2810 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2811 ; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
2812 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
2813 ; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2
2814 ; AVX-NEXT: vmovaps %ymm9, %ymm5
2815 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
2816 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
2817 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2818 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5]
2819 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
2820 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2821 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
2822 ; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2
2823 ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
2824 ; AVX-NEXT: # xmm3 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
2825 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2826 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
2827 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2828 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
2829 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
2830 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
2831 ; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1
2832 ; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
2833 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
2834 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2835 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
2836 ; AVX-NEXT: vmovdqa %xmm4, %xmm8
2837 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5]
2838 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7]
2839 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2840 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
2841 ; AVX-NEXT: vmovdqa %xmm13, %xmm9
2842 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7]
2843 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,7,7]
2844 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2845 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2846 ; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0
2847 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2848 ; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
2849 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm5
2850 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm4
2851 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm3
2852 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2853 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,4,6,5]
2854 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7]
2855 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0
2856 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm2
2857 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
2858 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2859 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3]
2860 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
2861 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3]
2862 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
2863 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13
2864 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
2865 ; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0
2866 ; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13
2867 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0
2868 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7]
2869 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2870 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
2871 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5]
2872 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7]
2873 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
2874 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
2875 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7]
2876 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7]
2877 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7
2878 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2879 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
2880 ; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
2881 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
2882 ; AVX-NEXT: vandps %ymm7, %ymm9, %ymm7
2883 ; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7
2884 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
2885 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5]
2886 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
2887 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2888 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2889 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
2890 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2891 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
2892 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
2893 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2894 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
2895 ; AVX-NEXT: vandnps %ymm3, %ymm6, %ymm3
2896 ; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1
2897 ; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
2898 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4],ymm7[5],ymm1[6],ymm7[7]
2899 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7]
2900 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7]
2901 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2902 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7]
2903 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
2904 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
2905 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
2906 ; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3
2907 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
2908 ; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4
2909 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
2910 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2911 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
2912 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2913 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
2914 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
2915 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
2916 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2917 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2918 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
2919 ; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2
2920 ; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0
2921 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
2922 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
2923 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,0,2,1,4,5,6,7]
2924 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7]
2925 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2926 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7]
2927 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7]
2928 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2929 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
2930 ; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2
2931 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
2932 ; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
2933 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
2934 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
2935 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,1,1]
2936 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
2937 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2938 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7]
2939 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
2940 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,2,2,3,4,5,6,7]
2941 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
2942 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
2943 ; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
2944 ; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
2945 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
2946 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
2947 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2948 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
2949 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2950 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
2951 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2952 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7]
2953 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
2954 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
2955 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
2956 ; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3
2957 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
2958 ; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4
2959 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
2960 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2961 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
2962 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
2963 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
2964 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
2965 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2966 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,0,2,1,4,5,6,7]
2967 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
2968 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
2969 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
2970 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
2971 ; AVX-NEXT: vandps %ymm6, %ymm4, %ymm4
2972 ; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5
2973 ; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4
2974 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
2975 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2976 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
2977 ; AVX-NEXT: vmovaps %ymm2, 128(%rax)
2978 ; AVX-NEXT: vmovaps %ymm0, 192(%rax)
2979 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
2980 ; AVX-NEXT: vmovaps %ymm15, 160(%rax)
2981 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2982 ; AVX-NEXT: vmovaps %ymm0, 96(%rax)
2983 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2984 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
2985 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2986 ; AVX-NEXT: vmovaps %ymm0, (%rax)
2987 ; AVX-NEXT: addq $56, %rsp
2988 ; AVX-NEXT: vzeroupper
2989 ; AVX-NEXT: retq
2990 ;
2991 ; AVX2-LABEL: store_i8_stride8_vf32:
2992 ; AVX2: # %bb.0:
2993 ; AVX2-NEXT: subq $88, %rsp
2994 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2995 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
2996 ; AVX2-NEXT: vmovdqa (%rsi), %xmm2
2997 ; AVX2-NEXT: vmovdqa (%rdi), %xmm3
2998 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2999 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3000 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
3001 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3002 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
3003 ; AVX2-NEXT: vmovdqa (%rcx), %xmm4
3004 ; AVX2-NEXT: vmovdqa (%rdx), %xmm5
3005 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
3006 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7]
3007 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
3008 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7]
3009 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
3010 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
3011 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15]
3012 ; AVX2-NEXT: vmovdqa (%r10), %xmm6
3013 ; AVX2-NEXT: vmovdqa (%rax), %xmm7
3014 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
3015 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7]
3016 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7]
3017 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9
3018 ; AVX2-NEXT: vmovdqa (%r9), %xmm10
3019 ; AVX2-NEXT: vmovdqa (%r8), %xmm11
3020 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
3021 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7]
3022 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7]
3023 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15
3024 ; AVX2-NEXT: vmovaps 16(%rsi), %xmm8
3025 ; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3026 ; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15]
3027 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9
3028 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5]
3029 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7]
3030 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3031 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3032 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3033 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
3034 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3035 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3036 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5]
3037 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
3038 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1
3039 ; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8
3040 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
3041 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
3042 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5]
3043 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
3044 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1
3045 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7]
3046 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
3047 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13
3048 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15
3049 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15]
3050 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3051 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
3052 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3053 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
3054 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3055 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3056 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3057 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
3058 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3059 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3060 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
3061 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3062 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
3063 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7]
3064 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
3065 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
3066 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
3067 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
3068 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3069 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
3070 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,6,6,7]
3071 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
3072 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
3073 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3074 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7]
3075 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7]
3076 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7
3077 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6],ymm2[7],ymm7[8,9,10,11,12],ymm2[13],ymm7[14],ymm2[15]
3078 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3079 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
3080 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3081 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3082 ; AVX2-NEXT: vmovdqa %xmm9, %xmm5
3083 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
3084 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
3085 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3086 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3]
3087 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
3088 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3089 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
3090 ; AVX2-NEXT: vmovdqa %xmm8, %xmm9
3091 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5]
3092 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7]
3093 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3094 ; AVX2-NEXT: vmovdqa 16(%r10), %xmm8
3095 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
3096 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7,8],ymm13[9],ymm7[10,11,12],ymm13[13],ymm7[14,15]
3097 ; AVX2-NEXT: vmovdqa 16(%rax), %xmm4
3098 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
3099 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5]
3100 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7]
3101 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2
3102 ; AVX2-NEXT: vmovdqa 16(%r9), %xmm3
3103 ; AVX2-NEXT: vmovdqa 16(%r8), %xmm1
3104 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3105 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7]
3106 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,7,7]
3107 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0
3108 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15]
3109 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3110 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
3111 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3112 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
3113 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
3114 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3115 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[3,3,3,3]
3116 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
3117 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0
3118 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
3119 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5]
3120 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,6,6,7]
3121 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7
3122 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
3123 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7,8],ymm7[9],ymm0[10,11,12],ymm7[13],ymm0[14,15]
3124 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
3125 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5]
3126 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7]
3127 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
3128 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
3129 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7]
3130 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7]
3131 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
3132 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6],ymm5[7],ymm3[8,9,10,11,12],ymm5[13],ymm3[14],ymm5[15]
3133 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3134 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
3135 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3136 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
3137 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3138 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
3139 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7]
3140 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
3141 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7]
3142 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
3143 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
3144 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
3145 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
3146 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
3147 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3148 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7]
3149 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
3150 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
3151 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5,6,7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13,14,15]
3152 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
3153 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
3154 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
3155 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
3156 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3157 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3158 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7]
3159 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
3160 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7]
3161 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
3162 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3163 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
3164 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7]
3165 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,2,2,3,4,5,6,7]
3166 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3167 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7]
3168 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7]
3169 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
3170 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15]
3171 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
3172 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
3173 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3174 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3175 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
3176 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3177 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3178 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3179 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
3180 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
3181 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
3182 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
3183 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
3184 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
3185 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3186 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
3187 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
3188 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
3189 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3190 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
3191 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7]
3192 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
3193 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
3194 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
3195 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
3196 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3197 ; AVX2-NEXT: vmovdqa %ymm3, 64(%rax)
3198 ; AVX2-NEXT: vmovdqa %ymm2, 128(%rax)
3199 ; AVX2-NEXT: vmovdqa %ymm1, 192(%rax)
3200 ; AVX2-NEXT: vmovdqa %ymm0, 224(%rax)
3201 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3202 ; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
3203 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3204 ; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
3205 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3206 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
3207 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3208 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
3209 ; AVX2-NEXT: addq $88, %rsp
3210 ; AVX2-NEXT: vzeroupper
3211 ; AVX2-NEXT: retq
3212 ;
3213 ; AVX2-FP-LABEL: store_i8_stride8_vf32:
3214 ; AVX2-FP: # %bb.0:
3215 ; AVX2-FP-NEXT: subq $72, %rsp
3216 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3217 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3218 ; AVX2-FP-NEXT: vmovdqa (%r10), %xmm5
3219 ; AVX2-FP-NEXT: vmovdqa (%rax), %xmm6
3220 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3221 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8
3222 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23]
3223 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1
3224 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
3225 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3226 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9
3227 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23]
3228 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3229 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7
3230 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm3
3231 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10
3232 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
3233 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
3234 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
3235 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13
3236 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm15
3237 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
3238 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0
3239 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
3240 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15]
3241 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7]
3242 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3243 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
3244 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
3245 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
3246 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm9, %ymm9
3247 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
3248 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
3249 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
3250 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9
3251 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854]
3252 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
3253 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15]
3254 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
3255 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3256 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
3257 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3258 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3259 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3260 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
3261 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3262 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
3263 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
3264 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
3265 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15]
3266 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
3267 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3268 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15]
3269 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1
3270 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
3271 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
3272 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3273 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
3274 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3275 ; AVX2-FP-NEXT: vmovdqa 16(%r10), %xmm8
3276 ; AVX2-FP-NEXT: vmovdqa 16(%rax), %xmm6
3277 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
3278 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12
3279 ; AVX2-FP-NEXT: vmovdqa 16(%r9), %xmm7
3280 ; AVX2-FP-NEXT: vmovdqa 16(%r8), %xmm5
3281 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
3282 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10
3283 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm12, %ymm3
3284 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm10, %ymm4
3285 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3286 ; AVX2-FP-NEXT: vmovdqa 16(%rcx), %xmm4
3287 ; AVX2-FP-NEXT: vmovdqa 16(%rdx), %xmm2
3288 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3289 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9
3290 ; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm1
3291 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm0
3292 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3293 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14
3294 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
3295 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
3296 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15]
3297 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7]
3298 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
3299 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
3300 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6
3301 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31]
3302 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
3303 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31]
3304 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15]
3305 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
3306 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3307 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
3308 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
3309 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
3310 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
3311 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15]
3312 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7]
3313 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
3314 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm4
3315 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
3316 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
3317 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
3318 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798]
3319 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm5
3320 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3321 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
3322 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
3323 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
3324 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3325 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
3326 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm12, %ymm2
3327 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm4
3328 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
3329 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm4
3330 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3331 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3332 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm9, %ymm4
3333 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
3334 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
3335 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3336 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
3337 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3338 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
3339 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3340 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3341 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
3342 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm5
3343 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
3344 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
3345 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
3346 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
3347 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3348 ; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rax)
3349 ; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rax)
3350 ; AVX2-FP-NEXT: vmovdqa %ymm1, 192(%rax)
3351 ; AVX2-FP-NEXT: vmovdqa %ymm0, 224(%rax)
3352 ; AVX2-FP-NEXT: vmovdqa %ymm15, 160(%rax)
3353 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3354 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax)
3355 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3356 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
3357 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3358 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
3359 ; AVX2-FP-NEXT: addq $72, %rsp
3360 ; AVX2-FP-NEXT: vzeroupper
3361 ; AVX2-FP-NEXT: retq
3362 ;
3363 ; AVX2-FCP-LABEL: store_i8_stride8_vf32:
3364 ; AVX2-FCP: # %bb.0:
3365 ; AVX2-FCP-NEXT: subq $72, %rsp
3366 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3367 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3368 ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm5
3369 ; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm6
3370 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3371 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8
3372 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23]
3373 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1
3374 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
3375 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3376 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9
3377 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23]
3378 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3379 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm7
3380 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm3
3381 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10
3382 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
3383 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
3384 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
3385 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13
3386 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm15
3387 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
3388 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0
3389 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
3390 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15]
3391 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7]
3392 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3393 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
3394 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
3395 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
3396 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9
3397 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
3398 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
3399 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
3400 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9
3401 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854]
3402 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
3403 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15]
3404 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
3405 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3406 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
3407 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3408 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3409 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3410 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
3411 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3412 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
3413 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
3414 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
3415 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15]
3416 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
3417 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3418 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15]
3419 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1
3420 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
3421 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
3422 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3423 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
3424 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3425 ; AVX2-FCP-NEXT: vmovdqa 16(%r10), %xmm8
3426 ; AVX2-FCP-NEXT: vmovdqa 16(%rax), %xmm6
3427 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
3428 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12
3429 ; AVX2-FCP-NEXT: vmovdqa 16(%r9), %xmm7
3430 ; AVX2-FCP-NEXT: vmovdqa 16(%r8), %xmm5
3431 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
3432 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10
3433 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm3
3434 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm4
3435 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3436 ; AVX2-FCP-NEXT: vmovdqa 16(%rcx), %xmm4
3437 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm2
3438 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3439 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9
3440 ; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
3441 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm0
3442 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3443 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14
3444 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
3445 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
3446 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15]
3447 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7]
3448 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
3449 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
3450 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6
3451 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31]
3452 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
3453 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31]
3454 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15]
3455 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
3456 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3457 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
3458 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
3459 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
3460 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
3461 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15]
3462 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7]
3463 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
3464 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm4
3465 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
3466 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
3467 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
3468 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798]
3469 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm5
3470 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3471 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
3472 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
3473 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
3474 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3475 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
3476 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm2
3477 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm4
3478 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
3479 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4
3480 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3481 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3482 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm4
3483 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
3484 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
3485 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3486 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
3487 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3488 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
3489 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
3490 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3491 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
3492 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm5
3493 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
3494 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
3495 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
3496 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
3497 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3498 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
3499 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rax)
3500 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 192(%rax)
3501 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 224(%rax)
3502 ; AVX2-FCP-NEXT: vmovdqa %ymm15, 160(%rax)
3503 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3504 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax)
3505 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3506 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
3507 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3508 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
3509 ; AVX2-FCP-NEXT: addq $72, %rsp
3510 ; AVX2-FCP-NEXT: vzeroupper
3511 ; AVX2-FCP-NEXT: retq
3513 ; AVX512-LABEL: store_i8_stride8_vf32:
3514 ; AVX512: # %bb.0:
3515 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3516 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
3517 ; AVX512-NEXT: vmovdqa (%r10), %xmm1
3518 ; AVX512-NEXT: vmovdqa 16(%r10), %xmm11
3519 ; AVX512-NEXT: vmovdqa (%rax), %xmm2
3520 ; AVX512-NEXT: vmovdqa 16(%rax), %xmm12
3521 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3522 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21
3523 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22
3524 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
3525 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
3526 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3527 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3528 ; AVX512-NEXT: vmovdqa (%r9), %xmm3
3529 ; AVX512-NEXT: vmovdqa 16(%r9), %xmm13
3530 ; AVX512-NEXT: vmovdqa (%r8), %xmm4
3531 ; AVX512-NEXT: vmovdqa 16(%r8), %xmm14
3532 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
3533 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm23
3534 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24
3535 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7]
3536 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7]
3537 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
3538 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
3539 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15]
3540 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7]
3541 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3542 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
3543 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
3544 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7]
3545 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
3546 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
3547 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
3548 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
3549 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
3550 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
3551 ; AVX512-NEXT: vmovdqa (%rdi), %xmm7
3552 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
3553 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25
3554 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3555 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3556 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
3557 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3558 ; AVX512-NEXT: vmovdqa (%rcx), %xmm8
3559 ; AVX512-NEXT: vmovdqa (%rdx), %xmm9
3560 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
3561 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5]
3562 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7]
3563 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3
3564 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3565 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7]
3566 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3567 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3568 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3569 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3570 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
3571 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7]
3572 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
3573 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
3574 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
3575 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3576 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
3577 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
3578 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
3579 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
3580 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
3581 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3582 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
3583 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7]
3584 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7]
3585 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10
3586 ; AVX512-NEXT: vmovdqa 16(%rcx), %xmm5
3587 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7]
3588 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3589 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0
3590 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3591 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7]
3592 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15]
3593 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7]
3594 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
3595 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
3596 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
3597 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
3598 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
3599 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10
3600 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17
3601 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
3602 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
3603 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7]
3604 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15
3605 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4
3606 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
3607 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3608 ; AVX512-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3]
3609 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
3610 ; AVX512-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3]
3611 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
3612 ; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1
3613 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
3614 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15]
3615 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3616 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3617 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7]
3618 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
3619 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
3620 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
3621 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3622 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0
3623 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
3624 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
3625 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15
3626 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
3627 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
3628 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
3629 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
3630 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
3631 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7]
3632 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7]
3633 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6
3634 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7]
3635 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3636 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0
3637 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3638 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
3639 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
3640 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7]
3641 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
3642 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
3643 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
3644 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
3645 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
3646 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11
3647 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
3648 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3649 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3650 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
3651 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3652 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
3653 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
3654 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
3655 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
3656 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3657 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7]
3658 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3659 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3661 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3662 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
3663 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
3664 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
3665 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
3666 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
3667 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
3668 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
3669 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3670 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1
3671 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
3672 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3673 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
3674 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
3675 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3676 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3
3677 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4
3678 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3679 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
3680 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7]
3681 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
3682 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7]
3683 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3684 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1
3685 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3686 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
3687 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
3688 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
3689 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
3690 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
3691 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
3692 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
3693 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
3694 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
3695 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2
3696 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
3697 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
3698 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3699 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
3700 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3701 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
3702 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
3703 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7]
3704 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
3705 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
3706 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7]
3707 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
3708 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3709 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
3710 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3711 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
3712 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
3713 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
3714 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
3715 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
3716 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
3717 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
3718 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
3719 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3720 ; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA
3721 ; AVX512-NEXT: kmovw %ecx, %k1
3722 ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1}
3723 ; AVX512-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1}
3724 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1}
3725 ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
3726 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
3727 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
3728 ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax)
3729 ; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax)
3730 ; AVX512-NEXT: vzeroupper
3731 ; AVX512-NEXT: retq
3733 ; AVX512-FCP-LABEL: store_i8_stride8_vf32:
3734 ; AVX512-FCP: # %bb.0:
3735 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3736 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3737 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1
3738 ; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2
3739 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3740 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21
3741 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22
3742 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3743 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31]
3744 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
3745 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4
3746 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
3747 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm24
3748 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
3749 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
3750 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
3751 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3
3752 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm20
3753 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
3754 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
3755 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3756 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
3757 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
3758 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3759 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
3760 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
3761 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23
3762 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1
3763 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10
3764 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
3765 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
3766 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3767 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12
3768 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13
3769 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
3770 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
3771 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
3772 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
3773 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
3774 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm3
3775 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28
3776 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
3777 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
3778 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3779 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
3780 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3781 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm9 = [1284,1798]
3782 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
3783 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
3784 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15]
3785 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19
3786 ; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm8
3787 ; AVX512-FCP-NEXT: vmovdqa 16(%rax), %xmm11
3788 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
3789 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3790 ; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm7
3791 ; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm6
3792 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3793 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
3794 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15]
3795 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm3
3796 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2
3797 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
3798 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7]
3799 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3800 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
3801 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
3802 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17
3803 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm5
3804 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
3805 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3806 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
3807 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1
3808 ; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm16
3809 ; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm3
3810 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm2
3811 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3812 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
3813 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u]
3814 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15]
3815 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3816 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
3817 ; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm31
3818 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0
3819 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u]
3820 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15]
3821 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
3822 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
3823 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
3824 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3825 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
3826 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
3827 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm6
3828 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11
3829 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7
3830 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
3831 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7
3832 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
3833 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14
3834 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1
3835 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
3836 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20
3837 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
3838 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
3839 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
3840 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3
3841 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15
3842 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
3843 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6
3844 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm4
3845 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
3846 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm9
3847 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
3848 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3849 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm0
3850 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
3851 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
3852 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
3853 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
3854 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
3855 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
3856 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3857 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3
3858 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
3859 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3860 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
3861 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4
3862 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
3863 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm5
3864 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
3865 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
3866 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
3867 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
3868 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
3869 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm3
3870 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
3871 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
3872 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5
3873 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5
3874 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
3875 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6
3876 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
3877 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
3878 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6
3879 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3880 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
3881 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15]
3882 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
3883 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3884 ; AVX512-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
3885 ; AVX512-FCP-NEXT: kmovw %ecx, %k1
3886 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1}
3887 ; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1}
3888 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1}
3889 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
3890 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
3891 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
3892 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax)
3893 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax)
3894 ; AVX512-FCP-NEXT: vzeroupper
3895 ; AVX512-FCP-NEXT: retq
3897 ; AVX512DQ-LABEL: store_i8_stride8_vf32:
3898 ; AVX512DQ: # %bb.0:
3899 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3900 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
3901 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
3902 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm10
3903 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
3904 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11
3905 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3906 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19
3907 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20
3908 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3909 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3910 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
3911 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3912 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
3913 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
3914 ; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm12
3915 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
3916 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm13
3917 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3918 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm21
3919 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22
3920 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5]
3921 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7]
3922 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
3923 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
3924 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15]
3925 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3926 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3927 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3928 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0
3929 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7]
3930 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
3931 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
3932 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
3933 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
3934 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15]
3935 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16
3936 ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5
3937 ; AVX512DQ-NEXT: vmovdqa (%rax), %xmm6
3938 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
3939 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5]
3940 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7]
3941 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9
3942 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm7
3943 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8
3944 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
3945 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7]
3946 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7]
3947 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1
3948 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
3949 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3950 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15]
3951 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7]
3952 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3953 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
3954 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
3955 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7]
3956 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7]
3957 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9
3958 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
3959 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15]
3960 ; AVX512DQ-NEXT: movw $-21846, %cx # imm = 0xAAAA
3961 ; AVX512DQ-NEXT: kmovw %ecx, %k1
3962 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1}
3963 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
3964 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3965 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3966 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3]
3967 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
3968 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
3969 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5]
3970 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7]
3971 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2
3972 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1
3973 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
3974 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
3975 ; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm14
3976 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3977 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3978 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3979 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7]
3980 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
3981 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
3982 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7]
3983 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
3984 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2
3985 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
3986 ; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15
3987 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17
3988 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
3989 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
3990 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7]
3991 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18
3992 ; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm3
3993 ; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm9
3994 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
3995 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7]
3996 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7]
3997 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
3998 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7]
3999 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4000 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
4001 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7]
4002 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
4003 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
4004 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7]
4005 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4006 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15]
4007 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
4008 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
4009 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15]
4010 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1}
4011 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
4012 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4013 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4014 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
4015 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4016 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
4017 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5]
4018 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7]
4019 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
4020 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
4021 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7]
4022 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
4023 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4024 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4025 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4026 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7]
4027 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
4028 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
4029 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
4030 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
4031 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
4032 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
4033 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10
4034 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
4035 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
4036 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
4037 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
4038 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15]
4039 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
4040 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
4041 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
4042 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
4043 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4044 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
4045 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4046 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4047 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
4048 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
4049 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
4050 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
4051 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
4052 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
4053 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
4054 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1}
4055 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0
4056 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1
4057 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4058 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4059 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4060 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
4061 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4062 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3
4063 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4
4064 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
4065 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
4066 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7]
4067 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4
4068 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
4069 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7]
4070 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
4071 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4072 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4073 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4074 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
4075 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
4076 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
4077 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
4078 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
4079 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
4080 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
4081 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4082 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
4083 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
4084 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
4085 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4086 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
4087 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
4088 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7]
4089 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
4090 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7]
4091 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4092 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1
4093 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4094 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
4095 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
4096 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
4097 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
4098 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
4099 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
4100 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
4101 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
4102 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
4103 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4104 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
4105 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax)
4106 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax)
4107 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax)
4108 ; AVX512DQ-NEXT: vzeroupper
4109 ; AVX512DQ-NEXT: retq
4111 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf32:
4112 ; AVX512DQ-FCP: # %bb.0:
4113 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4114 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4115 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1
4116 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
4117 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4118 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
4119 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm20
4120 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4121 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
4122 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1
4123 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18
4124 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
4125 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
4126 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
4127 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
4128 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
4129 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3
4130 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854]
4131 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
4132 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15]
4133 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
4134 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4135 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
4136 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798]
4137 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm3
4138 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4139 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4140 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
4141 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17
4142 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1
4143 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm14
4144 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
4145 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
4146 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4147 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
4148 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1
4149 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
4150 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm12
4151 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm13
4152 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
4153 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
4154 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
4155 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3
4156 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25
4157 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
4158 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
4159 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4160 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
4161 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
4162 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4163 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
4164 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
4165 ; AVX512DQ-FCP-NEXT: movw $-21846, %r11w # imm = 0xAAAA
4166 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
4167 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1}
4168 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm10
4169 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm9
4170 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
4171 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0
4172 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm3
4173 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm7
4174 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm6
4175 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
4176 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
4177 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u]
4178 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15]
4179 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4180 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2
4181 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm29
4182 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
4183 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u]
4184 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
4185 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16
4186 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm8
4187 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rax), %xmm4
4188 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
4189 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0
4190 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm5
4191 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm3
4192 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
4193 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
4194 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15]
4195 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1
4196 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2
4197 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
4198 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,0,1,8,9,10,11,2,3,2,3,4,5,2,3,4,5,4,5,8,9,10,11,6,7,6,7]
4199 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4200 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm2
4201 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
4202 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1}
4203 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
4204 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
4205 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4206 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
4207 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10
4208 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
4209 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
4210 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm6
4211 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15]
4212 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm9
4213 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
4214 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4215 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm15
4216 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
4217 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1
4218 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15]
4219 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
4220 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
4221 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
4222 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
4223 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
4224 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
4225 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm3
4226 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6
4227 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4
4228 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
4229 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm8
4230 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
4231 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7
4232 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
4233 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
4234 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1}
4235 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1
4236 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
4237 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
4238 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
4239 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
4240 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
4241 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3
4242 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
4243 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
4244 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm4
4245 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
4246 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
4247 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm4
4248 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4249 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
4250 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
4251 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
4252 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2
4253 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
4254 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
4255 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
4256 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm4
4257 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
4258 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
4259 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
4260 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
4261 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
4262 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
4263 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1}
4264 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4265 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
4266 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
4267 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
4268 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax)
4269 ; AVX512DQ-FCP-NEXT: vzeroupper
4270 ; AVX512DQ-FCP-NEXT: retq
4272 ; AVX512BW-LABEL: store_i8_stride8_vf32:
4273 ; AVX512BW: # %bb.0:
4274 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4275 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4276 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
4277 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
4278 ; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm11
4279 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2
4280 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12
4281 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4282 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4283 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
4284 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
4285 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
4286 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
4287 ; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm13
4288 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4
4289 ; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm14
4290 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
4291 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7]
4292 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
4293 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7]
4294 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
4295 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
4296 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15]
4297 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0]
4298 ; AVX512BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0
4299 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
4300 ; AVX512BW-NEXT: vmovdqa (%r11), %xmm6
4301 ; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm15
4302 ; AVX512BW-NEXT: vmovdqa (%r10), %xmm7
4303 ; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm17
4304 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
4305 ; AVX512BW-NEXT: vmovdqa (%r9), %xmm8
4306 ; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm18
4307 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm9
4308 ; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm19
4309 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
4310 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
4311 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20
4312 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
4313 ; AVX512BW-NEXT: kmovd %ecx, %k1
4314 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1}
4315 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4316 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
4317 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1]
4318 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
4319 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5
4320 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
4321 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7]
4322 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero
4323 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7]
4324 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero
4325 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10
4326 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15]
4327 ; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16
4328 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16
4329 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7]
4330 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7]
4331 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10
4332 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1}
4333 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
4334 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
4335 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1]
4336 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
4337 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
4338 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
4339 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7]
4340 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero
4341 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7]
4342 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero
4343 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12
4344 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15]
4345 ; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5
4346 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5
4347 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15]
4348 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15]
4349 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11
4350 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1}
4351 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4352 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4353 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
4354 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
4355 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2
4356 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
4357 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
4358 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
4359 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7]
4360 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
4361 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
4362 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
4363 ; AVX512BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1
4364 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
4365 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
4366 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4367 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3
4368 ; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
4369 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
4370 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax)
4371 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax)
4372 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
4373 ; AVX512BW-NEXT: vzeroupper
4374 ; AVX512BW-NEXT: retq
4376 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf32:
4377 ; AVX512BW-FCP: # %bb.0:
4378 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4379 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4380 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
4381 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm0
4382 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1
4383 ; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm2
4384 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3
4385 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
4386 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm5
4387 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6
4388 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm7
4389 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8
4390 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
4391 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
4392 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9
4393 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm4
4394 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11
4395 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
4396 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13
4397 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
4398 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15
4399 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16
4400 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17
4401 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18
4402 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15]
4403 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
4404 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19
4405 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
4406 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
4407 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1}
4408 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4409 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
4410 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14
4411 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
4412 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
4413 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21
4414 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1}
4415 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4416 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
4417 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm10, %zmm3
4418 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
4419 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
4420 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6
4421 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1}
4422 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
4423 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
4424 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1
4425 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
4426 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7]
4427 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2
4428 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
4429 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
4430 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
4431 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax)
4432 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax)
4433 ; AVX512BW-FCP-NEXT: vzeroupper
4434 ; AVX512BW-FCP-NEXT: retq
4436 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf32:
4437 ; AVX512DQ-BW: # %bb.0:
4438 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4439 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4440 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
4441 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
4442 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm11
4443 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2
4444 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm12
4445 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4446 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4447 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
4448 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
4449 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
4450 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
4451 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm13
4452 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4
4453 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm14
4454 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
4455 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7]
4456 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
4457 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7]
4458 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
4459 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
4460 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15]
4461 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0]
4462 ; AVX512DQ-BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0
4463 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
4464 ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm6
4465 ; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm15
4466 ; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7
4467 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm17
4468 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
4469 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8
4470 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm18
4471 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm9
4472 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm19
4473 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
4474 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
4475 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20
4476 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
4477 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
4478 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1}
4479 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4480 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
4481 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1]
4482 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
4483 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5
4484 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
4485 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7]
4486 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero
4487 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7]
4488 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero
4489 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10
4490 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15]
4491 ; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16
4492 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16
4493 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7]
4494 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7]
4495 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10
4496 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1}
4497 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
4498 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
4499 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1]
4500 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
4501 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
4502 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
4503 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7]
4504 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero
4505 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7]
4506 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero
4507 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12
4508 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15]
4509 ; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5
4510 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5
4511 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15]
4512 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15]
4513 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11
4514 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1}
4515 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4516 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4517 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
4518 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
4519 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2
4520 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
4521 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
4522 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
4523 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7]
4524 ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
4525 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
4526 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
4527 ; AVX512DQ-BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1
4528 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
4529 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
4530 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4531 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3
4532 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
4533 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
4534 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax)
4535 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rax)
4536 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
4537 ; AVX512DQ-BW-NEXT: vzeroupper
4538 ; AVX512DQ-BW-NEXT: retq
4540 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf32:
4541 ; AVX512DQ-BW-FCP: # %bb.0:
4542 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4543 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4544 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
4545 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm0
4546 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1
4547 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm2
4548 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3
4549 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
4550 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm5
4551 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6
4552 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm7
4553 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8
4554 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
4555 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
4556 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9
4557 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm4
4558 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11
4559 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
4560 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13
4561 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
4562 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15
4563 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16
4564 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17
4565 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18
4566 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15]
4567 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
4568 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19
4569 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
4570 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
4571 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1}
4572 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4573 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
4574 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14
4575 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
4576 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
4577 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21
4578 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1}
4579 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4580 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
4581 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm10, %zmm3
4582 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
4583 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
4584 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6
4585 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1}
4586 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
4587 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
4588 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1
4589 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
4590 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7]
4591 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2
4592 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
4593 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
4594 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
4595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax)
4596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax)
4597 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4598 ; AVX512DQ-BW-FCP-NEXT: retq
4599 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
4600 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
4601 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64
4602 %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64
4603 %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64
4604 %in.vec5 = load <32 x i8>, ptr %in.vecptr5, align 64
4605 %in.vec6 = load <32 x i8>, ptr %in.vecptr6, align 64
4606 %in.vec7 = load <32 x i8>, ptr %in.vecptr7, align 64
4607 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4608 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4609 %3 = shufflevector <32 x i8> %in.vec4, <32 x i8> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4610 %4 = shufflevector <32 x i8> %in.vec6, <32 x i8> %in.vec7, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4611 %5 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
4612 %6 = shufflevector <64 x i8> %3, <64 x i8> %4, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
4613 %7 = shufflevector <128 x i8> %5, <128 x i8> %6, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
4614 %interleaved.vec = shufflevector <256 x i8> %7, <256 x i8> poison, <256 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 224, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 225, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 226, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 227, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 228, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 229, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 230, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 231, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 232, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 233, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 234, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 235, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 236, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 237, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 238, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 239, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 240, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 241, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 242, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 243, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 244, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 245, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 246, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 247, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 248, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 249, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 250, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 251, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 252, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 253, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 254, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223, i32 255>
4615 store <256 x i8> %interleaved.vec, ptr %out.vec, align 64
4616 ret void
4617 }
4619 define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
4620 ; SSE-LABEL: store_i8_stride8_vf64:
4621 ; SSE: # %bb.0:
4622 ; SSE-NEXT: subq $312, %rsp # imm = 0x138
4623 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4624 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
4625 ; SSE-NEXT: movdqa (%rdi), %xmm3
4626 ; SSE-NEXT: movdqa (%rsi), %xmm5
4627 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4628 ; SSE-NEXT: movdqa (%rdx), %xmm4
4629 ; SSE-NEXT: movdqa (%rcx), %xmm8
4630 ; SSE-NEXT: movdqa (%r8), %xmm6
4631 ; SSE-NEXT: movdqa (%r9), %xmm9
4632 ; SSE-NEXT: movdqa (%r10), %xmm7
4633 ; SSE-NEXT: movdqa (%rax), %xmm10
4634 ; SSE-NEXT: movdqa %xmm7, %xmm0
4635 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4636 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
4637 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,1]
4638 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,0]
4639 ; SSE-NEXT: movdqa %xmm13, %xmm12
4640 ; SSE-NEXT: pandn %xmm2, %xmm12
4641 ; SSE-NEXT: movdqa %xmm6, %xmm11
4642 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
4643 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7]
4644 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,1]
4645 ; SSE-NEXT: pand %xmm13, %xmm14
4646 ; SSE-NEXT: por %xmm12, %xmm14
4647 ; SSE-NEXT: movdqa %xmm4, %xmm12
4648 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
4649 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,0,2,1,4,5,6,7]
4650 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
4651 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535]
4652 ; SSE-NEXT: movdqa %xmm1, %xmm15
4653 ; SSE-NEXT: pandn %xmm2, %xmm15
4654 ; SSE-NEXT: movdqa %xmm3, %xmm2
4655 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
4656 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0]
4657 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
4658 ; SSE-NEXT: pand %xmm1, %xmm5
4659 ; SSE-NEXT: por %xmm15, %xmm5
4660 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3]
4661 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
4662 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1]
4663 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4664 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
4665 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4666 ; SSE-NEXT: movdqa %xmm13, %xmm14
4667 ; SSE-NEXT: pandn %xmm5, %xmm14
4668 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7]
4669 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4670 ; SSE-NEXT: pand %xmm13, %xmm5
4671 ; SSE-NEXT: por %xmm14, %xmm5
4672 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7]
4673 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3]
4674 ; SSE-NEXT: movdqa %xmm1, %xmm15
4675 ; SSE-NEXT: pandn %xmm14, %xmm15
4676 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1]
4677 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
4678 ; SSE-NEXT: pand %xmm1, %xmm14
4679 ; SSE-NEXT: por %xmm15, %xmm14
4680 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
4681 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
4682 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
4683 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4684 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5]
4685 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
4686 ; SSE-NEXT: movdqa %xmm13, %xmm14
4687 ; SSE-NEXT: pandn %xmm5, %xmm14
4688 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,7]
4689 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
4690 ; SSE-NEXT: pand %xmm13, %xmm5
4691 ; SSE-NEXT: por %xmm14, %xmm5
4692 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5]
4693 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3]
4694 ; SSE-NEXT: movdqa %xmm1, %xmm15
4695 ; SSE-NEXT: pandn %xmm14, %xmm15
4696 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
4697 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
4698 ; SSE-NEXT: pand %xmm1, %xmm14
4699 ; SSE-NEXT: por %xmm15, %xmm14
4700 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
4701 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
4702 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
4703 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4704 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
4705 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4706 ; SSE-NEXT: movdqa %xmm13, %xmm5
4707 ; SSE-NEXT: pandn %xmm0, %xmm5
4708 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7]
4709 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
4710 ; SSE-NEXT: pand %xmm13, %xmm11
4711 ; SSE-NEXT: por %xmm5, %xmm11
4712 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7]
4713 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
4714 ; SSE-NEXT: movdqa %xmm1, %xmm5
4715 ; SSE-NEXT: pandn %xmm0, %xmm5
4716 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
4717 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,5,5,5,5]
4718 ; SSE-NEXT: pand %xmm1, %xmm12
4719 ; SSE-NEXT: por %xmm5, %xmm12
4720 ; SSE-NEXT: movdqa 16(%r8), %xmm0
4721 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,3,2,3]
4722 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,2,3]
4723 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
4724 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4725 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
4726 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,0,2,1,4,5,6,7]
4727 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4728 ; SSE-NEXT: movdqa %xmm13, %xmm10
4729 ; SSE-NEXT: pandn %xmm5, %xmm10
4730 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
4731 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
4732 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4733 ; SSE-NEXT: pand %xmm13, %xmm5
4734 ; SSE-NEXT: por %xmm10, %xmm5
4735 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
4736 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
4737 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7]
4738 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3]
4739 ; SSE-NEXT: movdqa %xmm1, %xmm9
4740 ; SSE-NEXT: pandn %xmm8, %xmm9
4741 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4742 ; SSE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15]
4743 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,0,0]
4744 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
4745 ; SSE-NEXT: pand %xmm1, %xmm8
4746 ; SSE-NEXT: por %xmm9, %xmm8
4747 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4748 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
4749 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4750 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7]
4751 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4752 ; SSE-NEXT: movdqa %xmm13, %xmm8
4753 ; SSE-NEXT: pandn %xmm5, %xmm8
4754 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7]
4755 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
4756 ; SSE-NEXT: pand %xmm13, %xmm5
4757 ; SSE-NEXT: por %xmm8, %xmm5
4758 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
4759 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7]
4760 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3]
4761 ; SSE-NEXT: movdqa %xmm1, %xmm9
4762 ; SSE-NEXT: pandn %xmm8, %xmm9
4763 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1]
4764 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
4765 ; SSE-NEXT: pand %xmm1, %xmm8
4766 ; SSE-NEXT: por %xmm9, %xmm8
4767 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4768 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
4769 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4770 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5]
4771 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
4772 ; SSE-NEXT: movdqa %xmm13, %xmm8
4773 ; SSE-NEXT: pandn %xmm5, %xmm8
4774 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,5,7]
4775 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
4776 ; SSE-NEXT: pand %xmm13, %xmm9
4777 ; SSE-NEXT: por %xmm8, %xmm9
4778 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5]
4779 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
4780 ; SSE-NEXT: movdqa %xmm1, %xmm8
4781 ; SSE-NEXT: pandn %xmm5, %xmm8
4782 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2]
4783 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,5,5,5,5]
4784 ; SSE-NEXT: pand %xmm1, %xmm10
4785 ; SSE-NEXT: por %xmm8, %xmm10
4786 ; SSE-NEXT: movdqa 16(%r10), %xmm5
4787 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,3,2,3]
4788 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3]
4789 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4790 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4791 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
4792 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4793 ; SSE-NEXT: movdqa %xmm13, %xmm8
4794 ; SSE-NEXT: pandn %xmm7, %xmm8
4795 ; SSE-NEXT: movdqa 16(%rax), %xmm7
4796 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4797 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7]
4798 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
4799 ; SSE-NEXT: pand %xmm13, %xmm6
4800 ; SSE-NEXT: por %xmm8, %xmm6
4801 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
4802 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
4803 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
4804 ; SSE-NEXT: movdqa %xmm1, %xmm8
4805 ; SSE-NEXT: pandn %xmm4, %xmm8
4806 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
4807 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
4808 ; SSE-NEXT: pand %xmm1, %xmm3
4809 ; SSE-NEXT: por %xmm8, %xmm3
4810 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
4811 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
4812 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4813 ; SSE-NEXT: movdqa %xmm5, %xmm10
4814 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
4815 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7]
4816 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
4817 ; SSE-NEXT: movdqa %xmm13, %xmm4
4818 ; SSE-NEXT: pandn %xmm3, %xmm4
4819 ; SSE-NEXT: movdqa 16(%r9), %xmm6
4820 ; SSE-NEXT: movdqa %xmm0, %xmm11
4821 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
4822 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7]
4823 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1]
4824 ; SSE-NEXT: pand %xmm13, %xmm14
4825 ; SSE-NEXT: por %xmm4, %xmm14
4826 ; SSE-NEXT: movdqa 16(%rdx), %xmm3
4827 ; SSE-NEXT: movdqa 16(%rcx), %xmm8
4828 ; SSE-NEXT: movdqa %xmm3, %xmm12
4829 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
4830 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7]
4831 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3]
4832 ; SSE-NEXT: movdqa %xmm1, %xmm15
4833 ; SSE-NEXT: pandn %xmm4, %xmm15
4834 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
4835 ; SSE-NEXT: movdqa 16(%rsi), %xmm9
4836 ; SSE-NEXT: movdqa %xmm4, %xmm2
4837 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4838 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0]
4839 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4840 ; SSE-NEXT: pand %xmm1, %xmm7
4841 ; SSE-NEXT: por %xmm15, %xmm7
4842 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3]
4843 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4844 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
4845 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4846 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
4847 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
4848 ; SSE-NEXT: movdqa %xmm13, %xmm14
4849 ; SSE-NEXT: pandn %xmm7, %xmm14
4850 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7]
4851 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
4852 ; SSE-NEXT: pand %xmm13, %xmm7
4853 ; SSE-NEXT: por %xmm14, %xmm7
4854 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7]
4855 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3]
4856 ; SSE-NEXT: movdqa %xmm1, %xmm15
4857 ; SSE-NEXT: pandn %xmm14, %xmm15
4858 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1]
4859 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
4860 ; SSE-NEXT: pand %xmm1, %xmm14
4861 ; SSE-NEXT: por %xmm15, %xmm14
4862 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
4863 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
4864 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
4865 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4866 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,4,6,5]
4867 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4868 ; SSE-NEXT: movdqa %xmm13, %xmm14
4869 ; SSE-NEXT: pandn %xmm7, %xmm14
4870 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,4,5,5,7]
4871 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4872 ; SSE-NEXT: pand %xmm13, %xmm7
4873 ; SSE-NEXT: por %xmm14, %xmm7
4874 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5]
4875 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3]
4876 ; SSE-NEXT: movdqa %xmm1, %xmm15
4877 ; SSE-NEXT: pandn %xmm14, %xmm15
4878 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
4879 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
4880 ; SSE-NEXT: pand %xmm1, %xmm14
4881 ; SSE-NEXT: por %xmm15, %xmm14
4882 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
4883 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
4884 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
4885 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4886 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7]
4887 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4888 ; SSE-NEXT: movdqa %xmm13, %xmm10
4889 ; SSE-NEXT: pandn %xmm7, %xmm10
4890 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,6,5,7,7]
4891 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
4892 ; SSE-NEXT: pand %xmm13, %xmm7
4893 ; SSE-NEXT: por %xmm10, %xmm7
4894 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7]
4895 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3]
4896 ; SSE-NEXT: movdqa %xmm1, %xmm11
4897 ; SSE-NEXT: pandn %xmm10, %xmm11
4898 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3]
4899 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
4900 ; SSE-NEXT: pand %xmm1, %xmm10
4901 ; SSE-NEXT: por %xmm11, %xmm10
4902 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
4903 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3]
4904 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
4905 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4906 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4907 ; SSE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
4908 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7]
4909 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
4910 ; SSE-NEXT: movdqa %xmm13, %xmm10
4911 ; SSE-NEXT: pandn %xmm7, %xmm10
4912 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4913 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7]
4914 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
4915 ; SSE-NEXT: pand %xmm13, %xmm6
4916 ; SSE-NEXT: por %xmm10, %xmm6
4917 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
4918 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7]
4919 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
4920 ; SSE-NEXT: movdqa %xmm1, %xmm8
4921 ; SSE-NEXT: pandn %xmm7, %xmm8
4922 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
4923 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0]
4924 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4925 ; SSE-NEXT: pand %xmm1, %xmm7
4926 ; SSE-NEXT: por %xmm8, %xmm7
4927 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
4928 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
4929 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
4930 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4931 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
4932 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
4933 ; SSE-NEXT: movdqa %xmm13, %xmm7
4934 ; SSE-NEXT: pandn %xmm6, %xmm7
4935 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7]
4936 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
4937 ; SSE-NEXT: pand %xmm13, %xmm6
4938 ; SSE-NEXT: por %xmm7, %xmm6
4939 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
4940 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
4941 ; SSE-NEXT: movdqa %xmm1, %xmm8
4942 ; SSE-NEXT: pandn %xmm7, %xmm8
4943 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
4944 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4945 ; SSE-NEXT: pand %xmm1, %xmm7
4946 ; SSE-NEXT: por %xmm8, %xmm7
4947 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
4948 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
4949 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
4950 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4951 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
4952 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
4953 ; SSE-NEXT: movdqa %xmm13, %xmm7
4954 ; SSE-NEXT: pandn %xmm6, %xmm7
4955 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,7]
4956 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
4957 ; SSE-NEXT: pand %xmm13, %xmm6
4958 ; SSE-NEXT: por %xmm7, %xmm6
4959 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,4,6,5]
4960 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
4961 ; SSE-NEXT: movdqa %xmm1, %xmm8
4962 ; SSE-NEXT: pandn %xmm7, %xmm8
4963 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2]
4964 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
4965 ; SSE-NEXT: pand %xmm1, %xmm7
4966 ; SSE-NEXT: por %xmm8, %xmm7
4967 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
4968 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
4969 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
4970 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4971 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
4972 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
4973 ; SSE-NEXT: movdqa %xmm13, %xmm6
4974 ; SSE-NEXT: pandn %xmm5, %xmm6
4975 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
4976 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4977 ; SSE-NEXT: pand %xmm13, %xmm0
4978 ; SSE-NEXT: por %xmm6, %xmm0
4979 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
4980 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
4981 ; SSE-NEXT: movdqa %xmm1, %xmm5
4982 ; SSE-NEXT: pandn %xmm3, %xmm5
4983 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
4984 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
4985 ; SSE-NEXT: pand %xmm1, %xmm3
4986 ; SSE-NEXT: por %xmm5, %xmm3
4987 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
4988 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
4989 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4990 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4991 ; SSE-NEXT: movdqa 32(%r10), %xmm0
4992 ; SSE-NEXT: movdqa 32(%rax), %xmm2
4993 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4994 ; SSE-NEXT: movdqa %xmm0, %xmm10
4995 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
4996 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7]
4997 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
4998 ; SSE-NEXT: movdqa %xmm13, %xmm4
4999 ; SSE-NEXT: pandn %xmm3, %xmm4
5000 ; SSE-NEXT: movdqa 32(%r8), %xmm3
5001 ; SSE-NEXT: movdqa 32(%r9), %xmm7
5002 ; SSE-NEXT: movdqa %xmm3, %xmm11
5003 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
5004 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7]
5005 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,1]
5006 ; SSE-NEXT: pand %xmm13, %xmm14
5007 ; SSE-NEXT: por %xmm4, %xmm14
5008 ; SSE-NEXT: movdqa 32(%rdx), %xmm4
5009 ; SSE-NEXT: movdqa 32(%rcx), %xmm8
5010 ; SSE-NEXT: movdqa %xmm4, %xmm12
5011 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
5012 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7]
5013 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3]
5014 ; SSE-NEXT: movdqa %xmm1, %xmm15
5015 ; SSE-NEXT: pandn %xmm5, %xmm15
5016 ; SSE-NEXT: movdqa 32(%rdi), %xmm5
5017 ; SSE-NEXT: movdqa 32(%rsi), %xmm9
5018 ; SSE-NEXT: movdqa %xmm5, %xmm2
5019 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
5020 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0]
5021 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
5022 ; SSE-NEXT: pand %xmm1, %xmm6
5023 ; SSE-NEXT: por %xmm15, %xmm6
5024 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,3,2,3]
5025 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5026 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
5027 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5028 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7]
5029 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5030 ; SSE-NEXT: movdqa %xmm13, %xmm14
5031 ; SSE-NEXT: pandn %xmm6, %xmm14
5032 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7]
5033 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5034 ; SSE-NEXT: pand %xmm13, %xmm6
5035 ; SSE-NEXT: por %xmm14, %xmm6
5036 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,2,2,3,4,5,6,7]
5037 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3]
5038 ; SSE-NEXT: movdqa %xmm1, %xmm15
5039 ; SSE-NEXT: pandn %xmm14, %xmm15
5040 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1]
5041 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5042 ; SSE-NEXT: pand %xmm1, %xmm14
5043 ; SSE-NEXT: por %xmm15, %xmm14
5044 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5045 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
5046 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
5047 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5048 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,4,6,5]
5049 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5050 ; SSE-NEXT: movdqa %xmm13, %xmm14
5051 ; SSE-NEXT: pandn %xmm6, %xmm14
5052 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,5,5,7]
5053 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5054 ; SSE-NEXT: pand %xmm13, %xmm6
5055 ; SSE-NEXT: por %xmm14, %xmm6
5056 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,4,6,5]
5057 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3]
5058 ; SSE-NEXT: movdqa %xmm1, %xmm15
5059 ; SSE-NEXT: pandn %xmm14, %xmm15
5060 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
5061 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5062 ; SSE-NEXT: pand %xmm1, %xmm14
5063 ; SSE-NEXT: por %xmm15, %xmm14
5064 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5065 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
5066 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
5067 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5068 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7]
5069 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5070 ; SSE-NEXT: movdqa %xmm13, %xmm10
5071 ; SSE-NEXT: pandn %xmm6, %xmm10
5072 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,5,7,7]
5073 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5074 ; SSE-NEXT: pand %xmm13, %xmm6
5075 ; SSE-NEXT: por %xmm10, %xmm6
5076 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7]
5077 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3]
5078 ; SSE-NEXT: movdqa %xmm1, %xmm11
5079 ; SSE-NEXT: pandn %xmm10, %xmm11
5080 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3]
5081 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
5082 ; SSE-NEXT: pand %xmm1, %xmm10
5083 ; SSE-NEXT: por %xmm11, %xmm10
5084 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5085 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3]
5086 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
5087 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5088 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5089 ; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
5090 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7]
5091 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5092 ; SSE-NEXT: movdqa %xmm13, %xmm10
5093 ; SSE-NEXT: pandn %xmm6, %xmm10
5094 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
5095 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7]
5096 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5097 ; SSE-NEXT: pand %xmm13, %xmm6
5098 ; SSE-NEXT: por %xmm10, %xmm6
5099 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
5100 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7]
5101 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
5102 ; SSE-NEXT: movdqa %xmm1, %xmm8
5103 ; SSE-NEXT: pandn %xmm7, %xmm8
5104 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
5105 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0]
5106 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
5107 ; SSE-NEXT: pand %xmm1, %xmm7
5108 ; SSE-NEXT: por %xmm8, %xmm7
5109 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5110 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
5111 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
5112 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5113 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
5114 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5115 ; SSE-NEXT: movdqa %xmm13, %xmm7
5116 ; SSE-NEXT: pandn %xmm6, %xmm7
5117 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7]
5118 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5119 ; SSE-NEXT: pand %xmm13, %xmm6
5120 ; SSE-NEXT: por %xmm7, %xmm6
5121 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7]
5122 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3]
5123 ; SSE-NEXT: movdqa %xmm1, %xmm8
5124 ; SSE-NEXT: pandn %xmm7, %xmm8
5125 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
5126 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
5127 ; SSE-NEXT: pand %xmm1, %xmm7
5128 ; SSE-NEXT: por %xmm8, %xmm7
5129 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5130 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
5131 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
5132 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
5133 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5]
5134 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5135 ; SSE-NEXT: movdqa %xmm13, %xmm7
5136 ; SSE-NEXT: pandn %xmm6, %xmm7
5137 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7]
5138 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5139 ; SSE-NEXT: pand %xmm13, %xmm6
5140 ; SSE-NEXT: por %xmm7, %xmm6
5141 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,4,6,5]
5142 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
5143 ; SSE-NEXT: movdqa %xmm1, %xmm8
5144 ; SSE-NEXT: pandn %xmm7, %xmm8
5145 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2]
5146 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
5147 ; SSE-NEXT: pand %xmm1, %xmm7
5148 ; SSE-NEXT: por %xmm8, %xmm7
5149 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
5150 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
5151 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
5152 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5153 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5154 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5155 ; SSE-NEXT: movdqa %xmm13, %xmm6
5156 ; SSE-NEXT: pandn %xmm0, %xmm6
5157 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,7,7]
5158 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5159 ; SSE-NEXT: pand %xmm13, %xmm0
5160 ; SSE-NEXT: por %xmm6, %xmm0
5161 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7]
5162 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5163 ; SSE-NEXT: movdqa %xmm1, %xmm4
5164 ; SSE-NEXT: pandn %xmm3, %xmm4
5165 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3]
5166 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
5167 ; SSE-NEXT: pand %xmm1, %xmm3
5168 ; SSE-NEXT: por %xmm4, %xmm3
5169 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
5170 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
5171 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
5172 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5173 ; SSE-NEXT: movdqa 48(%r10), %xmm9
5174 ; SSE-NEXT: movdqa 48(%rax), %xmm0
5175 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5176 ; SSE-NEXT: movdqa %xmm9, %xmm6
5177 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
5178 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7]
5179 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5180 ; SSE-NEXT: movdqa %xmm13, %xmm3
5181 ; SSE-NEXT: pandn %xmm0, %xmm3
5182 ; SSE-NEXT: movdqa 48(%r8), %xmm8
5183 ; SSE-NEXT: movdqa 48(%r9), %xmm0
5184 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5185 ; SSE-NEXT: movdqa %xmm8, %xmm4
5186 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
5187 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7]
5188 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,1]
5189 ; SSE-NEXT: pand %xmm13, %xmm10
5190 ; SSE-NEXT: por %xmm3, %xmm10
5191 ; SSE-NEXT: movdqa 48(%rdx), %xmm7
5192 ; SSE-NEXT: movdqa 48(%rcx), %xmm12
5193 ; SSE-NEXT: movdqa %xmm7, %xmm3
5194 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
5195 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7]
5196 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
5197 ; SSE-NEXT: movdqa %xmm1, %xmm15
5198 ; SSE-NEXT: pandn %xmm0, %xmm15
5199 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
5200 ; SSE-NEXT: movdqa 48(%rsi), %xmm11
5201 ; SSE-NEXT: movdqa %xmm5, %xmm0
5202 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
5203 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0]
5204 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5205 ; SSE-NEXT: pand %xmm1, %xmm14
5206 ; SSE-NEXT: por %xmm15, %xmm14
5207 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3]
5208 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3]
5209 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
5210 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5211 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7]
5212 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
5213 ; SSE-NEXT: movdqa %xmm13, %xmm14
5214 ; SSE-NEXT: pandn %xmm10, %xmm14
5215 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7]
5216 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
5217 ; SSE-NEXT: pand %xmm13, %xmm10
5218 ; SSE-NEXT: por %xmm14, %xmm10
5219 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7]
5220 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3]
5221 ; SSE-NEXT: movdqa %xmm1, %xmm15
5222 ; SSE-NEXT: pandn %xmm14, %xmm15
5223 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1]
5224 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5225 ; SSE-NEXT: pand %xmm1, %xmm14
5226 ; SSE-NEXT: por %xmm15, %xmm14
5227 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,3,2,3]
5228 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,2,2,3]
5229 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
5230 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,6,5]
5231 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
5232 ; SSE-NEXT: movdqa %xmm13, %xmm14
5233 ; SSE-NEXT: pandn %xmm10, %xmm14
5234 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,5,7]
5235 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
5236 ; SSE-NEXT: pand %xmm13, %xmm10
5237 ; SSE-NEXT: por %xmm14, %xmm10
5238 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5]
5239 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3]
5240 ; SSE-NEXT: movdqa %xmm1, %xmm2
5241 ; SSE-NEXT: pandn %xmm14, %xmm2
5242 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,2,2,2]
5243 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
5244 ; SSE-NEXT: pand %xmm1, %xmm14
5245 ; SSE-NEXT: por %xmm2, %xmm14
5246 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,3,2,3]
5247 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,2,2,3]
5248 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
5249 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7]
5250 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5251 ; SSE-NEXT: movdqa %xmm13, %xmm6
5252 ; SSE-NEXT: pandn %xmm2, %xmm6
5253 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7]
5254 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5255 ; SSE-NEXT: pand %xmm13, %xmm2
5256 ; SSE-NEXT: por %xmm6, %xmm2
5257 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
5258 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5259 ; SSE-NEXT: movdqa %xmm1, %xmm4
5260 ; SSE-NEXT: pandn %xmm3, %xmm4
5261 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5262 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
5263 ; SSE-NEXT: pand %xmm1, %xmm0
5264 ; SSE-NEXT: por %xmm4, %xmm0
5265 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
5266 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
5267 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
5268 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5269 ; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15]
5270 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7]
5271 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5272 ; SSE-NEXT: movdqa %xmm13, %xmm2
5273 ; SSE-NEXT: pandn %xmm0, %xmm2
5274 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5275 ; SSE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15]
5276 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7]
5277 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5278 ; SSE-NEXT: pand %xmm13, %xmm0
5279 ; SSE-NEXT: por %xmm2, %xmm0
5280 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15]
5281 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7]
5282 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
5283 ; SSE-NEXT: movdqa %xmm1, %xmm3
5284 ; SSE-NEXT: pandn %xmm2, %xmm3
5285 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
5286 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0]
5287 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5288 ; SSE-NEXT: pand %xmm1, %xmm2
5289 ; SSE-NEXT: por %xmm3, %xmm2
5290 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
5291 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
5292 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5293 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
5294 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5295 ; SSE-NEXT: movdqa %xmm13, %xmm2
5296 ; SSE-NEXT: pandn %xmm0, %xmm2
5297 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7]
5298 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5299 ; SSE-NEXT: pand %xmm13, %xmm0
5300 ; SSE-NEXT: por %xmm2, %xmm0
5301 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7]
5302 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
5303 ; SSE-NEXT: movdqa %xmm1, %xmm3
5304 ; SSE-NEXT: pandn %xmm2, %xmm3
5305 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1]
5306 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5307 ; SSE-NEXT: pand %xmm1, %xmm2
5308 ; SSE-NEXT: por %xmm3, %xmm2
5309 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
5310 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
5311 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5312 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5]
5313 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5314 ; SSE-NEXT: movdqa %xmm13, %xmm3
5315 ; SSE-NEXT: pandn %xmm2, %xmm3
5316 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7]
5317 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5318 ; SSE-NEXT: pand %xmm13, %xmm2
5319 ; SSE-NEXT: por %xmm3, %xmm2
5320 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5]
5321 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5322 ; SSE-NEXT: movdqa %xmm1, %xmm11
5323 ; SSE-NEXT: pandn %xmm3, %xmm11
5324 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2]
5325 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
5326 ; SSE-NEXT: pand %xmm1, %xmm3
5327 ; SSE-NEXT: por %xmm11, %xmm3
5328 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
5329 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
5330 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5331 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7]
5332 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5333 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7]
5334 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
5335 ; SSE-NEXT: pand %xmm13, %xmm8
5336 ; SSE-NEXT: pandn %xmm2, %xmm13
5337 ; SSE-NEXT: por %xmm8, %xmm13
5338 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7]
5339 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
5340 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
5341 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
5342 ; SSE-NEXT: pand %xmm1, %xmm5
5343 ; SSE-NEXT: pandn %xmm2, %xmm1
5344 ; SSE-NEXT: por %xmm5, %xmm1
5345 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3]
5346 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5347 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
5348 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5349 ; SSE-NEXT: movdqa %xmm1, 496(%rax)
5350 ; SSE-NEXT: movdqa %xmm3, 480(%rax)
5351 ; SSE-NEXT: movdqa %xmm0, 464(%rax)
5352 ; SSE-NEXT: movdqa %xmm4, 448(%rax)
5353 ; SSE-NEXT: movdqa %xmm6, 432(%rax)
5354 ; SSE-NEXT: movdqa %xmm10, 416(%rax)
5355 ; SSE-NEXT: movdqa %xmm15, 400(%rax)
5356 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5357 ; SSE-NEXT: movaps %xmm0, 384(%rax)
5358 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5359 ; SSE-NEXT: movaps %xmm0, 368(%rax)
5360 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5361 ; SSE-NEXT: movaps %xmm0, 352(%rax)
5362 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
5363 ; SSE-NEXT: movaps %xmm0, 336(%rax)
5364 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5365 ; SSE-NEXT: movaps %xmm0, 320(%rax)
5366 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5367 ; SSE-NEXT: movaps %xmm0, 304(%rax)
5368 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5369 ; SSE-NEXT: movaps %xmm0, 288(%rax)
5370 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5371 ; SSE-NEXT: movaps %xmm0, 272(%rax)
5372 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5373 ; SSE-NEXT: movaps %xmm0, 256(%rax)
5374 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5375 ; SSE-NEXT: movaps %xmm0, 240(%rax)
5376 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5377 ; SSE-NEXT: movaps %xmm0, 224(%rax)
5378 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5379 ; SSE-NEXT: movaps %xmm0, 208(%rax)
5380 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5381 ; SSE-NEXT: movaps %xmm0, 192(%rax)
5382 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5383 ; SSE-NEXT: movaps %xmm0, 176(%rax)
5384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5385 ; SSE-NEXT: movaps %xmm0, 160(%rax)
5386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5387 ; SSE-NEXT: movaps %xmm0, 144(%rax)
5388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5389 ; SSE-NEXT: movaps %xmm0, 128(%rax)
5390 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5391 ; SSE-NEXT: movaps %xmm0, 112(%rax)
5392 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5393 ; SSE-NEXT: movaps %xmm0, 96(%rax)
5394 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5395 ; SSE-NEXT: movaps %xmm0, 80(%rax)
5396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5397 ; SSE-NEXT: movaps %xmm0, 64(%rax)
5398 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5399 ; SSE-NEXT: movaps %xmm0, 48(%rax)
5400 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5401 ; SSE-NEXT: movaps %xmm0, 32(%rax)
5402 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5403 ; SSE-NEXT: movaps %xmm0, 16(%rax)
5404 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5405 ; SSE-NEXT: movaps %xmm0, (%rax)
5406 ; SSE-NEXT: addq $312, %rsp # imm = 0x138
5407 ; SSE-NEXT: retq
5409 ; AVX-LABEL: store_i8_stride8_vf64:
5411 ; AVX-NEXT: subq $328, %rsp # imm = 0x148
5412 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5413 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
5414 ; AVX-NEXT: vmovdqa (%r10), %xmm1
5415 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
5416 ; AVX-NEXT: vmovdqa (%rax), %xmm0
5417 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5418 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5419 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
5420 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
5421 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
5422 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5423 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
5424 ; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm2
5425 ; AVX-NEXT: vmovdqa (%r9), %xmm3
5426 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5427 ; AVX-NEXT: vmovdqa (%r8), %xmm1
5428 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5429 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
5430 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7]
5431 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,7,7]
5432 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
5433 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
5434 ; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
5435 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm3
5436 ; AVX-NEXT: vmovdqa (%rcx), %xmm4
5437 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5438 ; AVX-NEXT: vmovdqa (%rdx), %xmm2
5439 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5440 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
5441 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5]
5442 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7]
5443 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
5444 ; AVX-NEXT: vmovdqa (%rsi), %xmm6
5445 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5446 ; AVX-NEXT: vmovdqa (%rdi), %xmm5
5447 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5448 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
5449 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
5450 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
5451 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,3,3,3]
5452 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
5453 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
5454 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
5455 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
5456 ; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4
5457 ; AVX-NEXT: vandps %ymm6, %ymm14, %ymm6
5458 ; AVX-NEXT: vorps %ymm4, %ymm6, %ymm4
5459 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
5460 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5461 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7]
5462 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
5463 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
5464 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7]
5465 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
5466 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
5467 ; AVX-NEXT: vmovdqa 48(%r10), %xmm1
5468 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5469 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
5470 ; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
5471 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
5472 ; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
5473 ; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
5474 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
5475 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
5476 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
5477 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
5478 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
5479 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
5480 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
5481 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
5482 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4
5483 ; AVX-NEXT: vmovdqa 48(%rax), %xmm2
5484 ; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3
5485 ; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4
5486 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
5487 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
5488 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5489 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
5490 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7]
5491 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7]
5492 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4
5493 ; AVX-NEXT: vmovdqa 48(%r9), %xmm12
5494 ; AVX-NEXT: vmovdqa 48(%r8), %xmm3
5495 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
5496 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7]
5497 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7]
5498 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
5499 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
5500 ; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4
5501 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
5502 ; AVX-NEXT: vandps %ymm5, %ymm9, %ymm5
5503 ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm9
5504 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm4
5505 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm6
5506 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
5507 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
5508 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
5509 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5510 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm0
5511 ; AVX-NEXT: vmovdqa 48(%rcx), %xmm5
5512 ; AVX-NEXT: vmovdqa 48(%rdx), %xmm7
5513 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
5514 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7]
5515 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5516 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7]
5517 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero
5518 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1
5519 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm13
5520 ; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
5521 ; AVX-NEXT: vorps %ymm1, %ymm13, %ymm1
5522 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7]
5523 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5524 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5]
5525 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,6,6,7]
5526 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1
5527 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,5,7]
5528 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7]
5529 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
5530 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5531 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
5532 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
5533 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
5534 ; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8
5535 ; AVX-NEXT: vorps %ymm1, %ymm8, %ymm1
5536 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5]
5537 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7]
5538 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
5539 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3]
5540 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
5541 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
5542 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5543 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
5544 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
5545 ; AVX-NEXT: vandnps %ymm8, %ymm14, %ymm8
5546 ; AVX-NEXT: vandps %ymm14, %ymm9, %ymm9
5547 ; AVX-NEXT: vorps %ymm8, %ymm9, %ymm8
5548 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4],ymm1[5],ymm8[6],ymm1[7]
5549 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5550 ; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload
5551 ; AVX-NEXT: # xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
5552 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7]
5553 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7]
5554 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8
5555 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
5556 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7]
5557 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
5558 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
5559 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5]
5560 ; AVX-NEXT: vmovaps %ymm11, %ymm12
5561 ; AVX-NEXT: vandnps %ymm3, %ymm11, %ymm3
5562 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
5563 ; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0
5564 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm3
5565 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
5566 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
5567 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
5568 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5569 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
5570 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
5571 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
5572 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
5573 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
5574 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
5575 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
5576 ; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
5577 ; AVX-NEXT: vandnps %ymm6, %ymm14, %ymm6
5578 ; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4
5579 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
5580 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5581 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
5582 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
5583 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
5584 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
5585 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
5586 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
5587 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5588 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
5589 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
5590 ; AVX-NEXT: vandps %ymm2, %ymm11, %ymm2
5591 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
5592 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
5593 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7]
5594 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
5595 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
5596 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
5597 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5598 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5599 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
5600 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
5601 ; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2
5602 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
5603 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
5604 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
5605 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5606 ; AVX-NEXT: vmovdqa 32(%r10), %xmm1
5607 ; AVX-NEXT: vmovdqa 32(%rax), %xmm2
5608 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
5609 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7]
5610 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7]
5611 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4
5612 ; AVX-NEXT: vmovdqa 32(%r9), %xmm0
5613 ; AVX-NEXT: vmovdqa 32(%r8), %xmm3
5614 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
5615 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7]
5616 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7]
5617 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
5618 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
5619 ; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4
5620 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
5621 ; AVX-NEXT: vandps %ymm5, %ymm12, %ymm5
5622 ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm9
5623 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm4
5624 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
5625 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
5626 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
5627 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
5628 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5629 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm13
5630 ; AVX-NEXT: vmovdqa 32(%rcx), %xmm5
5631 ; AVX-NEXT: vmovdqa 32(%rdx), %xmm7
5632 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
5633 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,0,2,1,4,5,6,7]
5634 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
5635 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,2,2,3,4,5,6,7]
5636 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero
5637 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12
5638 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
5639 ; AVX-NEXT: vandps %ymm15, %ymm13, %ymm13
5640 ; AVX-NEXT: vandnps %ymm12, %ymm15, %ymm12
5641 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
5642 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
5643 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5644 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
5645 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7]
5646 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
5647 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5,5,7]
5648 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7]
5649 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
5650 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
5651 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
5652 ; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9
5653 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
5654 ; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8
5655 ; AVX-NEXT: vorps %ymm9, %ymm8, %ymm8
5656 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,4,6,5]
5657 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7]
5658 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
5659 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
5660 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
5661 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
5662 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5663 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
5664 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
5665 ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9
5666 ; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10
5667 ; AVX-NEXT: vorps %ymm9, %ymm10, %ymm9
5668 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
5669 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5670 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5671 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7]
5672 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7]
5673 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8
5674 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
5675 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7]
5676 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
5677 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
5678 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5]
5679 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
5680 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
5681 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
5682 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm3
5683 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
5684 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
5685 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
5686 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5687 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
5688 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
5689 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
5690 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
5691 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
5692 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
5693 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
5694 ; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4
5695 ; AVX-NEXT: vandnps %ymm6, %ymm15, %ymm6
5696 ; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4
5697 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
5698 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5699 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
5700 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
5701 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
5702 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
5703 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
5704 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
5705 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5706 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
5707 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
5708 ; AVX-NEXT: vandps %ymm2, %ymm12, %ymm2
5709 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
5710 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
5711 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7]
5712 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
5713 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
5714 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
5715 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5716 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5717 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
5718 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
5719 ; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2
5720 ; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
5721 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
5722 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
5723 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5724 ; AVX-NEXT: vmovdqa 16(%r10), %xmm9
5725 ; AVX-NEXT: vmovdqa 16(%rax), %xmm6
5726 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
5727 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
5728 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
5729 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
5730 ; AVX-NEXT: vmovdqa 16(%r9), %xmm8
5731 ; AVX-NEXT: vmovdqa 16(%r8), %xmm7
5732 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
5733 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7]
5734 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7]
5735 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
5736 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
5737 ; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2
5738 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
5739 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3
5740 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm11
5741 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm5
5742 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
5743 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
5744 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[1,1,1,1]
5745 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
5746 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5747 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm12
5748 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm4
5749 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm2
5750 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
5751 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,0,2,1,4,5,6,7]
5752 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
5753 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,2,2,3,4,5,6,7]
5754 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
5755 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
5756 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
5757 ; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12
5758 ; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14
5759 ; AVX-NEXT: vorps %ymm14, %ymm12, %ymm12
5760 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
5761 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,4,6,5]
5762 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
5763 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1
5764 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,5,7]
5765 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
5766 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
5767 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5768 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
5769 ; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
5770 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5771 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
5772 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
5773 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5]
5774 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7]
5775 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
5776 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,3,2,3]
5777 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
5778 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
5779 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
5780 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
5781 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
5782 ; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
5783 ; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10
5784 ; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1
5785 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
5786 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
5787 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7]
5788 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7]
5789 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
5790 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
5791 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7]
5792 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,3,3,4,5,6,7]
5793 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1
5794 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
5795 ; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0
5796 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
5797 ; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1
5798 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm1
5799 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
5800 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
5801 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
5802 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5803 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
5804 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
5805 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
5806 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
5807 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
5808 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
5809 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
5810 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
5811 ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4
5812 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
5813 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
5814 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5]
5815 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,6,6,7]
5816 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
5817 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,7]
5818 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,6,5,7,7]
5819 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
5820 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
5821 ; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3
5822 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
5823 ; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
5824 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
5825 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5]
5826 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
5827 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
5828 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
5829 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
5830 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5831 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5832 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
5833 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
5834 ; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2
5835 ; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
5836 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
5837 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
5838 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5839 ; AVX-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm3 # 16-byte Folded Reload
5840 ; AVX-NEXT: # xmm3 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
5841 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
5842 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
5843 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
5844 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5845 ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5846 ; AVX-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15]
5847 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
5848 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7]
5849 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
5850 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
5851 ; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2
5852 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
5853 ; AVX-NEXT: vandps %ymm5, %ymm14, %ymm5
5854 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm5
5855 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5856 ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5857 ; AVX-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
5858 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
5859 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
5860 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
5861 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
5862 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5863 ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
5864 ; AVX-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15]
5865 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
5866 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
5867 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7]
5868 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
5869 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
5870 ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6
5871 ; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm8
5872 ; AVX-NEXT: vorps %ymm6, %ymm8, %ymm6
5873 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
5874 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,6,5]
5875 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
5876 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
5877 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7]
5878 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
5879 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
5880 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
5881 ; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3
5882 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
5883 ; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
5884 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
5885 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,4,6,5]
5886 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7]
5887 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
5888 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
5889 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
5890 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
5891 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
5892 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
5893 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
5894 ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4
5895 ; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2
5896 ; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2
5897 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
5898 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5899 ; AVX-NEXT: vmovaps %ymm2, 96(%rax)
5900 ; AVX-NEXT: vmovaps %ymm5, 64(%rax)
5901 ; AVX-NEXT: vmovaps %ymm0, 160(%rax)
5902 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
5903 ; AVX-NEXT: vmovaps %ymm10, 224(%rax)
5904 ; AVX-NEXT: vmovaps %ymm11, 192(%rax)
5905 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5906 ; AVX-NEXT: vmovaps %ymm0, 288(%rax)
5907 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5908 ; AVX-NEXT: vmovaps %ymm0, 256(%rax)
5909 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5910 ; AVX-NEXT: vmovaps %ymm0, 352(%rax)
5911 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5912 ; AVX-NEXT: vmovaps %ymm0, 320(%rax)
5913 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5914 ; AVX-NEXT: vmovaps %ymm0, 416(%rax)
5915 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5916 ; AVX-NEXT: vmovaps %ymm0, 384(%rax)
5917 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5918 ; AVX-NEXT: vmovaps %ymm0, 480(%rax)
5919 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5920 ; AVX-NEXT: vmovaps %ymm0, 448(%rax)
5921 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5922 ; AVX-NEXT: vmovaps %ymm0, (%rax)
5923 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5924 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
5925 ; AVX-NEXT: addq $328, %rsp # imm = 0x148
5926 ; AVX-NEXT: vzeroupper
5927 ; AVX-NEXT: retq
5928 ;
5929 ; AVX2-LABEL: store_i8_stride8_vf64:
5930 ; AVX2: # %bb.0:
5931 ; AVX2-NEXT: subq $328, %rsp # imm = 0x148
5932 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5933 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
5934 ; AVX2-NEXT: vmovdqa (%rsi), %xmm0
5935 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5936 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
5937 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5938 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
5939 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
5940 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
5941 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
5942 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
5943 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5944 ; AVX2-NEXT: vmovdqa (%rcx), %xmm1
5945 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5946 ; AVX2-NEXT: vmovdqa (%rdx), %xmm3
5947 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5948 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5949 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
5950 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
5951 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
5952 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
5953 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
5954 ; AVX2-NEXT: vmovdqa (%r10), %xmm0
5955 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5956 ; AVX2-NEXT: vmovdqa (%rax), %xmm1
5957 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5958 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
5959 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5]
5960 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7]
5961 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5962 ; AVX2-NEXT: vmovdqa (%r9), %xmm0
5963 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5964 ; AVX2-NEXT: vmovdqa (%r8), %xmm6
5965 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5966 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
5967 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7]
5968 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7]
5969 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5970 ; AVX2-NEXT: vmovdqa 48(%rsi), %xmm0
5971 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6],ymm1[7],ymm7[8,9,10,11,12],ymm1[13],ymm7[14],ymm1[15]
5972 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
5973 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
5974 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
5975 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5976 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
5977 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
5978 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
5979 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5
5980 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
5981 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
5982 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
5983 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
5984 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3
5985 ; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2
5986 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
5987 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7]
5988 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
5989 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
5990 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
5991 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7]
5992 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
5993 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
5994 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
5995 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
5996 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5997 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
5998 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
5999 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,1,1]
6000 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
6001 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
6002 ; AVX2-NEXT: vmovdqa 48(%rdx), %xmm3
6003 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
6004 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
6005 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
6006 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7]
6007 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
6008 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
6009 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
6010 ; AVX2-NEXT: vmovdqa 48(%r10), %xmm4
6011 ; AVX2-NEXT: vmovdqa 48(%rax), %xmm5
6012 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6013 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
6014 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7]
6015 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12
6016 ; AVX2-NEXT: vmovdqa 48(%r9), %xmm6
6017 ; AVX2-NEXT: vmovdqa 48(%r8), %xmm7
6018 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
6019 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,1,3,4,5,6,7]
6020 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7]
6021 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13
6022 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5,6,7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13,14,15]
6023 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
6024 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
6025 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6026 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
6027 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
6028 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
6029 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6030 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8
6031 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5]
6032 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
6033 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
6034 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
6035 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15]
6036 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
6037 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
6038 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
6039 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,4,5,5,7]
6040 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,7,7]
6041 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
6042 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15]
6043 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
6044 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
6045 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6046 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6047 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
6048 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6049 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6050 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
6051 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6052 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
6053 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
6054 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
6055 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
6056 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
6057 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
6058 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6059 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
6060 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
6061 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
6062 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
6063 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7]
6064 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7]
6065 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
6066 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15]
6067 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
6068 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
6069 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6070 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6071 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6072 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
6073 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6074 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
6075 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
6076 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
6077 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
6078 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
6079 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
6080 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
6081 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
6082 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
6083 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7]
6084 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
6085 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6086 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15]
6087 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm0
6088 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
6089 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
6090 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6091 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
6092 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
6093 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1]
6094 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6095 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6096 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4
6097 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2
6098 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm3
6099 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
6100 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
6101 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
6102 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7]
6103 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
6104 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
6105 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
6106 ; AVX2-NEXT: vmovdqa 32(%r10), %xmm4
6107 ; AVX2-NEXT: vmovdqa 32(%rax), %xmm5
6108 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6109 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
6110 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7]
6111 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12
6112 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm6
6113 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm7
6114 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
6115 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7]
6116 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7]
6117 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14
6118 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5,6,7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13,14,15]
6119 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
6120 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
6121 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6122 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
6123 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
6124 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
6125 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6126 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8
6127 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5]
6128 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
6129 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
6130 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
6131 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15]
6132 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
6133 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
6134 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
6135 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7]
6136 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,7,7]
6137 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
6138 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15]
6139 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
6140 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
6141 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6142 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6143 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
6144 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6145 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6146 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
6147 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6148 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
6149 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
6150 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
6151 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
6152 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
6153 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
6154 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6155 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
6156 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
6157 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
6158 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
6159 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7]
6160 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7]
6161 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
6162 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15]
6163 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
6164 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
6165 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6166 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6167 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6168 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
6169 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6170 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
6171 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
6172 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
6173 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
6174 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
6175 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
6176 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
6177 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
6178 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
6179 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7]
6180 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
6181 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6182 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15]
6183 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
6184 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
6185 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
6186 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm14
6187 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12
6188 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
6189 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
6190 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6191 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6192 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
6193 ; AVX2-NEXT: vmovdqa 16(%rcx), %xmm11
6194 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm9
6195 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
6196 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7]
6197 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
6198 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7]
6199 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
6200 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6201 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
6202 ; AVX2-NEXT: vmovdqa 16(%r10), %xmm7
6203 ; AVX2-NEXT: vmovdqa 16(%rax), %xmm6
6204 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
6205 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7]
6206 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7]
6207 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
6208 ; AVX2-NEXT: vmovdqa 16(%r9), %xmm5
6209 ; AVX2-NEXT: vmovdqa 16(%r8), %xmm4
6210 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
6211 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
6212 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7]
6213 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm2
6214 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5,6,7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13,14,15]
6215 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
6216 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7]
6217 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3]
6218 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6219 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3]
6220 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6221 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6222 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
6223 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
6224 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
6225 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
6226 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
6227 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5]
6228 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7]
6229 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6230 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7]
6231 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
6232 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
6233 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15]
6234 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
6235 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6236 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
6237 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
6238 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6239 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6240 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
6241 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
6242 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
6243 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
6244 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7]
6245 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
6246 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3
6247 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
6248 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
6249 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7]
6250 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
6251 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
6252 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
6253 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
6254 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7]
6255 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
6256 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5,6,7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13,14,15]
6257 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
6258 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7]
6259 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
6260 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
6261 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
6262 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6263 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1
6264 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5]
6265 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
6266 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
6267 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
6268 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
6269 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5]
6270 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
6271 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
6272 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7]
6273 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
6274 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
6275 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6],ymm2[7],ymm3[8,9,10,11,12],ymm2[13],ymm3[14],ymm2[15]
6276 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
6277 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
6278 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6279 ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6280 ; AVX2-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
6281 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
6282 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6283 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6284 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
6285 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6286 ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
6287 ; AVX2-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15]
6288 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7]
6289 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
6290 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7]
6291 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
6292 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
6293 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15]
6294 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6295 ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6296 ; AVX2-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
6297 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
6298 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
6299 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
6300 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
6301 ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
6302 ; AVX2-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15]
6303 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,1,3,4,5,6,7]
6304 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7]
6305 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9
6306 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5,6,7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13,14,15]
6307 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
6308 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
6309 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
6310 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
6311 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
6312 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6313 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2
6314 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5]
6315 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
6316 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
6317 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
6318 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
6319 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,4,6,5]
6320 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
6321 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
6322 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7]
6323 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7]
6324 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
6325 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6],ymm4[7],ymm5[8,9,10,11,12],ymm4[13],ymm5[14],ymm4[15]
6326 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
6327 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
6328 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6329 ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax)
6330 ; AVX2-NEXT: vmovdqa %ymm3, 64(%rax)
6331 ; AVX2-NEXT: vmovdqa %ymm1, 160(%rax)
6332 ; AVX2-NEXT: vmovdqa %ymm0, 128(%rax)
6333 ; AVX2-NEXT: vmovdqa %ymm8, 224(%rax)
6334 ; AVX2-NEXT: vmovdqa %ymm10, 192(%rax)
6335 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
6336 ; AVX2-NEXT: vmovaps %ymm0, 288(%rax)
6337 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6338 ; AVX2-NEXT: vmovaps %ymm0, 256(%rax)
6339 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6340 ; AVX2-NEXT: vmovaps %ymm0, 352(%rax)
6341 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6342 ; AVX2-NEXT: vmovaps %ymm0, 320(%rax)
6343 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6344 ; AVX2-NEXT: vmovaps %ymm0, 416(%rax)
6345 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6346 ; AVX2-NEXT: vmovaps %ymm0, 384(%rax)
6347 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6348 ; AVX2-NEXT: vmovaps %ymm0, 480(%rax)
6349 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6350 ; AVX2-NEXT: vmovaps %ymm0, 448(%rax)
6351 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6352 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
6353 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6354 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
6355 ; AVX2-NEXT: addq $328, %rsp # imm = 0x148
6356 ; AVX2-NEXT: vzeroupper
6357 ; AVX2-NEXT: retq
6358 ;
6359 ; AVX2-FP-LABEL: store_i8_stride8_vf64:
6360 ; AVX2-FP: # %bb.0:
6361 ; AVX2-FP-NEXT: subq $392, %rsp # imm = 0x188
6362 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6363 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
6364 ; AVX2-FP-NEXT: vmovdqa (%r10), %xmm1
6365 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6366 ; AVX2-FP-NEXT: vmovdqa (%rax), %xmm0
6367 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6368 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
6369 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6370 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31]
6371 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3
6372 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6373 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1
6374 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6375 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6376 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6377 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31]
6378 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
6379 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4
6380 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6381 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
6382 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6383 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
6384 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6385 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5
6386 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6387 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4
6388 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6389 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
6390 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5
6391 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
6392 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
6393 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
6394 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
6395 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6396 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6397 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6398 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6399 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
6400 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
6401 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798]
6402 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm1
6403 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
6404 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
6405 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6406 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6407 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
6408 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6409 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6410 ; AVX2-FP-NEXT: vmovdqa 48(%r10), %xmm8
6411 ; AVX2-FP-NEXT: vmovdqa 48(%rax), %xmm3
6412 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
6413 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6414 ; AVX2-FP-NEXT: vmovdqa 48(%r9), %xmm4
6415 ; AVX2-FP-NEXT: vmovdqa 48(%r8), %xmm5
6416 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6417 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6418 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm6, %ymm0
6419 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm2
6420 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
6421 ; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm10
6422 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm9
6423 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
6424 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm1
6425 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6426 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1
6427 ; AVX2-FP-NEXT: vmovdqa 48(%rcx), %xmm13
6428 ; AVX2-FP-NEXT: vmovdqa 48(%rdx), %xmm2
6429 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
6430 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
6431 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm15, %ymm14
6432 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15]
6433 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7]
6434 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6435 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
6436 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm1
6437 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6438 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm6
6439 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
6440 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6441 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm6
6442 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6443 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
6444 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
6445 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15]
6446 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
6447 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6448 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
6449 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6450 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6451 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6452 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6453 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm3
6454 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6455 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm4
6456 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
6457 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
6458 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
6459 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798]
6460 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm5
6461 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
6462 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
6463 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6464 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
6465 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
6466 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
6467 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6468 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
6469 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
6470 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
6471 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
6472 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2
6473 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6474 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
6475 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6476 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6477 ; AVX2-FP-NEXT: vmovdqa 32(%r10), %xmm1
6478 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %xmm3
6479 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
6480 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6481 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm4
6482 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm5
6483 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6484 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6485 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm6, %ymm0
6486 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm2
6487 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
6488 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm0
6489 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
6490 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
6491 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm10
6492 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
6493 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10
6494 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm11
6495 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm13
6496 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
6497 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14
6498 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
6499 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15]
6500 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
6501 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6502 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm10
6503 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
6504 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6505 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm7
6506 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
6507 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6508 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm7
6509 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8
6510 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
6511 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
6512 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15]
6513 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
6514 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6515 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
6516 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6517 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6518 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6519 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6520 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm4
6521 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6522 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm5
6523 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
6524 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
6525 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
6526 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798]
6527 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm2
6528 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6529 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2
6530 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
6531 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6532 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
6533 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15]
6534 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
6535 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6536 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
6537 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm2
6538 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
6539 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm2
6540 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6541 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
6542 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
6543 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
6544 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6545 ; AVX2-FP-NEXT: vmovdqa 16(%r10), %xmm4
6546 ; AVX2-FP-NEXT: vmovdqa 16(%rax), %xmm2
6547 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
6548 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6549 ; AVX2-FP-NEXT: vmovdqa 16(%r9), %xmm1
6550 ; AVX2-FP-NEXT: vmovdqa 16(%r8), %xmm0
6551 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
6552 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8
6553 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm6, %ymm3
6554 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm5
6555 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
6556 ; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm5
6557 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
6558 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
6559 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm9, %xmm10
6560 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
6561 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10
6562 ; AVX2-FP-NEXT: vmovdqa 16(%rcx), %xmm13
6563 ; AVX2-FP-NEXT: vmovdqa 16(%rdx), %xmm14
6564 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
6565 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
6566 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6567 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
6568 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15]
6569 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7]
6570 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6571 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
6572 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
6573 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6574 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
6575 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15]
6576 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6577 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm8
6578 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9
6579 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854]
6580 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
6581 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15]
6582 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
6583 ; AVX2-FP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill
6584 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
6585 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
6586 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1
6587 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6588 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6589 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
6590 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6591 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm2, %ymm4
6592 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15]
6593 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
6594 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6595 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798]
6596 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm5
6597 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6598 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5
6599 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
6600 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6601 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm8
6602 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15]
6603 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
6604 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
6605 ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15
6606 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
6607 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
6608 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm4, %ymm2
6609 ; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm7
6610 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6611 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
6612 ; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11
6613 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15]
6614 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
6615 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6616 ; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6617 ; AVX2-FP-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
6618 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6619 ; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6620 ; AVX2-FP-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15]
6621 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6622 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm4
6623 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6624 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm5
6625 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
6626 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6627 ; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6628 ; AVX2-FP-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
6629 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6630 ; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
6631 ; AVX2-FP-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15]
6632 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm9
6633 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6634 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
6635 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
6636 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm5, %ymm10
6637 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15]
6638 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
6639 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6640 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
6641 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
6642 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm3
6643 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5
6644 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
6645 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
6646 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm1, 160(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm0, 128(%rax)
; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FP-NEXT: addq $392, %rsp # imm = 0x188
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride8_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $392, %rsp # imm = 0x188
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6689 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
6690 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6691 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31]
6692 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3
6693 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6694 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
6695 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6696 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6697 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6698 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31]
6699 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
6700 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm4
6701 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6702 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
6703 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6704 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
6705 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6706 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5
6707 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6708 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4
6709 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6710 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
6711 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5
6712 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u]
6713 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31]
6714 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
6715 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
6716 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6717 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6718 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6719 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6720 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
6721 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
6722 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798]
6723 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1
6724 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
6725 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
6726 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6727 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6728 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
6729 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6730 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6731 ; AVX2-FCP-NEXT: vmovdqa 48(%r10), %xmm8
6732 ; AVX2-FCP-NEXT: vmovdqa 48(%rax), %xmm3
6733 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
6734 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6735 ; AVX2-FCP-NEXT: vmovdqa 48(%r9), %xmm4
6736 ; AVX2-FCP-NEXT: vmovdqa 48(%r8), %xmm5
6737 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6738 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6739 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm0
6740 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2
6741 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
6742 ; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm10
6743 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9
6744 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
6745 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm1
6746 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6747 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1
6748 ; AVX2-FCP-NEXT: vmovdqa 48(%rcx), %xmm13
6749 ; AVX2-FCP-NEXT: vmovdqa 48(%rdx), %xmm2
6750 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
6751 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
6752 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm14
6753 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15]
6754 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7]
6755 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6756 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
6757 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm1
6758 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6759 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm6
6760 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
6761 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6762 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6
6763 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6764 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
6765 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
6766 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15]
6767 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
6768 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6769 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
6770 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6771 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6772 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6773 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6774 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3
6775 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6776 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm4
6777 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
6778 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
6779 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
6780 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798]
6781 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm5
6782 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
6783 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
6784 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6785 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
6786 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
6787 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
6788 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6789 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
6790 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
6791 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
6792 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
6793 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2
6794 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6795 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
6796 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6797 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6798 ; AVX2-FCP-NEXT: vmovdqa 32(%r10), %xmm1
6799 ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm3
6800 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
6801 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6802 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm4
6803 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm5
6804 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
6805 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6806 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm0
6807 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm2
6808 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
6809 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
6810 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
6811 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
6812 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm10
6813 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
6814 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10
6815 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm11
6816 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm13
6817 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
6818 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14
6819 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31]
6820 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15]
6821 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
6822 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6823 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm10
6824 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
6825 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6826 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7
6827 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
6828 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6829 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm7
6830 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8
6831 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
6832 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
6833 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15]
6834 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
6835 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6836 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
6837 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6838 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
6839 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6840 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6841 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm4
6842 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6843 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5
6844 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
6845 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
6846 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
6847 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798]
6848 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm2
6849 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6850 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2
6851 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
6852 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6853 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
6854 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15]
6855 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
6856 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6857 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
6858 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2
6859 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
6860 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2
6861 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6862 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
6863 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
6864 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
6865 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6866 ; AVX2-FCP-NEXT: vmovdqa 16(%r10), %xmm4
6867 ; AVX2-FCP-NEXT: vmovdqa 16(%rax), %xmm2
6868 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
6869 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6870 ; AVX2-FCP-NEXT: vmovdqa 16(%r9), %xmm1
6871 ; AVX2-FCP-NEXT: vmovdqa 16(%r8), %xmm0
6872 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
6873 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8
6874 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm3
6875 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm5
6876 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
6877 ; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm5
6878 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
6879 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
6880 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm10
6881 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
6882 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10
6883 ; AVX2-FCP-NEXT: vmovdqa 16(%rcx), %xmm13
6884 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm14
6885 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
6886 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
6887 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6888 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
6889 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15]
6890 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7]
6891 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6892 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
6893 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
6894 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
6895 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
6896 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15]
6897 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
6898 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm8
6899 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9
6900 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854]
6901 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
6902 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15]
6903 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
6904 ; AVX2-FCP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill
6905 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
6906 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
6907 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1
6908 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6909 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
6910 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
6911 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
6912 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4
6913 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15]
6914 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
6915 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6916 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798]
6917 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm5
6918 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6919 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5
6920 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
6921 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
6922 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm8
6923 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15]
6924 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
6925 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
6926 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm15
6927 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
6928 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
6929 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm2
6930 ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm7
6931 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6932 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
6933 ; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm11
6934 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15]
6935 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
6936 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6937 ; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6938 ; AVX2-FCP-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
6939 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6940 ; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6941 ; AVX2-FCP-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15]
6942 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
6943 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4
6944 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3
6945 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm5
6946 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
6947 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6948 ; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6949 ; AVX2-FCP-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
6950 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6951 ; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
6952 ; AVX2-FCP-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15]
6953 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm9
6954 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
6955 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
6956 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5
6957 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm10
6958 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15]
6959 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
6960 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6961 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
6962 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
6963 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm3
6964 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5
6965 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
6966 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
6967 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 160(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm0, 128(%rax)
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT: addq $392, %rsp # imm = 0x188
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride8_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%rcx), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10
; AVX512-NEXT: vmovdqa 48(%rcx), %xmm2
; AVX512-NEXT: vmovdqa (%rdx), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 48(%rdx), %xmm3
7013 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7014 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7015 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
7016 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
7017 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7018 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7019 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7020 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7021 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7022 ; AVX512-NEXT: vmovdqa (%r10), %xmm1
7023 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7024 ; AVX512-NEXT: vmovdqa 48(%r10), %xmm4
7025 ; AVX512-NEXT: vmovdqa (%rax), %xmm0
7026 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7027 ; AVX512-NEXT: vmovdqa 48(%rax), %xmm5
7028 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7029 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7030 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
7031 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
7032 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7033 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7034 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7035 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7036 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7037 ; AVX512-NEXT: vmovdqa (%r9), %xmm1
7038 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7039 ; AVX512-NEXT: vmovdqa 48(%r9), %xmm7
7040 ; AVX512-NEXT: vmovdqa (%r8), %xmm0
7041 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7042 ; AVX512-NEXT: vmovdqa 48(%r8), %xmm12
7043 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7044 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7045 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7]
7046 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1
7047 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7048 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
7049 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7050 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7051 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7052 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7053 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7054 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7]
7055 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1
7056 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7057 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7058 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7059 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9
7060 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
7061 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7062 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7]
7063 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1
7064 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7065 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7066 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7067 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7068 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7069 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
7070 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7071 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7]
7072 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1
7073 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7074 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1
7075 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7]
7076 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7077 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
7078 ; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
7079 ; AVX512-NEXT: vmovdqa 32(%r10), %xmm0
7080 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
7081 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
7082 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7]
7083 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11
7084 ; AVX512-NEXT: vmovdqa 32(%rax), %xmm2
7085 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5]
7086 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7087 ; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31
7088 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
7089 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
7090 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
7091 ; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28
7092 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
7093 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7094 ; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23
7095 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15]
7096 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
7097 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7]
7098 ; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21
7099 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
7100 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
7101 ; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20
7102 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
7103 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
7104 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
7105 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30
7106 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5]
7107 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7108 ; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29
7109 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
7110 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
7111 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
7112 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24
7113 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5]
7114 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7115 ; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22
7116 ; AVX512-NEXT: vmovdqa 32(%r9), %xmm3
7117 ; AVX512-NEXT: vmovdqa 32(%r8), %xmm5
7118 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
7119 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7]
7120 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7]
7121 ; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19
7122 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7]
7123 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7]
7124 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18
7125 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
7126 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7]
7127 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
7128 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4
7129 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7130 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5]
7131 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
7132 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1
7133 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7134 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
7135 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7136 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
7137 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
7138 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7139 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7140 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7141 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7142 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7143 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
7144 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7145 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7]
7146 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
7147 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7148 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
7149 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7150 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7151 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7152 ; AVX512-NEXT: vmovdqa 16(%rcx), %xmm14
7153 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm12
7154 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
7155 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7156 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
7157 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
7158 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7159 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7160 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7161 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7162 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7163 ; AVX512-NEXT: vmovdqa 16(%r10), %xmm1
7164 ; AVX512-NEXT: vmovdqa 16(%rax), %xmm2
7165 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
7166 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm26
7167 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17
7168 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7169 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
7170 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
7171 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7172 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7173 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7174 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7175 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7176 ; AVX512-NEXT: vmovdqa 16(%r9), %xmm0
7177 ; AVX512-NEXT: vmovdqa 16(%r8), %xmm15
7178 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
7179 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
7180 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7]
7181 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7]
7182 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0
7183 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7184 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
7185 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2
7186 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
7187 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25
7188 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm27
7189 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
7190 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
7191 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7192 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
7193 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm2
7194 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm1
7195 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
7196 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
7197 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
7198 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
7199 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7200 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7201 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
7202 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
7203 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
7204 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
7205 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
7206 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
7207 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
7208 ; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
7209 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7210 ; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7]
7211 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10
7212 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
7213 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3))
7214 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
7215 ; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5]
7216 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
7217 ; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7]
7218 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4
7219 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
7220 ; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5]
7221 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7222 ; AVX512-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7]
7223 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
7224 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
7225 ; AVX512-NEXT: vpandnq %zmm4, %zmm3, %zmm4
7226 ; AVX512-NEXT: vpandq %zmm3, %zmm5, %zmm5
7227 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
7228 ; AVX512-NEXT: kmovw %eax, %k1
7229 ; AVX512-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1}
7230 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
7231 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
7232 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
7233 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7234 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
7235 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
7236 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
7237 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
7238 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7]
7239 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9
7240 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0))
7241 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
7242 ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
7243 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
7244 ; AVX512-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7]
7245 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
7246 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
7247 ; AVX512-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5]
7248 ; AVX512-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload
7249 ; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7]
7250 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
7251 ; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0
7252 ; AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
7253 ; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1}
7254 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
7255 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7256 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
7257 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
7258 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
7259 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
7260 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
7261 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
7262 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
7263 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7264 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7]
7265 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7]
7266 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11
7267 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0))
7268 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5]
7269 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
7270 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
7271 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5]
7272 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7]
7273 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
7274 ; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0
7275 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
7276 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1}
7277 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7]
7278 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7]
7279 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
7280 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7281 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
7282 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7]
7283 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
7284 ; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28
7285 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5
7286 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
7287 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
7288 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
7289 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
7290 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
7291 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
7292 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
7293 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7294 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7295 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7296 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0
7297 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
7298 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7]
7299 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7]
7300 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
7301 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0))
7302 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5]
7303 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7]
7304 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
7305 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5]
7306 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7]
7307 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
7308 ; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0
7309 ; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6
7310 ; AVX512-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1}
7311 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5]
7312 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
7313 ; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18
7314 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0
7315 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm2
7316 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
7317 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7]
7318 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
7319 ; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21
7320 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
7321 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7322 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22
7323 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0
7324 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
7325 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7]
7326 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7]
7327 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26
7328 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7]
7329 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7]
7330 ; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19
7331 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7332 ; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
7333 ; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7334 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
7335 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7]
7336 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14
7337 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5]
7338 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
7339 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15
7340 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7341 ; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
7342 ; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7343 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
7344 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7]
7345 ; AVX512-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17
7346 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5]
7347 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
7348 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20
7349 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
7350 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7351 ; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
7352 ; AVX512-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7353 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7]
7354 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7]
7355 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8
7356 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7]
7357 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7]
7358 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1
7359 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm7
7360 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
7361 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
7362 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15]
7363 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3]
7364 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
7365 ; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3]
7366 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
7367 ; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12
7368 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
7369 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
7370 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
7371 ; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5
7372 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12
7373 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7374 ; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
7375 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
7376 ; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7]
7377 ; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5
7378 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
7379 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
7380 ; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5]
7381 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
7382 ; AVX512-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7]
7383 ; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12
7384 ; AVX512-NEXT: vpandnq %zmm12, %zmm3, %zmm12
7385 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
7386 ; AVX512-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5]
7387 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
7388 ; AVX512-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7]
7389 ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23
7390 ; AVX512-NEXT: vpandq %zmm3, %zmm23, %zmm23
7391 ; AVX512-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1}
7392 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3]
7393 ; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
7394 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7395 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7396 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2
7397 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6
7398 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7399 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
7400 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
7401 ; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12
7402 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7403 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0
7404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3]
7405 ; AVX512-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3]
7406 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
7407 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
7408 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0
7409 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
7410 ; AVX512-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
7411 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
7412 ; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7]
7413 ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12
7414 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0))
7415 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
7416 ; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3]
7417 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
7418 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
7419 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
7420 ; AVX512-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5]
7421 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
7422 ; AVX512-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7]
7423 ; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25
7424 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
7425 ; AVX512-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5]
7426 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
7427 ; AVX512-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7]
7428 ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16
7429 ; AVX512-NEXT: vpandnq %zmm25, %zmm3, %zmm25
7430 ; AVX512-NEXT: vpandq %zmm3, %zmm16, %zmm16
7431 ; AVX512-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1}
7432 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
7433 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
7434 ; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16
7435 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
7436 ; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7
7437 ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7
7438 ; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7]
7439 ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7]
7440 ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
7441 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7))
7442 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5]
7443 ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7]
7444 ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7
7445 ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5]
7446 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7]
7447 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6
7448 ; AVX512-NEXT: vpandnq %zmm7, %zmm3, %zmm7
7449 ; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6
7450 ; AVX512-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1}
7451 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7452 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero
7453 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
7454 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
7455 ; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
7456 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7457 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7]
7458 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7]
7459 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
7460 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0))
7461 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5]
7462 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7]
7463 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
7464 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5]
7465 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
7466 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
7467 ; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0
7468 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
7469 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1}
7470 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7471 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
7472 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax)
7473 ; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax)
7474 ; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax)
7475 ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax)
7476 ; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rax)
7477 ; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rax)
7478 ; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rax)
7479 ; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8
7480 ; AVX512-NEXT: vzeroupper
7481 ; AVX512-NEXT: retq
7482 ;
7483 ; AVX512-FCP-LABEL: store_i8_stride8_vf64:
7484 ; AVX512-FCP: # %bb.0:
7485 ; AVX512-FCP-NEXT: subq $392, %rsp # imm = 0x188
7486 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7487 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7488 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2
7489 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7490 ; AVX512-FCP-NEXT: vmovdqa 48(%rcx), %xmm0
7491 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15
7492 ; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm1
7493 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15]
7494 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7495 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
7496 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
7497 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
7498 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
7499 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
7500 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7501 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2
7502 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7503 ; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm14
7504 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
7505 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7506 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
7507 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3
7508 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
7509 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
7510 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19
7511 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
7512 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7513 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
7514 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7515 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
7516 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7517 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
7518 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
7519 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
7520 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
7521 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21
7522 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
7523 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7524 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
7525 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
7526 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
7527 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7528 ; AVX512-FCP-NEXT: vmovdqa 48(%r10), %xmm2
7529 ; AVX512-FCP-NEXT: vmovdqa 48(%rax), %xmm3
7530 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7531 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
7532 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
7533 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
7534 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
7535 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7536 ; AVX512-FCP-NEXT: vmovdqa 48(%r9), %xmm4
7537 ; AVX512-FCP-NEXT: vmovdqa 48(%r8), %xmm5
7538 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
7539 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
7540 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
7541 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
7542 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7
7543 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7544 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
7545 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7546 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
7547 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
7548 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
7549 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7550 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
7551 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7552 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
7553 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
7554 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
7555 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
7556 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0
7557 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
7558 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
7559 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7560 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
7561 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
7562 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
7563 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7564 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
7565 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7566 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
7567 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
7568 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20
7569 ; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm2
7570 ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3
7571 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7572 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
7573 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
7574 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
7575 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22
7576 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm4
7577 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5
7578 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
7579 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
7580 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
7581 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
7582 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23
7583 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
7584 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7585 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
7586 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
7587 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24
7588 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
7589 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7590 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
7591 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
7592 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25
7593 ; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm0
7594 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm1
7595 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
7596 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7597 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
7598 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
7599 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26
7600 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
7601 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
7602 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
7603 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
7604 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27
7605 ; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm2
7606 ; AVX512-FCP-NEXT: vmovdqa 16(%rax), %xmm3
7607 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7608 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
7609 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
7610 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
7611 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28
7612 ; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm4
7613 ; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm5
7614 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
7615 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
7616 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
7617 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
7618 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29
7619 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
7620 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7621 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
7622 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
7623 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30
7624 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
7625 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7626 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
7627 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
7628 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31
7629 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
7630 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7631 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1
7632 ; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
7633 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
7634 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm18
7635 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
7636 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12
7637 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
7638 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
7639 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
7640 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7641 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2
7642 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
7643 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7644 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
7645 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
7646 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3
7647 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
7648 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
7649 ; AVX512-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem))
7650 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
7651 ; AVX512-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19
7652 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21
7653 ; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA
7654 ; AVX512-FCP-NEXT: kmovw %eax, %k1
7655 ; AVX512-FCP-NEXT: vpord %zmm19, %zmm21, %zmm3 {%k1}
7656 ; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
7657 ; AVX512-FCP-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
7658 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
7659 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6
7660 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8
7661 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19
7662 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm15
7663 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
7664 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
7665 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm0
7666 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
7667 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm21, %ymm0
7668 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6
7669 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
7670 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
7671 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15]
7672 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8
7673 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7674 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8
7675 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7676 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
7677 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm15
7678 ; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
7679 ; AVX512-FCP-NEXT: # xmm0 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
7680 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7681 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm8
7682 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
7683 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm13
7684 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
7685 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm8
7686 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
7687 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
7688 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
7689 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14
7690 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11
7691 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
7692 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11
7693 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
7694 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8
7695 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7696 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8
7697 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7698 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
7699 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm14
7700 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7701 ; AVX512-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7702 ; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7703 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7704 ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
7705 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm8
7706 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
7707 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
7708 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm21
7709 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm8
7710 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm10
7711 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
7712 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm1
7713 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7714 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm1, %ymm17, %ymm1
7715 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7716 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
7717 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7718 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
7719 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8
7720 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
7721 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8
7722 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
7723 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
7724 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1
7725 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
7726 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9
7727 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
7728 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
7729 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
7730 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5
7731 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
7732 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload
7733 ; AVX512-FCP-NEXT: # zmm6 = mem ^ (zmm7 & (zmm6 ^ mem))
7734 ; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
7735 ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
7736 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1}
7737 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload
7738 ; AVX512-FCP-NEXT: # zmm15 = mem ^ (zmm7 & (zmm15 ^ mem))
7739 ; AVX512-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload
7740 ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
7741 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1}
7742 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm7 & (zmm11 ^ zmm20))
7743 ; AVX512-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5
7744 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8
7745 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1}
7746 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm24 ^ (zmm7 & (zmm14 ^ zmm24))
7747 ; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5
7748 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8
7749 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1}
7750 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm27 ^ (zmm7 & (zmm0 ^ zmm27))
7751 ; AVX512-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5
7752 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8
7753 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1}
7754 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm7 & (zmm1 ^ zmm30))
7755 ; AVX512-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5
7756 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8
7757 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1}
7758 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm19 ^ (zmm7 & (zmm4 ^ zmm19))
7759 ; AVX512-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5
7760 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2
7761 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1}
7762 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7763 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
7764 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
7765 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
7766 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax)
7767 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax)
7768 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax)
7769 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
7770 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
7771 ; AVX512-FCP-NEXT: addq $392, %rsp # imm = 0x188
7772 ; AVX512-FCP-NEXT: vzeroupper
7773 ; AVX512-FCP-NEXT: retq
7774 ;
7775 ; AVX512DQ-LABEL: store_i8_stride8_vf64:
7776 ; AVX512DQ: # %bb.0:
7777 ; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8
7778 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
7779 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
7780 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
7781 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7782 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10
7783 ; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm2
7784 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0
7785 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7786 ; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm3
7787 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7788 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7789 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
7790 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
7791 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7792 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7793 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7794 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7795 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7796 ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1
7797 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7798 ; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm4
7799 ; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0
7800 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7801 ; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm5
7802 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7803 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7804 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
7805 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
7806 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7807 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7808 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7809 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7810 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7811 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1
7812 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7813 ; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm7
7814 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0
7815 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7816 ; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm12
7817 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
7818 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7819 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7]
7820 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1
7821 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7822 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
7823 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7824 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7825 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7826 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7827 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7828 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7]
7829 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1
7830 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7831 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7832 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7833 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9
7834 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
7835 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7836 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7]
7837 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1
7838 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7839 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7840 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7841 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7842 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7843 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
7844 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7845 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7]
7846 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1
7847 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7848 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm1
7849 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7]
7850 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7851 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
7852 ; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
7853 ; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm0
7854 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
7855 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
7856 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7]
7857 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11
7858 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm2
7859 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5]
7860 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7861 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31
7862 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
7863 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
7864 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
7865 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28
7866 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
7867 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7868 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23
7869 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15]
7870 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
7871 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7]
7872 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21
7873 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
7874 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
7875 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20
7876 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
7877 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
7878 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
7879 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30
7880 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5]
7881 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7882 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29
7883 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
7884 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
7885 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
7886 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24
7887 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5]
7888 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
7889 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22
7890 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm3
7891 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5
7892 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
7893 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7]
7894 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7]
7895 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19
7896 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7]
7897 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7]
7898 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18
7899 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
7900 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7]
7901 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
7902 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4
7903 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7904 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5]
7905 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
7906 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1
7907 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7908 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
7909 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7910 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
7911 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
7912 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7913 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7914 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7915 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7916 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7917 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
7918 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7]
7919 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7]
7920 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
7921 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7922 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
7923 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
7924 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7925 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7926 ; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm14
7927 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm12
7928 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
7929 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7930 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
7931 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
7932 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7933 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7934 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7935 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7936 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7937 ; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm1
7938 ; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm2
7939 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
7940 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm26
7941 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17
7942 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
7943 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
7944 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
7945 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7946 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
7947 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
7948 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
7949 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7950 ; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm0
7951 ; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm15
7952 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
7953 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
7954 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7]
7955 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7]
7956 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0
7957 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7958 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
7959 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
7960 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
7961 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25
7962 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm27
7963 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
7964 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
7965 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7966 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
7967 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm2
7968 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
7969 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
7970 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
7971 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
7972 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
7973 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
7974 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7975 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
7976 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
7977 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
7978 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
7979 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
7980 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
7981 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
7982 ; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
7983 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7984 ; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7]
7985 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10
7986 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
7987 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3))
7988 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
7989 ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5]
7990 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
7991 ; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7]
7992 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4
7993 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
7994 ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5]
7995 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7996 ; AVX512DQ-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7]
7997 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
7998 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
7999 ; AVX512DQ-NEXT: vpandnq %zmm4, %zmm3, %zmm4
8000 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm5, %zmm5
8001 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA
8002 ; AVX512DQ-NEXT: kmovw %eax, %k1
8003 ; AVX512DQ-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1}
8004 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
8005 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
8006 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
8007 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8008 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
8009 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8010 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
8011 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
8012 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7]
8013 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9
8014 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0))
8015 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
8016 ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
8017 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
8018 ; AVX512DQ-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7]
8019 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8020 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
8021 ; AVX512DQ-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5]
8022 ; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload
8023 ; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7]
8024 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
8025 ; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0
8026 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm2, %zmm2
8027 ; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1}
8028 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
8029 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8030 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
8031 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
8032 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
8033 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
8034 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
8035 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
8036 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
8037 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8038 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7]
8039 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7]
8040 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11
8041 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0))
8042 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5]
8043 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
8044 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8045 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5]
8046 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7]
8047 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
8048 ; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0
8049 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1
8050 ; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1}
8051 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7]
8052 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7]
8053 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
8054 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8055 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
8056 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7]
8057 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
8058 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28
8059 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5
8060 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
8061 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
8062 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
8063 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
8064 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
8065 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
8066 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
8067 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8068 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
8069 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8070 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0
8071 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
8072 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7]
8073 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7]
8074 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
8075 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0))
8076 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5]
8077 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7]
8078 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
8079 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5]
8080 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7]
8081 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
8082 ; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0
8083 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6
8084 ; AVX512DQ-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1}
8085 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5]
8086 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8087 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18
8088 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0
8089 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm2
8090 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
8091 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7]
8092 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
8093 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21
8094 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
8095 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
8096 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22
8097 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0
8098 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
8099 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7]
8100 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7]
8101 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26
8102 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7]
8103 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7]
8104 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19
8105 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8106 ; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
8107 ; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8108 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
8109 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7]
8110 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14
8111 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5]
8112 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
8113 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15
8114 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8115 ; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
8116 ; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8117 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
8118 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7]
8119 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17
8120 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5]
8121 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
8122 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20
8123 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
8124 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8125 ; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
8126 ; AVX512DQ-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8127 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7]
8128 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7]
8129 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8
8130 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7]
8131 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7]
8132 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1
8133 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm7
8134 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
8135 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
8136 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15]
8137 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3]
8138 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
8139 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3]
8140 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
8141 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12
8142 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
8143 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
8144 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
8145 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5
8146 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12
8147 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
8148 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
8149 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
8150 ; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7]
8151 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5
8152 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
8153 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
8154 ; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5]
8155 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
8156 ; AVX512DQ-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7]
8157 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12
8158 ; AVX512DQ-NEXT: vpandnq %zmm12, %zmm3, %zmm12
8159 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
8160 ; AVX512DQ-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5]
8161 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
8162 ; AVX512DQ-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7]
8163 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23
8164 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm23, %zmm23
8165 ; AVX512DQ-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1}
8166 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3]
8167 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
8168 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8169 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
8170 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2
8171 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6
8172 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8173 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
8174 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
8175 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12
8176 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8177 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0
8178 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3]
8179 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3]
8180 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
8181 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
8182 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0
8183 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
8184 ; AVX512DQ-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
8185 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
8186 ; AVX512DQ-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7]
8187 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12
8188 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0))
8189 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
8190 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3]
8191 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
8192 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8193 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
8194 ; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5]
8195 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
8196 ; AVX512DQ-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7]
8197 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25
8198 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
8199 ; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5]
8200 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
8201 ; AVX512DQ-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7]
8202 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16
8203 ; AVX512DQ-NEXT: vpandnq %zmm25, %zmm3, %zmm25
8204 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm16, %zmm16
8205 ; AVX512DQ-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1}
8206 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
8207 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
8208 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16
8209 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
8210 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7
8211 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7
8212 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7]
8213 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7]
8214 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
8215 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7))
8216 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5]
8217 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7]
8218 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7
8219 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5]
8220 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7]
8221 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6
8222 ; AVX512DQ-NEXT: vpandnq %zmm7, %zmm3, %zmm7
8223 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6
8224 ; AVX512DQ-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1}
8225 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8226 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero
8227 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
8228 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
8229 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
8230 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
8231 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7]
8232 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7]
8233 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
8234 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0))
8235 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5]
8236 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7]
8237 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
8238 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5]
8239 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
8240 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
8241 ; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0
8242 ; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1
8243 ; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1}
8244 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
8245 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
8246 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax)
8247 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax)
8248 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax)
8249 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax)
8250 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rax)
8251 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rax)
8252 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rax)
8253 ; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8
8254 ; AVX512DQ-NEXT: vzeroupper
8255 ; AVX512DQ-NEXT: retq
8256 ;
8257 ; AVX512DQ-FCP-LABEL: store_i8_stride8_vf64:
8258 ; AVX512DQ-FCP: # %bb.0:
8259 ; AVX512DQ-FCP-NEXT: subq $392, %rsp # imm = 0x188
8260 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8261 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8262 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2
8263 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8264 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rcx), %xmm0
8265 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15
8266 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm1
8267 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15]
8268 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8269 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
8270 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
8271 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
8272 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
8273 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
8274 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8275 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2
8276 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8277 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm14
8278 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
8279 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8280 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
8281 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3
8282 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
8283 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
8284 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19
8285 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
8286 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8287 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
8288 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8289 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
8290 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8291 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
8292 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
8293 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
8294 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
8295 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21
8296 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8297 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8298 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
8299 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
8300 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
8301 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8302 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%r10), %xmm2
8303 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rax), %xmm3
8304 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
8305 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
8306 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
8307 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
8308 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
8309 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8310 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%r9), %xmm4
8311 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%r8), %xmm5
8312 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8313 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
8314 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
8315 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
8316 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7
8317 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8318 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
8319 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8320 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
8321 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
8322 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8323 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8324 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
8325 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8326 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
8327 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
8328 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8329 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
8330 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0
8331 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
8332 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
8333 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8334 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
8335 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
8336 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
8337 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8338 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8339 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8340 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
8341 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
8342 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20
8343 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm2
8344 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3
8345 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
8346 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
8347 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
8348 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
8349 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22
8350 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm4
8351 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5
8352 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8353 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
8354 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
8355 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
8356 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23
8357 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
8358 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8359 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
8360 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
8361 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24
8362 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
8363 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8364 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
8365 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
8366 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25
8367 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm0
8368 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm1
8369 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
8370 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8371 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
8372 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
8373 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26
8374 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8375 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
8376 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm3
8377 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
8378 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27
8379 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r10), %xmm2
8380 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rax), %xmm3
8381 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
8382 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4
8383 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
8384 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
8385 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28
8386 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r9), %xmm4
8387 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%r8), %xmm5
8388 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8389 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7
8390 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13
8391 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
8392 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29
8393 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
8394 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8395 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1
8396 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
8397 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30
8398 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
8399 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8400 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm1
8401 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
8402 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31
8403 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
8404 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8405 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1
8406 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
8407 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
8408 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm18
8409 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
8410 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12
8411 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
8412 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8413 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
8414 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8415 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2
8416 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
8417 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8418 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
8419 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
8420 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3
8421 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
8422 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
8423 ; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem))
8424 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
8425 ; AVX512DQ-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19
8426 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21
8427 ; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA
8428 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
8429 ; AVX512DQ-FCP-NEXT: vpord %zmm19, %zmm21, %zmm3 {%k1}
8430 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
8431 ; AVX512DQ-FCP-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
8432 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15
8433 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6
8434 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8
8435 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19
8436 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm15
8437 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
8438 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
8439 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm0
8440 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
8441 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm21, %ymm0
8442 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6
8443 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
8444 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
8445 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15]
8446 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8
8447 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8448 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8
8449 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8450 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
8451 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm15
8452 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
8453 ; AVX512DQ-FCP-NEXT: # xmm0 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
8454 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8455 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm8
8456 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
8457 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm13
8458 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
8459 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm8
8460 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
8461 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
8462 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
8463 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14
8464 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11
8465 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
8466 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11
8467 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
8468 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm8
8469 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8470 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8
8471 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8472 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
8473 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm14
8474 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8475 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8476 ; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8477 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8478 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
8479 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm8
8480 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
8481 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
8482 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm21
8483 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm8
8484 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm10
8485 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
8486 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm1
8487 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8488 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm1, %ymm17, %ymm1
8489 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8490 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
8491 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8492 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
8493 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8
8494 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
8495 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8
8496 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
8497 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
8498 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1
8499 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
8500 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9
8501 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
8502 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
8503 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
8504 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5
8505 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
8506 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload
8507 ; AVX512DQ-FCP-NEXT: # zmm6 = mem ^ (zmm7 & (zmm6 ^ mem))
8508 ; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
8509 ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
8510 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1}
8511 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload
8512 ; AVX512DQ-FCP-NEXT: # zmm15 = mem ^ (zmm7 & (zmm15 ^ mem))
8513 ; AVX512DQ-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload
8514 ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
8515 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1}
8516 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm7 & (zmm11 ^ zmm20))
8517 ; AVX512DQ-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5
8518 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8
8519 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1}
8520 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm24 ^ (zmm7 & (zmm14 ^ zmm24))
8521 ; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5
8522 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8
8523 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1}
8524 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm27 ^ (zmm7 & (zmm0 ^ zmm27))
8525 ; AVX512DQ-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5
8526 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8
8527 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1}
8528 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm7 & (zmm1 ^ zmm30))
8529 ; AVX512DQ-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5
8530 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8
8531 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1}
8532 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm19 ^ (zmm7 & (zmm4 ^ zmm19))
8533 ; AVX512DQ-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5
8534 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2
8535 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1}
8536 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8537 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
8538 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
8539 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
8540 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax)
8541 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax)
8542 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax)
8543 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
8544 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
8545 ; AVX512DQ-FCP-NEXT: addq $392, %rsp # imm = 0x188
8546 ; AVX512DQ-FCP-NEXT: vzeroupper
8547 ; AVX512DQ-FCP-NEXT: retq
8548 ;
8549 ; AVX512BW-LABEL: store_i8_stride8_vf64:
8550 ; AVX512BW: # %bb.0:
8551 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8552 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
8553 ; AVX512BW-NEXT: vmovdqa (%r10), %xmm0
8554 ; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm12
8555 ; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16
8556 ; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm15
8557 ; AVX512BW-NEXT: vmovdqa (%rax), %xmm2
8558 ; AVX512BW-NEXT: vmovdqa 16(%rax), %xmm13
8559 ; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17
8560 ; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm18
8561 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
8562 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
8563 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
8564 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
8565 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
8566 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
8567 ; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm1
8568 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
8569 ; AVX512BW-NEXT: vmovdqa (%r9), %xmm4
8570 ; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm19
8571 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm5
8572 ; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm21
8573 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
8574 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
8575 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
8576 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6
8577 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
8578 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
8579 ; AVX512BW-NEXT: vpermw %ymm7, %ymm6, %ymm7
8580 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14
8581 ; AVX512BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888
8582 ; AVX512BW-NEXT: kmovd %eax, %k1
8583 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1}
8584 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7
8585 ; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24
8586 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8
8587 ; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27
8588 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
8589 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
8590 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
8591 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
8592 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
8593 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
8594 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
8595 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
8596 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
8597 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
8598 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
8599 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9
8600 ; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm28
8601 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10
8602 ; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm29
8603 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
8604 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
8605 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
8606 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11
8607 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
8608 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
8609 ; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20
8610 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20
8611 ; AVX512BW-NEXT: movl $572662306, %eax # imm = 0x22222222
8612 ; AVX512BW-NEXT: kmovd %eax, %k2
8613 ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2}
8614 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA
8615 ; AVX512BW-NEXT: kmovd %eax, %k3
8616 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3}
8617 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
8618 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
8619 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
8620 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20
8621 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
8622 ; AVX512BW-NEXT: vpermw %ymm14, %ymm3, %ymm14
8623 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14
8624 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
8625 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
8626 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
8627 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22
8628 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
8629 ; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20
8630 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23
8631 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1}
8632 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
8633 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
8634 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
8635 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
8636 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
8637 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20
8638 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
8639 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
8640 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
8641 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14
8642 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14
8643 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
8644 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
8645 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
8646 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25
8647 ; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm22
8648 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
8649 ; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20
8650 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20
8651 ; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25
8652 ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2}
8653 ; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20
8654 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3}
8655 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
8656 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
8657 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
8658 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18
8659 ; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm23
8660 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
8661 ; AVX512BW-NEXT: vpermw %ymm15, %ymm3, %ymm15
8662 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15
8663 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
8664 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
8665 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
8666 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21
8667 ; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm19
8668 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
8669 ; AVX512BW-NEXT: vpermw %ymm18, %ymm6, %ymm18
8670 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
8671 ; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm26
8672 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1}
8673 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
8674 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
8675 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
8676 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
8677 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
8678 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
8679 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
8680 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
8681 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
8682 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15
8683 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15
8684 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
8685 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
8686 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
8687 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24
8688 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
8689 ; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21
8690 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21
8691 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2}
8692 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3}
8693 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
8694 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
8695 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
8696 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
8697 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
8698 ; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18
8699 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
8700 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
8701 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
8702 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
8703 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24
8704 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
8705 ; AVX512BW-NEXT: vpermw %ymm21, %ymm6, %ymm21
8706 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27
8707 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1}
8708 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
8709 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
8710 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
8711 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
8712 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
8713 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
8714 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
8715 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
8716 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
8717 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18
8718 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
8719 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
8720 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
8721 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
8722 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28
8723 ; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm24
8724 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
8725 ; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21
8726 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21
8727 ; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm28
8728 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2}
8729 ; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm21
8730 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3}
8731 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
8732 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
8733 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
8734 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17
8735 ; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm27
8736 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
8737 ; AVX512BW-NEXT: vpermw %ymm16, %ymm3, %ymm16
8738 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16
8739 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
8740 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
8741 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
8742 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25
8743 ; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17
8744 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
8745 ; AVX512BW-NEXT: vpermw %ymm22, %ymm6, %ymm22
8746 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25
8747 ; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm22
8748 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1}
8749 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
8750 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
8751 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
8752 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
8753 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
8754 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
8755 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
8756 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
8757 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
8758 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16
8759 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16
8760 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
8761 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
8762 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
8763 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
8764 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
8765 ; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19
8766 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19
8767 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2}
8768 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3}
8769 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
8770 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
8771 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
8772 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
8773 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
8774 ; AVX512BW-NEXT: vpermw %ymm19, %ymm3, %ymm19
8775 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19
8776 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
8777 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
8778 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
8779 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23
8780 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
8781 ; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20
8782 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20
8783 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1}
8784 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
8785 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
8786 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
8787 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
8788 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
8789 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23
8790 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
8791 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
8792 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
8793 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19
8794 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19
8795 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
8796 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
8797 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
8798 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25
8799 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
8800 ; AVX512BW-NEXT: vpermw %ymm23, %ymm11, %ymm23
8801 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23
8802 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2}
8803 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3}
8804 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
8805 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
8806 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
8807 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13
8808 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
8809 ; AVX512BW-NEXT: vpermw %ymm12, %ymm3, %ymm12
8810 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
8811 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
8812 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
8813 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
8814 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
8815 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
8816 ; AVX512BW-NEXT: vpermw %ymm13, %ymm6, %ymm13
8817 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13
8818 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1}
8819 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
8820 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
8821 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
8822 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
8823 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
8824 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20
8825 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
8826 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
8827 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
8828 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12
8829 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12
8830 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
8831 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
8832 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
8833 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20
8834 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
8835 ; AVX512BW-NEXT: vpermw %ymm17, %ymm11, %ymm17
8836 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17
8837 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2}
8838 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3}
8839 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
8840 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
8841 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
8842 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2
8843 ; AVX512BW-NEXT: vpermw %ymm0, %ymm3, %ymm0
8844 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
8845 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8846 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8847 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
8848 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
8849 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
8850 ; AVX512BW-NEXT: vpermw %ymm2, %ymm6, %ymm2
8851 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
8852 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
8853 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
8854 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
8855 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
8856 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
8857 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
8858 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
8859 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
8860 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8861 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
8862 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8863 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
8864 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
8865 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
8866 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
8867 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
8868 ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
8869 ; AVX512BW-NEXT: vpermw %ymm3, %ymm11, %ymm3
8870 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
8871 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
8872 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2}
8873 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
8874 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8875 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
8876 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax)
8877 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax)
8878 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax)
8879 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax)
8880 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax)
8881 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax)
8882 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax)
8883 ; AVX512BW-NEXT: vzeroupper
8884 ; AVX512BW-NEXT: retq
8886 ; AVX512BW-FCP-LABEL: store_i8_stride8_vf64:
8887 ; AVX512BW-FCP: # %bb.0:
8888 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8889 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8890 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
8891 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20
8893 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17
8894 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
8895 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21
8896 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18
8897 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
8898 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
8899 ; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
8900 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
8901 ; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
8902 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6
8903 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8904 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
8905 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1
8906 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
8907 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22
8908 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19
8909 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm7
8910 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23
8911 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24
8912 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
8913 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
8914 ; AVX512BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222
8915 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
8916 ; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1}
8917 ; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm6
8918 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25
8919 ; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm9
8920 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26
8921 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
8922 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10
8923 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27
8924 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm11
8925 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28
8926 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
8927 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
8928 ; AVX512BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12
8929 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
8930 ; AVX512BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888
8931 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
8932 ; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2}
8933 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
8934 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16
8935 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16
8936 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29
8937 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
8938 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15
8939 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15
8940 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
8941 ; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1}
8942 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
8943 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
8944 ; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16
8945 ; AVX512BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2}
8946 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
8947 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18
8948 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
8949 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18
8950 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29
8951 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17
8952 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17
8953 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17
8954 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30
8955 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
8956 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31
8957 ; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1}
8958 ; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0
8959 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
8960 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
8961 ; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18
8962 ; AVX512BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2}
8963 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
8964 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24
8965 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24
8966 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25
8967 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
8968 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19
8969 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19
8970 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
8971 ; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1}
8972 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
8973 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
8974 ; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24
8975 ; AVX512BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2}
8976 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
8977 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21
8978 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
8979 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21
8980 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20
8981 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20
8982 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20
8983 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25
8984 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
8985 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23
8986 ; AVX512BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1}
8987 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26
8988 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
8989 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
8990 ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21
8991 ; AVX512BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2}
8992 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
8993 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22
8994 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
8995 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22
8996 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27
8997 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8998 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
8999 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22
9000 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
9001 ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1}
9002 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
9003 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23
9004 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
9005 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23
9006 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
9007 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
9008 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0
9009 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
9010 ; AVX512BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1}
9011 ; AVX512BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9012 ; AVX512BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9013 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23
9014 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4
9015 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23
9016 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
9017 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
9018 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
9019 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5
9020 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
9021 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4
9022 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
9023 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7
9024 ; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1}
9025 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
9026 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
9027 ; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8
9028 ; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2}
9029 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
9030 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
9031 ; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4
9032 ; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2}
9033 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
9034 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
9035 ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5
9036 ; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2}
9037 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA
9038 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
9039 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
9040 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1}
9041 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1}
9042 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1}
9043 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1}
9044 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1}
9045 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1}
9046 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1}
9047 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9048 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
9049 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
9050 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
9051 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax)
9052 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax)
9053 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax)
9054 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax)
9055 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
9056 ; AVX512BW-FCP-NEXT: vzeroupper
9057 ; AVX512BW-FCP-NEXT: retq
9059 ; AVX512DQ-BW-LABEL: store_i8_stride8_vf64:
9060 ; AVX512DQ-BW: # %bb.0:
9061 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
9062 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
9063 ; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm0
9064 ; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm12
9065 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16
9066 ; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm15
9067 ; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm2
9068 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rax), %xmm13
9069 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17
9070 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm18
9071 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
9072 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
9073 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
9074 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
9075 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
9076 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
9077 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm1
9078 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
9079 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4
9080 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm19
9081 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm5
9082 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm21
9083 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
9084 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
9085 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
9086 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6
9087 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
9088 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
9089 ; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm6, %ymm7
9090 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14
9091 ; AVX512DQ-BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888
9092 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
9093 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1}
9094 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7
9095 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24
9096 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8
9097 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27
9098 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
9099 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
9100 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
9101 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
9102 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
9103 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
9104 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
9105 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
9106 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
9107 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
9108 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
9109 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9
9110 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm28
9111 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10
9112 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm29
9113 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
9114 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
9115 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
9116 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11
9117 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
9118 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
9119 ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20
9120 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20
9121 ; AVX512DQ-BW-NEXT: movl $572662306, %eax # imm = 0x22222222
9122 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
9123 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2}
9124 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA
9125 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
9126 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3}
9127 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
9128 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
9129 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
9130 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20
9131 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
9132 ; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm3, %ymm14
9133 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14
9134 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
9135 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
9136 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
9137 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22
9138 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
9139 ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20
9140 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23
9141 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1}
9142 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
9143 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
9144 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
9145 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
9146 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
9147 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20
9148 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
9149 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
9150 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
9151 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14
9152 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14
9153 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
9154 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
9155 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
9156 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25
9157 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm22
9158 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
9159 ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20
9160 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20
9161 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25
9162 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2}
9163 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20
9164 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3}
9165 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
9166 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
9167 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
9168 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18
9169 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm23
9170 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
9171 ; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm3, %ymm15
9172 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15
9173 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
9174 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
9175 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
9176 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21
9177 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm19
9178 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
9179 ; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm6, %ymm18
9180 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
9181 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm26
9182 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1}
9183 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
9184 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
9185 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
9186 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
9187 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
9188 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
9189 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
9190 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
9191 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
9192 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15
9193 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15
9194 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
9195 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
9196 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
9197 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24
9198 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
9199 ; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21
9200 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21
9201 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2}
9202 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3}
9203 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
9204 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
9205 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
9206 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
9207 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
9208 ; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18
9209 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
9210 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
9211 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
9212 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
9213 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24
9214 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
9215 ; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm6, %ymm21
9216 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27
9217 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1}
9218 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
9219 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
9220 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
9221 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
9222 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
9223 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21
9224 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
9225 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
9226 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
9227 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18
9228 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18
9229 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
9230 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
9231 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
9232 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28
9233 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm24
9234 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
9235 ; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21
9236 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21
9237 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm28
9238 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2}
9239 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm21
9240 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3}
9241 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
9242 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
9243 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
9244 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17
9245 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm27
9246 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
9247 ; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm3, %ymm16
9248 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16
9249 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
9250 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
9251 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
9252 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25
9253 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm17
9254 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
9255 ; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm6, %ymm22
9256 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25
9257 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm22
9258 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1}
9259 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
9260 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
9261 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
9262 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
9263 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
9264 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
9265 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
9266 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
9267 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
9268 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16
9269 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16
9270 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
9271 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
9272 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
9273 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
9274 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
9275 ; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19
9276 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19
9277 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2}
9278 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3}
9279 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
9280 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
9281 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
9282 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
9283 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
9284 ; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm3, %ymm19
9285 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19
9286 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
9287 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
9288 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
9289 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23
9290 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
9291 ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20
9292 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20
9293 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1}
9294 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
9295 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
9296 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
9297 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
9298 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
9299 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23
9300 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
9301 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
9302 ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
9303 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19
9304 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm11, %ymm23
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3}
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm3, %ymm12
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm6, %ymm13
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm11, %ymm17
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3}
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm3, %ymm0
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm6, %ymm2
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm11, %ymm3
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride8_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
; AVX512DQ-BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2}
; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
%in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
%in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64
%in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64
%in.vec5 = load <64 x i8>, ptr %in.vecptr5, align 64
%in.vec6 = load <64 x i8>, ptr %in.vecptr6, align 64
%in.vec7 = load <64 x i8>, ptr %in.vecptr7, align 64
%1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%3 = shufflevector <64 x i8> %in.vec4, <64 x i8> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%4 = shufflevector <64 x i8> %in.vec6, <64 x i8> %in.vec7, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%5 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
%6 = shufflevector <128 x i8> %3, <128 x i8> %4, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
%7 = shufflevector <256 x i8> %5, <256 x i8> %6, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447, i32 448, i32 449, i32 450, i32 451, i32 452, i32 453, i32 454, i32 455, i32 456, i32 457, i32 458, i32 459, i32 460, i32 461, i32 462, i32 463, i32 464, i32 465, i32 466, i32 467, i32 468, i32 469, i32 470, i32 471, i32 472, i32 473, i32 474, i32 475, i32 476, i32 477, i32 478, i32 479, i32 480, i32 481, i32 482, i32 483, i32 484, i32 485, i32 486, i32 487, i32 488, i32 489, i32 490, i32 491, i32 492, i32 493, i32 494, i32 495, i32 496, i32 497, i32 498, i32 499, i32 500, i32 501, i32 502, i32 503, i32 504, i32 505, i32 506, i32 507, i32 508, i32 509, i32 510, i32 511>
%interleaved.vec = shufflevector <512 x i8> %7, <512 x i8> poison, <512 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 448, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 449, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 450, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 451, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 452, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 453, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 454, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 455, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 456, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 457, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 458, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 459, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 460, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 461, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 462, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 463, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 464, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 465, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 466, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 467, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 468, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 469, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 470, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 471, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 472, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 473, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 474, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 475, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 476, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 477, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 478, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 479, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 480, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 481, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 482, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 483, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 484, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 485, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 486, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 487, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 488, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 489, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 490, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 491, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 492, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 493, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 494, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 495, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 496, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 497, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 498, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 499, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 500, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 501, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 502, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 503, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 504, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 505, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 506, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 507, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 508, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 509, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 510, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447, i32 511>
store <512 x i8> %interleaved.vec, ptr %out.vec, align 64