; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
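
; These tests exercise interleaved (strided) vector accesses: a single wide
; load or store combined with strided shufflevector masks, over f64, i64 and
; i8 elements.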
6 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
7 ; AVX-LABEL: load_factorf64_4:
9 ; AVX-NEXT: vmovupd (%rdi), %ymm0
10 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1
11 ; AVX-NEXT: vmovupd 64(%rdi), %ymm2
12 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
13 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
14 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
15 ; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
16 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
17 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
18 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
19 ; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
20 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
23 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
24 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
25 %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
26 %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
27 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
28 %add1 = fadd <4 x double> %strided.v0, %strided.v1
29 %add2 = fadd <4 x double> %add1, %strided.v2
30 %add3 = fadd <4 x double> %add2, %strided.v3
  ret <4 x double> %add3
}
34 define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
35 ; AVX-LABEL: load_factorf64_2:
37 ; AVX-NEXT: vmovupd (%rdi), %ymm0
38 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1
39 ; AVX-NEXT: vmovupd 64(%rdi), %ymm2
40 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
41 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
42 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
43 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
44 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
45 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
46 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
; AVX-NEXT:    retq
49 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
50 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
51 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}
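
; Both strided shuffles below use the same mask (lanes 0,4,8,12), so only one
; of the four interleaved factors is actually needed.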
56 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
57 ; AVX-LABEL: load_factorf64_1:
59 ; AVX-NEXT: vmovupd (%rdi), %ymm0
60 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1
61 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
62 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
63 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
66 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
67 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
68 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}
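
; Integer (i64) variant of the factor-4 interleaved load above.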
73 define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
74 ; AVX1-LABEL: load_factori64_4:
76 ; AVX1-NEXT: vmovups (%rdi), %ymm0
77 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
78 ; AVX1-NEXT: vmovups 64(%rdi), %ymm2
79 ; AVX1-NEXT: vmovups 96(%rdi), %ymm3
80 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
81 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
82 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
83 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
84 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
85 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
86 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
87 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
88 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1
89 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
90 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm4
91 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
92 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
93 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
94 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
95 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
96 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
97 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
101 ; AVX2OR512-LABEL: load_factori64_4:
102 ; AVX2OR512: # %bb.0:
103 ; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
104 ; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1
105 ; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2
106 ; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3
107 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
108 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
109 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
110 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
111 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
112 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
113 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
114 ; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
115 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
116 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
117 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
118 ; AVX2OR512-NEXT: retq
119 %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
120 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
121 %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
122 %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
123 %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
124 %add1 = add <4 x i64> %strided.v0, %strided.v1
125 %add2 = add <4 x i64> %add1, %strided.v2
  %add3 = add <4 x i64> %add2, %strided.v3
  ret <4 x i64> %add3
}
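
; Factor-4 interleaved store: the four <4 x double> operands are concatenated
; and interleaved element-wise by a single shufflevector before the wide store.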
130 define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
131 ; AVX1-LABEL: store_factorf64_4:
133 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
134 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
135 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
136 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
137 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
138 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
139 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
140 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
141 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
142 ; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
143 ; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
144 ; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
148 ; AVX2-LABEL: store_factorf64_4:
150 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
151 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
152 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
153 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
154 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
155 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
156 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
157 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
158 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
159 ; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
160 ; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
161 ; AVX2-NEXT: vmovups %ymm2, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
165 ; AVX512-LABEL: store_factorf64_4:
167 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
168 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
169 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
170 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
171 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
172 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
173 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
174 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
175 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
176 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
177 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
178 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
181 %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
182 %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
183 %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
  ret void
}
188 define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
189 ; AVX1-LABEL: store_factori64_4:
191 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
192 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
193 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
194 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
195 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
196 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
197 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
198 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
199 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
200 ; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
201 ; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
202 ; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
206 ; AVX2-LABEL: store_factori64_4:
208 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
209 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
210 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
211 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
212 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
213 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
214 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
215 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
216 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
217 ; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
218 ; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
219 ; AVX2-NEXT: vmovups %ymm2, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
223 ; AVX512-LABEL: store_factori64_4:
225 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
226 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
227 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
228 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
229 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
230 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
231 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
232 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
233 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
234 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
235 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
236 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
239 %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
240 %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
241 %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
  ret void
}
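
; i8 variants of the factor-4 interleaved store, with 32 and 16 elements per
; operand.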
247 define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
248 ; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
250 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
251 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
252 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
253 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
254 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
255 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
256 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
257 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
258 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
259 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
260 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
261 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
262 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
263 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
264 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
265 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
266 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
267 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
268 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
269 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
270 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
271 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
272 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
273 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
274 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
275 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
276 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
277 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
278 ; AVX1-NEXT: vmovaps %ymm2, 32(%rdi)
279 ; AVX1-NEXT: vmovaps %ymm3, (%rdi)
280 ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi)
281 ; AVX1-NEXT: vmovaps %ymm1, 64(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
285 ; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
287 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
288 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
289 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
290 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
291 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
292 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
293 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
294 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
295 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
296 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
297 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
298 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
299 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
300 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
301 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
302 ; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
306 ; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
308 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
309 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
310 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
311 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
312 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
313 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
314 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
315 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
316 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
317 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
318 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
319 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
320 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
321 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
322 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
323 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
326 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
327 %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
328 %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
  store <128 x i8> %interleaved.vec, <128 x i8>* %p
  ret void
}
333 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
334 ; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
336 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
337 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
338 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
339 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
340 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
341 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
342 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
343 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
344 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi)
345 ; AVX1-NEXT: vmovdqa %xmm4, 32(%rdi)
346 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
350 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
352 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
353 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
354 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
355 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
356 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
357 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
358 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
359 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
360 ; AVX2-NEXT: vmovdqa %xmm0, 48(%rdi)
361 ; AVX2-NEXT: vmovdqa %xmm4, 32(%rdi)
362 ; AVX2-NEXT: vmovdqa %xmm1, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
366 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
368 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
369 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
370 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
371 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
372 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
373 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
374 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
375 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
376 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
377 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
378 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
379 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
382 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
383 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
384 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
  store <64 x i8> %interleaved.vec, <64 x i8>* %p
  ret void
}
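
; Factor-4 interleaved i8 loads: the wide vector is split into four strided
; subvectors which are then combined arithmetically (vf8) or compared
; (vf16, vf32).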
389 define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
390 ; AVX-LABEL: interleaved_load_vf8_i8_stride4:
392 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
393 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
394 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
395 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
396 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
397 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0]
398 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
399 ; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
400 ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
401 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
402 ; AVX-NEXT: vpaddw %xmm0, %xmm4, %xmm0
403 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
404 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
405 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
406 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
407 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7]
408 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
409 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
410 ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
411 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
412 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
413 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
416 %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
417 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
418 %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
419 %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
420 %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
422 %add1 = add <8 x i8> %v1, %v2
423 %add2 = add <8 x i8> %v4, %v3
  %add3 = mul <8 x i8> %add1, %add2
  ret <8 x i8> %add3
}
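
; As above, but with 16 lanes per strided subvector; the results are compared,
; yielding a <16 x i1>.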
428 define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
429 ; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
431 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
432 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
433 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
434 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
435 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
436 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
437 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
438 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
439 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
440 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
441 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
442 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
443 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
444 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
445 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
446 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
447 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
448 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
449 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
450 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
451 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
452 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
453 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
454 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
455 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
456 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
457 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
458 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
459 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
460 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
461 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
462 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
463 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
464 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
465 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
466 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
467 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
468 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
469 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
470 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
471 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
472 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
473 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
477 ; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
479 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
480 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
481 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
482 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
483 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
484 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
485 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
486 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
487 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
488 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
489 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
490 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
491 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
492 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
493 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
494 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
495 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
496 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
497 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
498 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
499 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
500 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
501 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
502 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
503 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
504 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
505 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
506 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
507 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
508 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
509 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
510 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
511 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
512 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
513 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
514 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
515 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
516 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
517 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
518 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
519 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
520 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
521 ; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
525 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
527 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
528 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
529 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
530 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
531 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
532 ; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
533 ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
534 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
535 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
536 ; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6
537 ; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5
538 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
539 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
540 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
541 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
542 ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm5
543 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
544 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
545 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7
546 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
547 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
548 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
549 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
550 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm7
551 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm6
552 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
553 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
554 ; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm4
555 ; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm7
556 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
557 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
558 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
559 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
560 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
561 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
562 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
563 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
564 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
565 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
566 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
567 ; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
568 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm4, %k1
569 ; AVX512-NEXT: kxnorw %k1, %k0, %k0
570 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
571 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
574 %wide.vec = load <64 x i8>, <64 x i8>* %ptr
575 %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
576 %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
577 %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
578 %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
580 %cmp1 = icmp eq <16 x i8> %v1, %v2
581 %cmp2 = icmp eq <16 x i8> %v3, %v4
  %res = icmp eq <16 x i1> %cmp1, %cmp2
  ret <16 x i1> %res
}
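
; 32-lane variant of the previous test, yielding a <32 x i1>.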
587 define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
588 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
590 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
591 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm11
592 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
593 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm12
594 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm3
595 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
596 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
597 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14
598 ; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm4
599 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6
600 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm5
601 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
602 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
603 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
604 ; AVX1-NEXT: vmovdqa (%rdi), %xmm13
605 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15
606 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
607 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5
608 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm1
609 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
610 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
611 ; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm1
612 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
613 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
614 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
615 ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
616 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
617 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
618 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
619 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
620 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
621 ; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
622 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4
623 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
624 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
625 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
626 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3
627 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
628 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
629 ; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3
630 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
631 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
632 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
633 ; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
634 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
635 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
636 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
637 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
638 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
639 ; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
640 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4
641 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
642 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
643 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
644 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3
645 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
646 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
647 ; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3
648 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
649 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
650 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
651 ; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
652 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
653 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
654 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
655 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
656 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
657 ; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
658 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4
659 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
660 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
661 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
662 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3
663 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
664 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
665 ; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3
666 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
667 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
668 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
669 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
670 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1
671 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
672 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
673 ; AVX1-NEXT: vpcmpeqb %xmm9, %xmm8, %xmm2
674 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
675 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
676 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3
677 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
678 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm10, %xmm0
679 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
680 ; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
684 ; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
686 ; AVX2-NEXT: vmovdqa (%rdi), %xmm9
687 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm11
688 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12
689 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm13
690 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
691 ; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm4
692 ; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm5
693 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
694 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
695 ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm5
696 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm7
697 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
698 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
699 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm14
700 ; AVX2-NEXT: vpshufb %xmm6, %xmm14, %xmm7
701 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = mem[2,3,0,1]
702 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
703 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm6
704 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
705 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
706 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm6
707 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm1
708 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = mem[2,3,0,1]
709 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7
710 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0
711 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
712 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
713 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
714 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
715 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
716 ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm0
717 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm2
718 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
719 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
720 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm3
721 ; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm4
722 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
723 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
724 ; AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm3
725 ; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1
726 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
727 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
728 ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3
729 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2
730 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
731 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
732 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
733 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
734 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8
735 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
736 ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm1
737 ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm2
738 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
739 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
740 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm3
741 ; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm4
742 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
743 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
744 ; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm3
745 ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0
746 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
747 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
748 ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3
749 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2
750 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
751 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
752 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
753 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
754 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
755 ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm2
756 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm3
757 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
758 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
759 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm4
760 ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
761 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
762 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
763 ; AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm2
764 ; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1
765 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
766 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
767 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm2
768 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3
769 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
770 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
771 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
772 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
773 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm10, %ymm0
774 ; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
778 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
780 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
781 ; AVX512-NEXT: vmovdqa (%rdi), %xmm10
782 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11
783 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
784 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
785 ; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
786 ; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm5
787 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
788 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
789 ; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm5
790 ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm7
791 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
792 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
793 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
794 ; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm7
795 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = mem[2,3,0,1]
796 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5
797 ; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm6
798 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
799 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9
800 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6
801 ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm1
802 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = mem[2,3,0,1]
803 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
804 ; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm0
805 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
806 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
807 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
808 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
809 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
810 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1
811 ; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2
812 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
813 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
814 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
815 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4
816 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
817 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
818 ; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
819 ; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
820 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
821 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
822 ; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
823 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
824 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
825 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
826 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
827 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
828 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
829 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1
830 ; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2
831 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
832 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
833 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
834 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4
835 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
836 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
837 ; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
838 ; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
839 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
840 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
841 ; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
842 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
843 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
844 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
845 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
846 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
847 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
848 ; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm2
849 ; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
850 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
851 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
852 ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
853 ; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm0
854 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
855 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
856 ; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
857 ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
858 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
859 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
860 ; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm2
861 ; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
862 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
863 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
864 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
865 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
866 ; AVX512-NEXT: vpcmpeqb %zmm9, %zmm8, %k0
867 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
868 ; AVX512-NEXT: kxnord %k1, %k0, %k0
869 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
872 %wide.vec = load <128 x i8>, <128 x i8>* %ptr
873 %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
875 %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
877 %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
879 %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
881 %cmp1 = icmp eq <32 x i8> %v1, %v2
882 %cmp2 = icmp eq <32 x i8> %v3, %v4
883 %res = icmp eq <32 x i1> %cmp1, %cmp2
884 ret <32 x i1> %res
885 }
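; Interleaved store, 4 x <8 x i8>, stride 4: the operands are concatenated pairwise and one 32-element shuffle produces the interleaved <32 x i8> value stored to %p.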
888 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
889 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
890 ; AVX: # %bb.0:
891 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
892 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
893 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
894 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
895 ; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
896 ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
897 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
898 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
899 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
900 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
901 ; AVX-NEXT: vmovdqa %xmm2, (%rdi)
902 ; AVX-NEXT: retq
903 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
904 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
905 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
906 store <32 x i8> %interleaved.vec, <32 x i8>* %p
907 ret void
908 }
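; Interleaved load, 32 x i8, stride 3: three strided <32 x i8> vectors are extracted from one <96 x i8> wide load and added together.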
910 define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
911 ; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
912 ; AVX1: # %bb.0:
913 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
914 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
915 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
916 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
917 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
918 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
919 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
920 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
921 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
922 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
923 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
924 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
925 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
926 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
927 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
928 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
929 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
930 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
931 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
932 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
933 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
934 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
935 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
936 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
937 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
938 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
939 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
940 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
941 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
942 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
943 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
944 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
945 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
946 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
947 ; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
948 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
949 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
950 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
951 ; AVX1-NEXT: retq
952 ;
953 ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
954 ; AVX2OR512: # %bb.0:
955 ; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0
956 ; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1
957 ; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2
958 ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
959 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
960 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
961 ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
962 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
963 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
964 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
965 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
966 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
967 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
968 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
969 ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
970 ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
971 ; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1
972 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
973 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
974 ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
975 ; AVX2OR512-NEXT: retq
976 %wide.vec = load <96 x i8>, <96 x i8>* %ptr
977 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
978 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
979 %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
980 %add1 = add <32 x i8> %v1, %v2
981 %add2 = add <32 x i8> %v3, %add1
982 ret <32 x i8> %add2
983 }
985 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
986 ; AVX-LABEL: interleaved_load_vf16_i8_stride3:
987 ; AVX: # %bb.0:
988 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
989 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
990 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
991 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
992 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
993 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
994 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
995 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
996 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
997 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
998 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
999 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1000 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
1001 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
1002 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1003 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1004 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1005 ; AVX-NEXT: retq
1006 %wide.vec = load <48 x i8>, <48 x i8>* %ptr
1007 %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
1008 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
1009 %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
1010 %add1 = add <16 x i8> %v1, %v2
1011 %add2 = add <16 x i8> %v3, %add1
1012 ret <16 x i8> %add2
1013 }
1015 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
1016 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
1017 ; AVX: # %bb.0:
1018 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1019 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1020 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
1021 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
1022 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
1023 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
1024 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1025 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
1026 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
1027 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1028 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1029 ; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
1030 ; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
1031 ; AVX-NEXT: retq
1032 %wide.vec = load <24 x i8>, <24 x i8>* %ptr
1033 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
1034 %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
1035 %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
1036 %add1 = add <8 x i8> %v1, %v2
1037 %add2 = add <8 x i8> %v3, %add1
1038 ret <8 x i8> %add2
1039 }
1041 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
1042 ; AVX-LABEL: interleaved_store_vf8_i8_stride3:
1043 ; AVX: # %bb.0:
1044 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1045 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1046 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1047 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1048 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
1049 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
1050 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
1051 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
1052 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
1053 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
1054 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1055 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
1056 ; AVX-NEXT: vmovdqu %xmm2, (%rdi)
1057 ; AVX-NEXT: retq
1058 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1059 %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1060 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1061 store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
1062 ret void
1063 }
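; Interleaved store, 3 x <16 x i8>, stride 3, into a <48 x i8> block at %p.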
1065 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
1066 ; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
1067 ; AVX1: # %bb.0:
1068 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1069 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1070 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1071 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1072 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1073 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1074 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1075 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1076 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1077 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1078 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1079 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1080 ; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
1081 ; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
1082 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
1083 ; AVX1-NEXT: retq
1084 ;
1085 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
1086 ; AVX2: # %bb.0:
1087 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1088 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1089 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1090 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1091 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1092 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1093 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1094 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1095 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1096 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1097 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1098 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1099 ; AVX2-NEXT: vmovdqu %xmm0, 16(%rdi)
1100 ; AVX2-NEXT: vmovdqu %xmm1, (%rdi)
1101 ; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi)
1102 ; AVX2-NEXT: retq
1103 ;
1104 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
1105 ; AVX512: # %bb.0:
1106 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1107 ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1108 ; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1109 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1110 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1111 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1112 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1113 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1114 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1115 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1116 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1117 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1118 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1119 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
1120 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
1121 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi)
1122 ; AVX512-NEXT: vzeroupper
1123 ; AVX512-NEXT: retq
1124 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1125 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1126 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1127 store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
1128 ret void
1129 }
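; Interleaved store, 3 x <32 x i8>, stride 3, into a <96 x i8> block at %p.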
1131 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
1132 ; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
1133 ; AVX1: # %bb.0:
1134 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1135 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1136 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1137 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
1138 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1139 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1140 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1141 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1142 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1143 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1144 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1145 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1146 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1147 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1148 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1149 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1150 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1151 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1152 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1153 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1154 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1155 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
1156 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
1157 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1158 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1159 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
1160 ; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi)
1161 ; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
1162 ; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi)
1163 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
1164 ; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
1165 ; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
1166 ; AVX1-NEXT: vzeroupper
1167 ; AVX1-NEXT: retq
1168 ;
1169 ; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
1170 ; AVX2: # %bb.0:
1171 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1172 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1173 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1174 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1175 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1176 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1177 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1178 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1179 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1180 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1181 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1182 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1183 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1184 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1185 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1186 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1187 ; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi)
1188 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
1189 ; AVX2-NEXT: vzeroupper
1190 ; AVX2-NEXT: retq
1191 ;
1192 ; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
1193 ; AVX512: # %bb.0:
1194 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1195 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1196 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1197 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1198 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1199 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1200 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1201 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1202 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1203 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1204 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1205 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1206 ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1207 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1208 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1209 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1210 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
1211 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
1212 ; AVX512-NEXT: vzeroupper
1213 ; AVX512-NEXT: retq
1214 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1215 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1216 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
1217 store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
1218 ret void
1219 }
1221 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
1222 ; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
1223 ; AVX1: # %bb.0:
1224 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
1225 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1226 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1227 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
1228 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1229 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1230 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10
1231 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm10[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1232 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1233 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
1234 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1235 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1236 ; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1237 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
1238 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1239 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1240 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1241 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1242 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
1243 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1244 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1245 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1246 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1247 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1248 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1249 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1250 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
1251 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
1252 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1253 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1254 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1255 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1256 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1257 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
1258 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1259 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1260 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1261 ; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
1262 ; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1263 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
1264 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1265 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1266 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm14
1267 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm12
1268 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1269 ; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7
1270 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm1
1271 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1272 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm11
1273 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm9
1274 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm2
1275 ; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm3
1276 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm4
1277 ; AVX1-NEXT: vmovdqu %xmm1, 80(%rdi)
1278 ; AVX1-NEXT: vmovdqu %xmm7, 64(%rdi)
1279 ; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
1280 ; AVX1-NEXT: vmovdqu %xmm14, (%rdi)
1281 ; AVX1-NEXT: vmovdqu %xmm0, 48(%rdi)
1282 ; AVX1-NEXT: vmovdqu %xmm12, 32(%rdi)
1283 ; AVX1-NEXT: vmovdqu %xmm4, 176(%rdi)
1284 ; AVX1-NEXT: vmovdqu %xmm3, 160(%rdi)
1285 ; AVX1-NEXT: vmovdqu %xmm5, 112(%rdi)
1286 ; AVX1-NEXT: vmovdqu %xmm11, 96(%rdi)
1287 ; AVX1-NEXT: vmovdqu %xmm2, 144(%rdi)
1288 ; AVX1-NEXT: vmovdqu %xmm9, 128(%rdi)
1289 ; AVX1-NEXT: vzeroupper
1290 ; AVX1-NEXT: retq
1291 ;
1292 ; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
1293 ; AVX2: # %bb.0:
1294 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1295 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1296 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1297 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1298 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1299 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
1300 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1301 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1302 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
1303 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
1304 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1305 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1306 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
1307 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1308 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
1309 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20]
1310 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
1311 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1312 ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1313 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
1314 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
1315 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
1316 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
1317 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm4
1318 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
1319 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
1320 ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
1321 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
1322 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1323 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
1324 ; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi)
1325 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1326 ; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
1327 ; AVX2-NEXT: vmovdqu %ymm4, 96(%rdi)
1328 ; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
1329 ; AVX2-NEXT: vzeroupper
1330 ; AVX2-NEXT: retq
1331 ;
1332 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
1333 ; AVX512: # %bb.0:
1334 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
1335 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
1336 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1337 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1338 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1339 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1340 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1341 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
1342 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
1343 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1344 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1345 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1346 ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
1347 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
1348 ; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
1349 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1350 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1351 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm7
1352 ; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
1353 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1354 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1355 ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
1356 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1357 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1358 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1
1359 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1360 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
1361 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
1362 ; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi)
1363 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
1364 ; AVX512-NEXT: vzeroupper
1365 ; AVX512-NEXT: retq
1366 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1367 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1368 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
1369 store <192 x i8> %3, <192 x i8>* %p, align 1
1370 ret void
1371 }
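; Interleaved load, 64 x i8, stride 3: three strided <64 x i8> vectors are extracted from one <192 x i8> wide load and added together.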
1373 define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
1374 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
1375 ; AVX1: # %bb.0:
1376 ; AVX1-NEXT: vmovdqu (%rdi), %xmm11
1377 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm10
1378 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8
1379 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
1380 ; AVX1-NEXT: vmovdqu 64(%rdi), %xmm12
1381 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm9
1382 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6
1383 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm14
1384 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm13
1385 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm5
1386 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1
1387 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm15
1388 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1389 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1390 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1391 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm11
1392 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
1393 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm10
1394 ; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12
1395 ; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14
1396 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
1397 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm2
1398 ; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm0
1399 ; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm7
1400 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4
1401 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1402 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
1403 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm0[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1404 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
1405 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1406 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
1407 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1408 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm1
1409 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
1410 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
1411 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1412 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14
1413 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1414 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1415 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm10
1416 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1417 ; AVX1-NEXT: # ymm12 = mem[0,1,0,1]
1418 ; AVX1-NEXT: vandnps %ymm10, %ymm12, %ymm10
1419 ; AVX1-NEXT: vandps %ymm12, %ymm14, %ymm14
1420 ; AVX1-NEXT: vorps %ymm10, %ymm14, %ymm10
1421 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm14
1422 ; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14
1423 ; AVX1-NEXT: vandps %ymm12, %ymm1, %ymm1
1424 ; AVX1-NEXT: vorps %ymm14, %ymm1, %ymm1
1425 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1426 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
1427 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm15[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1428 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
1429 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1430 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10]
1431 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm8[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1432 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
1433 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1434 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
1435 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1436 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
1437 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
1438 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
1439 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1440 ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
1441 ; AVX1-NEXT: vpaddb %xmm12, %xmm10, %xmm3
1442 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1443 ; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
1444 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
1445 ; AVX1-NEXT: vpaddb %xmm9, %xmm1, %xmm1
1446 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1447 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1448 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1449 ; AVX1-NEXT: retq
1450 ;
1451 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
1452 ; AVX2: # %bb.0:
1453 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1454 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1455 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1456 ; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3
1457 ; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4
1458 ; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5
1459 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1460 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1461 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1462 ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1463 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
1464 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
1465 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1466 ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
1467 ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1468 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
1469 ; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
1470 ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5
1471 ; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
1472 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1473 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1474 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
1475 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1476 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1477 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
1478 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1479 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1480 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1481 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1482 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
1483 ; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1
1484 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
1485 ; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2
1486 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
1487 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1488 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1489 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
1490 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1491 ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
1492 ; AVX2-NEXT: retq
1493 ;
1494 ; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
1495 ; AVX512: # %bb.0:
1496 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1497 ; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
1498 ; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2
1499 ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3
1500 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4
1501 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
1502 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1503 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1504 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1505 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1506 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
1507 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
1508 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1509 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
1510 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1511 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1512 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1513 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
1514 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1515 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
1516 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1517 ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
1518 ; AVX512-NEXT: kmovq %rax, %k1
1519 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1520 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5
1521 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1522 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1523 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1524 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1525 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1526 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1527 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1528 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1529 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
1530 ; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
1531 ; AVX512-NEXT: retq
1532 %wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
1533 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
1534 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
1535 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
1536 %add1 = add <64 x i8> %v1, %v2
1537 %add2 = add <64 x i8> %v3, %add1
1538 ret <64 x i8> %add2
1539 }
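; Interleaved store, 4 x <64 x i8>, stride 4, into a <256 x i8> block at %p.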
1541 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, <256 x i8>* %p) {
1542 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
1543 ; AVX1: # %bb.0:
1544 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1545 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
1546 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12
1547 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1548 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1549 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
1550 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
1551 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1552 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1553 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1554 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
1555 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1556 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
1557 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
1558 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
1559 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
1560 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1561 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
1562 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1563 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
1564 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
1565 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
1566 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1567 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1568 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
1569 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
1570 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1571 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
1572 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5
1573 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1574 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
1575 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
1576 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
1577 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1578 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
1579 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6
1580 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
1581 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3],ymm2[2,3]
1582 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
1583 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
1584 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
1585 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
1586 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2
1587 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
1588 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3]
1589 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
1590 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
1591 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
1592 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1593 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1594 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7
1595 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1596 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3]
1597 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1598 ; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm6
1599 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4
1600 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5
1601 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3
1602 ; AVX1-NEXT: vmovaps %ymm3, 160(%rdi)
1603 ; AVX1-NEXT: vmovaps %ymm5, 128(%rdi)
1604 ; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
1605 ; AVX1-NEXT: vmovaps %ymm6, (%rdi)
1606 ; AVX1-NEXT: vmovaps %ymm2, 224(%rdi)
1607 ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
1608 ; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
1609 ; AVX1-NEXT: vmovaps %ymm10, 64(%rdi)
1610 ; AVX1-NEXT: vzeroupper
1611 ; AVX1-NEXT: retq
1612 ;
1613 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
1614 ; AVX2: # %bb.0:
1615 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1616 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
1617 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1618 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
1619 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1620 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
1621 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1622 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
1623 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
1624 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
1625 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
1626 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
1627 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1628 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1629 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1630 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1631 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4
1632 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5
1633 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
1634 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
1635 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7
1636 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9
1637 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
1638 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
1639 ; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi)
1640 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi)
1641 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
1642 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1643 ; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi)
1644 ; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi)
1645 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
1646 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1647 ; AVX2-NEXT: vzeroupper
1648 ; AVX2-NEXT: retq
1649 ;
1650 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
1651 ; AVX512: # %bb.0:
1652 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
1653 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
1654 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
1655 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
1656 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
1657 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
1658 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
1659 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
1660 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
1661 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
1662 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
1663 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
1664 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm3
1665 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1666 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8
1667 ; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm4
1668 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1669 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm9
1670 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
1671 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
1672 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1673 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1674 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm4
1675 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1676 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
1677 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
1678 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
1679 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
1680 ; AVX512-NEXT: vzeroupper
1681 ; AVX512-NEXT: retq
1682 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1683 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1684 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
1685 store <256 x i8> %interleaved, <256 x i8>* %p