1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
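
; Factor-4 interleaved load: a <16 x double> load is split into four stride-4
; strided vectors which are then summed.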
6 define <4 x double> @load_factorf64_4(ptr %ptr) nounwind {
; AVX1-LABEL: load_factorf64_4:
; AVX1: # %bb.0:
9 ; AVX1-NEXT: vmovupd (%rdi), %ymm0
10 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
11 ; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
12 ; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
13 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
14 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
15 ; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
16 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
17 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
18 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
19 ; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2
20 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
21 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: load_factorf64_4:
; AVX2OR512: # %bb.0:
26 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
27 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
28 ; AVX2OR512-NEXT: vmovupd 64(%rdi), %ymm2
29 ; AVX2OR512-NEXT: vmovupd 96(%rdi), %ymm3
30 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
31 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
32 ; AVX2OR512-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
33 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
34 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
35 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
36 ; AVX2OR512-NEXT: vaddpd %ymm2, %ymm4, %ymm2
37 ; AVX2OR512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
38 ; AVX2OR512-NEXT: vaddpd %ymm0, %ymm2, %ymm0
39 ; AVX2OR512-NEXT: retq
40 %wide.vec = load <16 x double>, ptr %ptr, align 16
41 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
42 %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
43 %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
44 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
45 %add1 = fadd <4 x double> %strided.v0, %strided.v1
46 %add2 = fadd <4 x double> %add1, %strided.v2
47 %add3 = fadd <4 x double> %add2, %strided.v3
  ret <4 x double> %add3
}
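
; Same factor-4 layout, but only the first and last strided vectors (factors 0
; and 3) are used; they are multiplied together.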
51 define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
; AVX1-LABEL: load_factorf64_2:
; AVX1: # %bb.0:
54 ; AVX1-NEXT: vmovupd (%rdi), %ymm0
55 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
56 ; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
57 ; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
58 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
59 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
60 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
61 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
62 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
63 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
64 ; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: load_factorf64_2:
; AVX2OR512: # %bb.0:
69 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
70 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
71 ; AVX2OR512-NEXT: vmovupd 64(%rdi), %ymm2
72 ; AVX2OR512-NEXT: vmovupd 96(%rdi), %ymm3
73 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
74 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
75 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
76 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
77 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
78 ; AVX2OR512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
79 ; AVX2OR512-NEXT: vmulpd %ymm0, %ymm4, %ymm0
80 ; AVX2OR512-NEXT: retq
81 %wide.vec = load <16 x double>, ptr %ptr, align 16
82 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
83 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}
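
; Degenerate case: both shuffles extract the same stride-4 lanes, so the
; result is an elementwise square of factor 0.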
88 define <4 x double> @load_factorf64_1(ptr %ptr) nounwind {
; AVX1-LABEL: load_factorf64_1:
; AVX1: # %bb.0:
91 ; AVX1-NEXT: vmovups (%rdi), %ymm0
92 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
93 ; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
94 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
95 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
96 ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: load_factorf64_1:
100 ; AVX2OR512: # %bb.0:
101 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
102 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
103 ; AVX2OR512-NEXT: vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1]
104 ; AVX2OR512-NEXT: vperm2f128 $32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1]
105 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
106 ; AVX2OR512-NEXT: vmulpd %ymm0, %ymm0, %ymm0
107 ; AVX2OR512-NEXT: retq
108 %wide.vec = load <16 x double>, ptr %ptr, align 16
109 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
110 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
111 %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}
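
; Integer (i64) variant of the factor-4 interleaved load; the four strided
; vectors are added together.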
115 define <4 x i64> @load_factori64_4(ptr %ptr) nounwind {
; AVX1-LABEL: load_factori64_4:
; AVX1: # %bb.0:
118 ; AVX1-NEXT: vmovups (%rdi), %ymm0
119 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
120 ; AVX1-NEXT: vmovups 64(%rdi), %ymm2
121 ; AVX1-NEXT: vmovups 96(%rdi), %ymm3
122 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
123 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
124 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
125 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
126 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
127 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
128 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
129 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
130 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1
131 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
132 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
133 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
134 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
135 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
136 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
137 ; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
138 ; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
139 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
140 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: load_factori64_4:
144 ; AVX2OR512: # %bb.0:
145 ; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
146 ; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1
147 ; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2
148 ; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3
149 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
150 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
151 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
152 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
153 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
154 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
155 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
156 ; AVX2OR512-NEXT: vpaddq %ymm4, %ymm2, %ymm2
157 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
158 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
159 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
160 ; AVX2OR512-NEXT: retq
161 %wide.vec = load <16 x i64>, ptr %ptr, align 16
162 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
163 %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
164 %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
165 %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
166 %add1 = add <4 x i64> %strided.v0, %strided.v1
167 %add2 = add <4 x i64> %add1, %strided.v2
  %add3 = add <4 x i64> %add2, %strided.v3
  ret <4 x i64> %add3
}
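
; Factor-4 interleaved store: four <4 x double> vectors are interleaved
; element by element into a single <16 x double> store.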
172 define void @store_factorf64_4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) nounwind {
; AVX1OR2-LABEL: store_factorf64_4:
; AVX1OR2: # %bb.0:
175 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
176 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
177 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
178 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
179 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
180 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
181 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
182 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
183 ; AVX1OR2-NEXT: vmovups %ymm0, 96(%rdi)
184 ; AVX1OR2-NEXT: vmovups %ymm3, 64(%rdi)
185 ; AVX1OR2-NEXT: vmovups %ymm4, 32(%rdi)
186 ; AVX1OR2-NEXT: vmovups %ymm2, (%rdi)
187 ; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: store_factorf64_4:
; AVX512: # %bb.0:
192 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
193 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
194 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
195 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
196 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
197 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
198 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
199 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
200 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
201 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
202 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
203 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
206 %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
207 %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
208 %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x double> %interleaved.vec, ptr %ptr, align 16
  ret void
}
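
; Integer (i64) variant of the factor-4 interleaved store.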
213 define void @store_factori64_4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) nounwind {
; AVX1OR2-LABEL: store_factori64_4:
; AVX1OR2: # %bb.0:
216 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
217 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
218 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
219 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
220 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
221 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
222 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
223 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
224 ; AVX1OR2-NEXT: vmovups %ymm0, 96(%rdi)
225 ; AVX1OR2-NEXT: vmovups %ymm3, 64(%rdi)
226 ; AVX1OR2-NEXT: vmovups %ymm4, 32(%rdi)
227 ; AVX1OR2-NEXT: vmovups %ymm2, (%rdi)
228 ; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: store_factori64_4:
; AVX512: # %bb.0:
233 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
234 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
235 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
236 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
237 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
238 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
239 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
240 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
241 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
242 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
243 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
244 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
247 %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
248 %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
249 %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i64> %interleaved.vec, ptr %ptr, align 16
  ret void
}
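
; Stride-4 interleaved store of four <32 x i8> vectors as one <128 x i8> store.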
255 define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
; AVX1: # %bb.0:
258 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
259 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
260 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
261 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
262 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
263 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
264 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
265 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
266 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
267 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
268 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
269 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
270 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
271 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
272 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
273 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
274 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
275 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
276 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
277 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
278 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm2
279 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
280 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm3
281 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
282 ; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
283 ; AVX1-NEXT: vmovaps %ymm3, 64(%rdi)
284 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
285 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
286 ; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
; AVX2: # %bb.0:
291 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
292 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
293 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
294 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
295 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
296 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
297 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
298 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
299 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
300 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
301 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
302 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
303 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
304 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
305 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
306 ; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
307 ; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
; AVX512: # %bb.0:
312 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
313 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
314 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
315 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
316 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
317 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
318 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
319 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
320 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
321 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
322 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
323 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
324 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
325 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
326 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
327 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
330 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
331 %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
332 %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
  store <128 x i8> %interleaved.vec, ptr %p
  ret void
}
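
; Stride-4 interleaved store of four <16 x i8> vectors as one <64 x i8> store.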
337 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, ptr %p) nounwind {
; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride4:
; AVX1OR2: # %bb.0:
340 ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
341 ; AVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
342 ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
343 ; AVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
344 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
345 ; AVX1OR2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
346 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
347 ; AVX1OR2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
348 ; AVX1OR2-NEXT: vmovdqa %xmm0, 48(%rdi)
349 ; AVX1OR2-NEXT: vmovdqa %xmm4, 32(%rdi)
350 ; AVX1OR2-NEXT: vmovdqa %xmm1, 16(%rdi)
351 ; AVX1OR2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
; AVX512: # %bb.0:
356 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
357 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
358 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
359 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
360 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
361 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
362 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
363 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
364 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
365 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
366 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
367 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
370 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
371 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
372 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
  store <64 x i8> %interleaved.vec, ptr %p
  ret void
}
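
; Stride-4 interleaved load at <8 x i8> width; the four strided vectors are
; combined with adds and a multiply.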
377 define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
; AVX1OR2-LABEL: interleaved_load_vf8_i8_stride4:
; AVX1OR2: # %bb.0:
380 ; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm0
381 ; AVX1OR2-NEXT: vmovdqa 16(%rdi), %xmm1
382 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u]
383 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
384 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm2
385 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
386 ; AVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
387 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
388 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
389 ; AVX1OR2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
390 ; AVX1OR2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
391 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u]
392 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm4
393 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm3
394 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
395 ; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1
396 ; AVX1OR2-NEXT: vpsrld $16, %xmm0, %xmm0
397 ; AVX1OR2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
398 ; AVX1OR2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
399 ; AVX1OR2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
400 ; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
; AVX512: # %bb.0:
405 ; AVX512-NEXT: vmovdqu (%rdi), %ymm0
406 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
407 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u]
408 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3
409 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2
410 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
411 ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
412 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm3
413 ; AVX512-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
414 ; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
415 ; AVX512-NEXT: vmovdqu (%rdi), %ymm2
416 ; AVX512-NEXT: vpmovdw %zmm2, %ymm2
417 ; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
418 ; AVX512-NEXT: vpmovwb %zmm0, %ymm0
419 ; AVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0
420 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
421 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
424 %wide.vec = load <32 x i8>, ptr %ptr, align 16
425 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
426 %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
427 %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
428 %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
430 %add1 = add <8 x i8> %v1, %v2
431 %add2 = add <8 x i8> %v4, %v3
  %add3 = mul <8 x i8> %add1, %add2
  ret <8 x i8> %add3
}
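
; Stride-4 interleaved load at <16 x i8> width; the strided vectors are
; compared pairwise and the <16 x i1> results compared again.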
436 define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1: # %bb.0:
439 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
440 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
441 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
442 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
443 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
444 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
445 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
446 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
447 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
448 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
449 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
450 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
451 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
452 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
453 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
454 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
455 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
456 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
457 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
458 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
459 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
460 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
461 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
462 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
463 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
464 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
465 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
466 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
467 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
468 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
469 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
470 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
471 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
472 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
473 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
474 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
475 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
476 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
477 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
478 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
479 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
480 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
481 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
482 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
; AVX2: # %bb.0:
487 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
488 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
489 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
490 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
491 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
492 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
493 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
494 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
495 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
496 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
497 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
498 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
499 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
500 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
501 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
502 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
503 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
504 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
505 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
506 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
507 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
508 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
509 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
510 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
511 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
512 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
513 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
514 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
515 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
516 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
517 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
518 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
519 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
520 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
521 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
522 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
523 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
524 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
525 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
526 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
527 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
528 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
529 ; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
530 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
; AVX512: # %bb.0:
535 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
536 ; AVX512-NEXT: vpmovdb %zmm0, %xmm1
537 ; AVX512-NEXT: vpsrld $8, %zmm0, %zmm2
538 ; AVX512-NEXT: vpmovdb %zmm2, %xmm2
539 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm3
540 ; AVX512-NEXT: vpmovdb %zmm3, %xmm3
541 ; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
542 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
543 ; AVX512-NEXT: vpcmpeqb %zmm2, %zmm1, %k0
544 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm3, %k1
545 ; AVX512-NEXT: kxnorw %k1, %k0, %k0
546 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
547 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
550 %wide.vec = load <64 x i8>, ptr %ptr
551 %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
552 %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
553 %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
554 %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
556 %cmp1 = icmp eq <16 x i8> %v1, %v2
557 %cmp2 = icmp eq <16 x i8> %v3, %v4
  %res = icmp eq <16 x i1> %cmp1, %cmp2
  ret <16 x i1> %res
}
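
; Stride-4 interleaved load at <32 x i8> width; same pairwise-compare pattern
; as the vf16 test above.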
563 define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
566 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
567 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
568 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
569 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
570 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
571 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
572 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
573 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
574 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
575 ; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
576 ; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
577 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
578 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm4[4,5,6,7]
579 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4
580 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
581 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5
582 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6
583 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
584 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6
585 ; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm11
586 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm7
587 ; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
588 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
589 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
590 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
591 ; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
592 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
593 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
594 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
595 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
596 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
597 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
598 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4,5,6,7]
599 ; AVX1-NEXT: vpcmpeqb %xmm8, %xmm9, %xmm8
600 ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm9
601 ; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm11
602 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
603 ; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm11
604 ; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
605 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
606 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
607 ; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
608 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
609 ; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
610 ; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
611 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
612 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
613 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
614 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
615 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
616 ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7]
617 ; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm13
618 ; AVX1-NEXT: vpshufb %xmm10, %xmm5, %xmm10
619 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
620 ; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm13
621 ; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
622 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
623 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
624 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
625 ; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
626 ; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
627 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
628 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
629 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
630 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
631 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
632 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
633 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm11, %xmm0
634 ; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm1
635 ; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm2
636 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
637 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2
638 ; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3
639 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
640 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
641 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm10, %xmm1
642 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm2
643 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
644 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
645 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
; AVX2: # %bb.0:
650 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
651 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
652 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2
653 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
654 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
655 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
656 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
657 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
658 ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
659 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
660 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
661 ; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
662 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9
663 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
664 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
665 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9
666 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
667 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
668 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7
669 ; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
670 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
671 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
672 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
673 ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
674 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
675 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
676 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
677 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
678 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11
679 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
680 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
681 ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
682 ; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
683 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
684 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
685 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
686 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
687 ; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
688 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
689 ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
690 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
691 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
692 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
693 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
694 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11
695 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
696 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
697 ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
698 ; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
699 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
700 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
701 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
702 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
703 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
704 ; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
705 ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
706 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
707 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
708 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
709 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
710 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
711 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
712 ; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
713 ; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
714 ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
715 ; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
716 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
717 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
718 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0
719 ; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0
720 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
725 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13]
726 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
727 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
728 ; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u]
729 ; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
730 ; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
731 ; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3
732 ; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u]
733 ; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
734 ; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4
735 ; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4
736 ; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u]
737 ; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
738 ; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
739 ; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5
740 ; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u]
741 ; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
742 ; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1
743 ; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
744 ; AVX512-NEXT: vpcmpeqb %zmm4, %zmm3, %k0
745 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm5, %k1
746 ; AVX512-NEXT: kxnord %k1, %k0, %k0
747 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
750 %wide.vec = load <128 x i8>, ptr %ptr
751 %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
753 %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
755 %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
757 %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
759 %cmp1 = icmp eq <32 x i8> %v1, %v2
760 %cmp2 = icmp eq <32 x i8> %v3, %v4
  %res = icmp eq <32 x i1> %cmp1, %cmp2
  ret <32 x i1> %res
}
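
; Stride-4 interleaved store of four <8 x i8> vectors as one <32 x i8> store.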
766 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, ptr %p) nounwind {
; AVX-LABEL: interleaved_store_vf8_i8_stride4:
; AVX: # %bb.0:
769 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
770 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
771 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
772 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
773 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
; AVX-NEXT: vmovdqa %xmm2, (%rdi)
; AVX-NEXT: retq
776 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
777 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
778 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
  store <32 x i8> %interleaved.vec, ptr %p
  ret void
}
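
; Stride-3 interleaved load of <96 x i8>: three <32 x i8> strided vectors are
; extracted and added.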
783 define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
; AVX1: # %bb.0:
786 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
787 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
788 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
789 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
790 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
791 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
792 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
793 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
794 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
795 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
796 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
797 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
798 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
799 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
800 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
801 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
802 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
803 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
804 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
805 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
806 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
807 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
808 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
809 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
810 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
811 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
812 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
813 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
814 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
815 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
816 ; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
817 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
818 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
819 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
820 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
821 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
822 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
823 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
827 ; AVX2OR512: # %bb.0:
828 ; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0
829 ; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1
830 ; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2
831 ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
832 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
833 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
834 ; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
835 ; AVX2OR512-NEXT: # ymm3 = mem[0,1,0,1]
836 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
837 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
838 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
839 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
840 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
841 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
842 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
843 ; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
844 ; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1]
845 ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
846 ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1
847 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
848 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
849 ; AVX2OR512-NEXT: vpaddb %ymm0, %ymm1, %ymm0
850 ; AVX2OR512-NEXT: retq
851 %wide.vec = load <96 x i8>, ptr %ptr
852 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
853 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
854 %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
855 %add1 = add <32 x i8> %v1, %v2
%add2 = add <32 x i8> %v3, %add1
ret <32 x i8> %add2
}

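; Stride-3 de-interleave of 48 contiguous bytes into three <16 x i8> slices that are then
; summed byte-wise. The checks below expect one shared vpshufb mask plus a chain of
; vpalignr rotations and a vpblendvb merge, identical across the AVX1/AVX2/AVX512 runs.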
860 define <16 x i8> @interleaved_load_vf16_i8_stride3(ptr %ptr){
; AVX-LABEL: interleaved_load_vf16_i8_stride3:
; AVX: # %bb.0:
863 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
864 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
865 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
866 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
867 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
868 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
869 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
870 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
871 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
872 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
873 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
874 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
875 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
876 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
877 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
878 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
881 %wide.vec = load <48 x i8>, ptr %ptr
%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
883 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
884 %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
885 %add1 = add <16 x i8> %v1, %v2
%add2 = add <16 x i8> %v3, %add1
ret <16 x i8> %add2
}

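; The <8 x i8> stride-3 load only touches 24 bytes (two xmm loads). The checks below expect
; each de-interleaved slice to be formed by a vpshufb/vpor pair rather than the vpalignr
; rotation sequence used for the wider variants.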
890 define <8 x i8> @interleaved_load_vf8_i8_stride3(ptr %ptr){
; AVX-LABEL: interleaved_load_vf8_i8_stride3:
; AVX: # %bb.0:
893 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
894 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
895 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
896 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
897 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
898 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
899 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
900 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
901 ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
902 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
903 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
904 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
907 %wide.vec = load <24 x i8>, ptr %ptr
908 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
909 %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
910 %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
911 %add1 = add <8 x i8> %v1, %v2
%add2 = add <8 x i8> %v3, %add1
ret <8 x i8> %add2
}

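; Stride-3 interleave of three <8 x i8> vectors into a 24-byte store: the checks below
; expect the low 16 result bytes to be assembled with vpshufb/vpor and written with
; vmovdqu, with the remaining 8 bytes written via vmovq.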
916 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, ptr %p) nounwind {
; AVX-LABEL: interleaved_store_vf8_i8_stride3:
; AVX: # %bb.0:
919 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
920 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
921 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
922 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
923 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
924 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
925 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
926 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
; AVX-NEXT: vmovdqu %xmm1, (%rdi)
; AVX-NEXT: retq
929 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
930 %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
931 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i8> %interleaved.vec, ptr %p, align 1
ret void
}

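; Stride-3 interleave of three <16 x i8> vectors into 48 bytes. AVX1/AVX2 build three
; 16-byte chunks with vpalignr and a final vpshufb and store them individually; AVX512
; packs two of the chunks together so the first 32 bytes go out as a single ymm store.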
936 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ptr %p) nounwind {
; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
; AVX1OR2: # %bb.0:
939 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
940 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
941 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
942 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
943 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
944 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
945 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
946 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
947 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
948 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
949 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
950 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
951 ; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi)
952 ; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi)
; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi)
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
; AVX512: # %bb.0:
958 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
959 ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
960 ; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
961 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
962 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
963 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
964 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
965 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
966 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
967 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
968 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
969 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
970 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
971 ; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
972 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
975 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
976 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
977 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
store <48 x i8> %interleaved.vec, ptr %p, align 1
ret void
}

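; Stride-3 interleave of three <32 x i8> vectors into 96 bytes: AVX1 operates on six xmm
; halves, AVX2 on ymm registers with a broadcast shuffle mask and three 32-byte stores,
; and AVX512 does the final shuffle partly at zmm width (vmovdqu64 plus a 32-byte tail).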
982 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
; AVX1: # %bb.0:
985 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
986 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
987 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
988 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
989 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
990 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
991 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
992 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
993 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
994 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
995 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
996 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
997 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
998 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
999 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1000 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1001 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1002 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1003 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1004 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1005 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
1006 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1007 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1008 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
1009 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1010 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1011 ; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi)
1012 ; AVX1-NEXT: vmovdqu %xmm3, 64(%rdi)
1013 ; AVX1-NEXT: vmovdqu %xmm4, 48(%rdi)
1014 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
1015 ; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
1016 ; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
; AVX2: # %bb.0:
1022 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1023 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1024 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1025 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1026 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1027 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1028 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1029 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1030 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1031 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1032 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1033 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1034 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1035 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1036 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1037 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1038 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1039 ; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi)
1040 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
; AVX512: # %bb.0:
1046 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1047 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1048 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1049 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1050 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1051 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1052 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1053 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1054 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1055 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1056 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1057 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1058 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1059 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1060 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1061 ; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
1062 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
1063 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
1066 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1067 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1068 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
store <96 x i8> %interleaved.vec, ptr %p, align 1
ret void
}

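; Stride-3 interleave of three <64 x i8> vectors into 192 bytes. The AVX1 lowering below
; spills xmm temporaries to the stack and issues twelve 16-byte stores; AVX2 uses six
; 32-byte stores and AVX512 three 64-byte vmovdqu64 stores.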
1073 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
; AVX1: # %bb.0:
1076 ; AVX1-NEXT: subq $24, %rsp
1077 ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1078 ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1079 ; AVX1-NEXT: vmovdqa %ymm2, %ymm4
1080 ; AVX1-NEXT: vmovdqa %ymm0, %ymm2
1081 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
1082 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
1083 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10]
1084 ; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6
1085 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128]
1086 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
1087 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7
1088 ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0
1089 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1090 ; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7
1091 ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8
1092 ; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0
1093 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1094 ; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8
1095 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15
1096 ; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10
1097 ; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0
1098 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1099 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
1100 ; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10
1101 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5
1102 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12
1103 ; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
1104 ; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1
1105 ; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2
1106 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
1107 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1108 ; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
1109 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1110 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15]
1111 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1112 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1113 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9
1114 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
1115 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8
1116 ; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
1117 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
1118 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7
1119 ; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
1120 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
1121 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
1122 ; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
1123 ; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13
1124 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
1125 ; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4]
1126 ; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11
1127 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1128 ; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4]
1129 ; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11
1130 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1131 ; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4]
1132 ; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15
1133 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1134 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
1135 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1136 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
1137 ; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1138 ; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1139 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
1140 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1141 ; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8
1142 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128]
1143 ; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12
1144 ; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8
1145 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1146 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12
1147 ; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9
1148 ; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9
1149 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1150 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12
1151 ; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14
1152 ; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12
1153 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1154 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7
1155 ; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
1156 ; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
1157 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1158 ; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4
1159 ; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
1160 ; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10
1161 ; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6
1162 ; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
1163 ; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5
1164 ; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11
1165 ; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
1166 ; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi)
1167 ; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi)
1168 ; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi)
1169 ; AVX1-NEXT: vmovdqu %xmm4, (%rdi)
1170 ; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi)
1171 ; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi)
1172 ; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi)
1173 ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi)
1174 ; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi)
1175 ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi)
1176 ; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi)
1177 ; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi)
1178 ; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
; AVX2: # %bb.0:
1184 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1185 ; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20]
1186 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1187 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1188 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7
1189 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1190 ; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20]
1191 ; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10
1192 ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26]
1193 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0]
1194 ; AVX2-NEXT: # ymm11 = mem[0,1,0,1]
1195 ; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0
1196 ; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26]
1197 ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1
1198 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1199 ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm2, %ymm12
1200 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1201 ; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm3, %ymm11
1202 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1203 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1204 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1205 ; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25]
1206 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1
1207 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1208 ; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25]
1209 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
1210 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
1211 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
1212 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
1213 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1214 ; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
1215 ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1216 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1217 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
1218 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
1219 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
1220 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5
1221 ; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
1222 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
1223 ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
1224 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
1225 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1226 ; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi)
1227 ; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
1228 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
1229 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1230 ; AVX2-NEXT: vmovdqu %ymm5, 96(%rdi)
1231 ; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
; AVX512: # %bb.0:
1237 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
1238 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
1239 ; AVX512-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1240 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1241 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
1242 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1243 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1244 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
1245 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1246 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1247 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
1248 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1249 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1250 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
1251 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1252 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1253 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1254 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
1255 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1256 ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1257 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1258 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4
1259 ; AVX512-NEXT: vpshufb %zmm3, %zmm4, %zmm4
1260 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1261 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1262 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
1263 ; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi)
1264 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
1267 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1268 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1269 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
store <192 x i8> %3, ptr %p, align 1
ret void
}

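; Stride-3 de-interleave of 192 loaded bytes into three <64 x i8> slices that are summed.
; AVX1 rebuilds the result from xmm pieces with vpshufb/vpor, AVX2 blends ymm pieces with
; vpblendvb, and AVX512 shuffles at zmm width and blends through a kmovq-driven vpblendmb.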
1274 define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
; AVX1: # %bb.0:
1277 ; AVX1-NEXT: vmovdqu (%rdi), %xmm11
1278 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
1279 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13
1280 ; AVX1-NEXT: vmovups 64(%rdi), %xmm0
1281 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1282 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4
1283 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5
1284 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2
1285 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10
1286 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3
1287 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1288 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
1289 ; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6
1290 ; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7
1291 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8
1292 ; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9
1293 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
1294 ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5
1295 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
1296 ; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12
1297 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1298 ; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0
1299 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1300 ; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10
1301 ; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12
1302 ; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0
1303 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1304 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11
1305 ; AVX1-NEXT: vmovdqa %xmm1, %xmm0
1306 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1307 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12
1308 ; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1
1309 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1310 ; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11
1311 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1312 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13
1313 ; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
1314 ; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13
1315 ; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5
1316 ; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5
1317 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1
1318 ; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13
1319 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10
1320 ; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
1321 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13
1322 ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0
1323 ; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12
1324 ; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3
1325 ; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12
1326 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14
1327 ; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15
1328 ; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15
1329 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
1330 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12
1331 ; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12
1332 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
1333 ; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15
1334 ; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15
1335 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1336 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2
1337 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
1338 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
1339 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
1340 ; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0
1341 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1342 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
1343 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10
1344 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
1345 ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
1346 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
1347 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1348 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
1349 ; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1
1350 ; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1
1351 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1352 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
1353 ; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13
1354 ; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
1355 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1356 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9
1357 ; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11
1358 ; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9
1359 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
1360 ; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
1361 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
1362 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1363 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1364 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1365 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
1366 ; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
1367 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
1368 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1369 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1370 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
1371 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
1372 ; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1
1373 ; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2
1374 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1375 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1376 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
1377 ; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2
1378 ; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2
1379 ; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3
1380 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1381 ; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4
1382 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
1383 ; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3
1384 ; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3
1385 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
; AVX2: # %bb.0:
1391 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1392 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1393 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1394 ; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3
1395 ; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4
1396 ; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5
1397 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6
1398 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0
1399 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1400 ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1401 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm1
1402 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4
1403 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
1404 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
1405 ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7
1406 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
1407 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1408 ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7
1409 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5
1410 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
1411 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255]
1412 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1413 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3
1414 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
1415 ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm9
1416 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
1417 ; AVX2-NEXT: # ymm10 = mem[0,1,0,1]
1418 ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9
1419 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm8
1420 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8
1421 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26]
1422 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26]
1423 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128]
1424 ; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm6
1425 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30]
1426 ; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm2
1427 ; AVX2-NEXT: vpor %ymm2, %ymm6, %ymm2
1428 ; AVX2-NEXT: vpaddb %ymm2, %ymm9, %ymm2
1429 ; AVX2-NEXT: vpshufb %ymm10, %ymm3, %ymm3
1430 ; AVX2-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1431 ; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
1432 ; AVX2-NEXT: vpaddb %ymm3, %ymm8, %ymm3
1433 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1434 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
1435 ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1436 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
1437 ; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
1438 ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
1439 ; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0
1440 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1441 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
1442 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
; AVX512: # %bb.0:
1448 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1449 ; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
1450 ; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2
1451 ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3
1452 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4
1453 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
1454 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1455 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1456 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1457 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
1458 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1459 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1460 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
1461 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1462 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1463 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1464 ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1465 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1466 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
1467 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1468 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
1469 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1470 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1471 ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
1472 ; AVX512-NEXT: kmovq %rax, %k1
1473 ; AVX512-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
1474 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1475 ; AVX512-NEXT: vpaddb %zmm2, %zmm1, %zmm1
1476 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
1477 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
1480 %wide.vec = load <192 x i8>, ptr %ptr, align 1
1481 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
1482 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
1483 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
1484 %add1 = add <64 x i8> %v1, %v2
%add2 = add <64 x i8> %v3, %add1
ret <64 x i8> %add2
}

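; Stride-4 interleave: four <64 x i8> vectors are transposed with byte unpacks
; (vpunpcklbw/vpunpckhbw) followed by word unpacks and 128-bit inserts, producing
; 256 contiguous bytes of output.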
1489 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
; AVX1: # %bb.0:
1492 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1493 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
1494 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12
1495 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1496 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1497 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
1498 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
1499 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1500 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1501 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1502 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
1503 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1504 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
1505 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
1506 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
1507 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13
1508 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
1509 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
1510 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1511 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
1512 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13
1513 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
1514 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1515 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
1516 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
1517 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
1518 ; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1519 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1520 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1521 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
1522 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
1523 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
1524 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
1525 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1526 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
1527 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
1528 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1529 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
1530 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
1531 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
1532 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1533 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
1534 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1535 ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8
1536 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6
1537 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
1538 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
1539 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4
1540 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
1541 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1542 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
1543 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3
1544 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
1545 ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
1546 ; AVX1-NEXT: vmovaps %ymm1, 160(%rdi)
1547 ; AVX1-NEXT: vmovaps %ymm4, 128(%rdi)
1548 ; AVX1-NEXT: vmovaps %ymm2, 96(%rdi)
1549 ; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
1550 ; AVX1-NEXT: vmovaps %ymm6, 32(%rdi)
1551 ; AVX1-NEXT: vmovaps %ymm8, (%rdi)
1552 ; AVX1-NEXT: vzeroupper
1553 ; AVX1-NEXT: retq
1554 ;
1555 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
1556 ; AVX2: # %bb.0:
1557 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1558 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
1559 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1560 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
1561 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1562 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
1563 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1564 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
1565 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
1566 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
1567 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
1568 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
1569 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1570 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1571 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1572 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1573 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4
1574 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5
1575 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
1576 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
1577 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7
1578 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9
1579 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
1580 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
1581 ; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi)
1582 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi)
1583 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
1584 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1585 ; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi)
1586 ; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi)
1587 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
1588 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1589 ; AVX2-NEXT: vzeroupper
1590 ; AVX2-NEXT: retq
1591 ;
1592 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
1593 ; AVX512: # %bb.0:
1594 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
1595 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
1596 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
1597 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
1598 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
1599 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
1600 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
1601 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
1602 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
1603 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
1604 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm6
1605 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm7
1606 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
1607 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm8
1608 ; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm9
1609 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
1610 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1611 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4
1612 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3
1613 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7]
1614 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4
1615 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7]
1616 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7]
1617 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
1618 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
1619 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
1620 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
1621 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
1622 ; AVX512-NEXT: vzeroupper
1623 ; AVX512-NEXT: retq
1624 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1625 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1626 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
1627 store <256 x i8> %interleaved, ptr %p
1628 ret void
1629 }
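1630 ; Splat each double of a <4 x double> load twice and store the interleaved <8 x double> result.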
1631 define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind {
1632 ; AVX1-LABEL: splat2_v4f64_load_store:
1633 ; AVX1: # %bb.0:
1634 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1635 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1636 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1637 ; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1638 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1639 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1640 ; AVX1-NEXT: vzeroupper
1641 ; AVX1-NEXT: retq
1642 ;
1643 ; AVX2-LABEL: splat2_v4f64_load_store:
1644 ; AVX2: # %bb.0:
1645 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1646 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1647 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1648 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1649 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1650 ; AVX2-NEXT: vzeroupper
1651 ; AVX2-NEXT: retq
1652 ;
1653 ; AVX512-LABEL: splat2_v4f64_load_store:
1654 ; AVX512: # %bb.0:
1655 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1656 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1657 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1658 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1659 ; AVX512-NEXT: vzeroupper
1660 ; AVX512-NEXT: retq
1661 %x = load <4 x double>, ptr %s, align 8
1662 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1663 %r = shufflevector <8 x double> %x2, <8 x double> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1664 store <8 x double> %r, ptr %d, align 8
1665 ret void
1666 }
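1667 ; Same splat-by-2 pattern as above, but with <4 x i64> elements.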
1668 define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind {
1669 ; AVX1-LABEL: splat2_v4i64_load_store:
1670 ; AVX1: # %bb.0:
1671 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1672 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1673 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1674 ; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1675 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1676 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1677 ; AVX1-NEXT: vzeroupper
1678 ; AVX1-NEXT: retq
1679 ;
1680 ; AVX2-LABEL: splat2_v4i64_load_store:
1681 ; AVX2: # %bb.0:
1682 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1683 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1684 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1685 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1686 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1687 ; AVX2-NEXT: vzeroupper
1688 ; AVX2-NEXT: retq
1689 ;
1690 ; AVX512-LABEL: splat2_v4i64_load_store:
1691 ; AVX512: # %bb.0:
1692 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1693 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1694 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1695 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1696 ; AVX512-NEXT: vzeroupper
1697 ; AVX512-NEXT: retq
1698 %x = load <4 x i64>, ptr %s, align 8
1699 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1700 %r = shufflevector <8 x i64> %x2, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1701 store <8 x i64> %r, ptr %d, align 8
1702 ret void
1703 }
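1704 ; Splat each float of an <8 x float> load four times and store the interleaved <32 x float> result.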
1705 define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind {
1706 ; AVX1-LABEL: splat4_v8f32_load_store:
1707 ; AVX1: # %bb.0:
1708 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
1709 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
1710 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
1711 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
1712 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
1713 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
1714 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
1715 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
1716 ; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
1717 ; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
1718 ; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
1719 ; AVX1-NEXT: vmovups %xmm4, (%rsi)
1720 ; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
1721 ; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
1722 ; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
1723 ; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
1724 ; AVX1-NEXT: retq
1725 ;
1726 ; AVX2-LABEL: splat4_v8f32_load_store:
1727 ; AVX2: # %bb.0:
1728 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1729 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1730 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1731 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1732 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1733 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1734 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1735 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1736 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1737 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1738 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1739 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1740 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1741 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1742 ; AVX2-NEXT: vzeroupper
1743 ; AVX2-NEXT: retq
1744 ;
1745 ; AVX512-LABEL: splat4_v8f32_load_store:
1746 ; AVX512: # %bb.0:
1747 ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1748 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1749 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1750 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1751 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1752 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1753 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1754 ; AVX512-NEXT: vzeroupper
1755 ; AVX512-NEXT: retq
1756 %x = load <8 x float>, ptr %s, align 4
1757 %x2 = shufflevector <8 x float> %x, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1758 %x4 = shufflevector <16 x float> %x2, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1759 %r = shufflevector <32 x float> %x4, <32 x float> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1760 store <32 x float> %r, ptr %d, align 4
1761 ret void
1762 }
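1763 ; Same splat-by-4 pattern as above, but with <8 x i32> elements.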
1764 define void @splat4_v8i32_load_store(ptr %s, ptr %d) nounwind {
1765 ; AVX1-LABEL: splat4_v8i32_load_store:
1766 ; AVX1: # %bb.0:
1767 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
1768 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
1769 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1770 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1
1771 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2
1772 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1773 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2
1774 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3
1775 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1776 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3
1777 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4
1778 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1779 ; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
1780 ; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
1781 ; AVX1-NEXT: vmovups %ymm1, 32(%rsi)
1782 ; AVX1-NEXT: vmovups %ymm0, (%rsi)
1783 ; AVX1-NEXT: vzeroupper
1784 ; AVX1-NEXT: retq
1785 ;
1786 ; AVX2-LABEL: splat4_v8i32_load_store:
1787 ; AVX2: # %bb.0:
1788 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1789 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1790 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1791 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1792 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1793 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1794 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1795 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1796 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1797 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1798 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1799 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1800 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1801 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1802 ; AVX2-NEXT: vzeroupper
1803 ; AVX2-NEXT: retq
1804 ;
1805 ; AVX512-LABEL: splat4_v8i32_load_store:
1806 ; AVX512: # %bb.0:
1807 ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1808 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1809 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1810 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1811 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1812 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1813 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1814 ; AVX512-NEXT: vzeroupper
1815 ; AVX512-NEXT: retq
1816 %x = load <8 x i32>, ptr %s, align 4
1817 %x2 = shufflevector <8 x i32> %x, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1818 %x4 = shufflevector <16 x i32> %x2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1819 %r = shufflevector <32 x i32> %x4, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1820 store <32 x i32> %r, ptr %d, align 4
1821 ret void
1822 }
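1823 ; Splat each double of a <4 x double> load four times and store the interleaved <16 x double> result.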
1824 define void @splat4_v4f64_load_store(ptr %s, ptr %d) nounwind {
1825 ; AVX1OR2-LABEL: splat4_v4f64_load_store:
1826 ; AVX1OR2: # %bb.0:
1827 ; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0
1828 ; AVX1OR2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1829 ; AVX1OR2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1830 ; AVX1OR2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1831 ; AVX1OR2-NEXT: vmovups %ymm3, 96(%rsi)
1832 ; AVX1OR2-NEXT: vmovups %ymm1, 64(%rsi)
1833 ; AVX1OR2-NEXT: vmovups %ymm2, 32(%rsi)
1834 ; AVX1OR2-NEXT: vmovups %ymm0, (%rsi)
1835 ; AVX1OR2-NEXT: vzeroupper
1836 ; AVX1OR2-NEXT: retq
1837 ;
1838 ; AVX512-LABEL: splat4_v4f64_load_store:
1839 ; AVX512: # %bb.0:
1840 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1841 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1842 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1843 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1844 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1845 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1846 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1847 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1848 ; AVX512-NEXT: vzeroupper
1849 ; AVX512-NEXT: retq
1850 %x = load <4 x double>, ptr %s, align 8
1851 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1852 %x4 = shufflevector <8 x double> %x2, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1853 %r = shufflevector <16 x double> %x4, <16 x double> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1854 store <16 x double> %r, ptr %d, align 8
1855 ret void
1856 }
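1857 ; Same splat-by-4 pattern as above, but with <4 x i64> elements.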
1858 define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
1859 ; AVX1OR2-LABEL: splat4_v4i64_load_store:
1860 ; AVX1OR2: # %bb.0:
1861 ; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0
1862 ; AVX1OR2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1863 ; AVX1OR2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1864 ; AVX1OR2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1865 ; AVX1OR2-NEXT: vmovups %ymm3, 96(%rsi)
1866 ; AVX1OR2-NEXT: vmovups %ymm1, 64(%rsi)
1867 ; AVX1OR2-NEXT: vmovups %ymm2, 32(%rsi)
1868 ; AVX1OR2-NEXT: vmovups %ymm0, (%rsi)
1869 ; AVX1OR2-NEXT: vzeroupper
1870 ; AVX1OR2-NEXT: retq
1871 ;
1872 ; AVX512-LABEL: splat4_v4i64_load_store:
1873 ; AVX512: # %bb.0:
1874 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1875 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1876 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1877 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1878 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1879 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1880 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1881 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1882 ; AVX512-NEXT: vzeroupper
1883 ; AVX512-NEXT: retq
1884 %x = load <4 x i64>, ptr %s, align 8
1885 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1886 %x4 = shufflevector <8 x i64> %x2, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1887 %r = shufflevector <16 x i64> %x4, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1888 store <16 x i64> %r, ptr %d, align 8
1889 ret void
1890 }
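1891 ; PR37616: extract elements 2 and 6 of a 128-byte-aligned <16 x i64> load as a <2 x i64>.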
1892 define <2 x i64> @PR37616(ptr %a0) nounwind {
1893 ; AVX1-LABEL: PR37616:
1894 ; AVX1: # %bb.0:
1895 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
1896 ; AVX1-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
1897 ; AVX1-NEXT: retq
1898 ;
1899 ; AVX2OR512-LABEL: PR37616:
1900 ; AVX2OR512: # %bb.0:
1901 ; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
1902 ; AVX2OR512-NEXT: vunpcklpd 32(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
1903 ; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
1904 ; AVX2OR512-NEXT: vzeroupper
1905 ; AVX2OR512-NEXT: retq
1906 %load = load <16 x i64>, ptr %a0, align 128
1907 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
1908 ret <2 x i64> %shuffle