1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
; load_factorf64_4: one <16 x double> load (align 16) is split by four
; shufflevectors into the stride-4 sub-vectors {0,4,8,12}, {1,5,9,13},
; {2,6,10,14} and {3,7,11,15}. The function returns the element-wise fadd of
; all four sub-vectors, so every member of the stride-4 group is used.
6 define <4 x double> @load_factorf64_4(ptr %ptr) nounwind {
7 ; AVX1-LABEL: load_factorf64_4:
9 ; AVX1-NEXT: vmovupd (%rdi), %ymm0
10 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
11 ; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
12 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3
13 ; AVX1-NEXT: vhaddpd %ymm3, %ymm2, %ymm2
14 ; AVX1-NEXT: vperm2f128 $49, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
15 ; AVX1-NEXT: vperm2f128 $49, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[2,3]
16 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17 ; AVX1-NEXT: vaddpd %ymm3, %ymm2, %ymm2
18 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
19 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
22 ; AVX2OR512-LABEL: load_factorf64_4:
24 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
25 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
26 ; AVX2OR512-NEXT: vmovupd 64(%rdi), %ymm2
27 ; AVX2OR512-NEXT: vmovupd 96(%rdi), %ymm3
28 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
29 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
30 ; AVX2OR512-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
31 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
32 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
33 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
34 ; AVX2OR512-NEXT: vaddpd %ymm2, %ymm4, %ymm2
35 ; AVX2OR512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
36 ; AVX2OR512-NEXT: vaddpd %ymm0, %ymm2, %ymm0
37 ; AVX2OR512-NEXT: retq
; IR under test: a single wide load feeds all four strided shuffles below.
38 %wide.vec = load <16 x double>, ptr %ptr, align 16
39 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
40 %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
41 %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
42 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; Chain of fadds: ((v0 + v1) + v2) + v3.
43 %add1 = fadd <4 x double> %strided.v0, %strided.v1
44 %add2 = fadd <4 x double> %add1, %strided.v2
45 %add3 = fadd <4 x double> %add2, %strided.v3
46 ret <4 x double> %add3
; load_factorf64_2: same <16 x double> wide load (align 16) as the factor-4
; case, but only two of the four stride-4 sub-vectors are extracted: lanes
; {0,4,8,12} and lanes {3,7,11,15}. They are combined with an element-wise
; fmul.
49 define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
50 ; AVX1-LABEL: load_factorf64_2:
52 ; AVX1-NEXT: vmovupd (%rdi), %ymm0
53 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
54 ; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
55 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3
56 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
57 ; AVX1-NEXT: vperm2f128 $49, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
58 ; AVX1-NEXT: vperm2f128 $49, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[2,3]
59 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
60 ; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0
63 ; AVX2OR512-LABEL: load_factorf64_2:
65 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
66 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
67 ; AVX2OR512-NEXT: vmovupd 64(%rdi), %ymm2
68 ; AVX2OR512-NEXT: vmovupd 96(%rdi), %ymm3
69 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
70 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
71 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
72 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
73 ; AVX2OR512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
74 ; AVX2OR512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
75 ; AVX2OR512-NEXT: vmulpd %ymm0, %ymm4, %ymm0
76 ; AVX2OR512-NEXT: retq
; IR under test: only the first and the last member of the stride-4 group
; are read from the wide vector.
77 %wide.vec = load <16 x double>, ptr %ptr, align 16
78 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
79 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
80 %mul = fmul <4 x double> %strided.v0, %strided.v3
; load_factorf64_1: both shufflevectors use the identical mask {0,4,8,12}, so
; only one stride-4 sub-vector of the <16 x double> load (align 16) is ever
; needed. The function returns that sub-vector multiplied by itself; the
; expected code accordingly ends in a vmulpd whose two sources are the same
; register.
84 define <4 x double> @load_factorf64_1(ptr %ptr) nounwind {
85 ; AVX1-LABEL: load_factorf64_1:
87 ; AVX1-NEXT: vmovups (%rdi), %ymm0
88 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
89 ; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
90 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
91 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
92 ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
95 ; AVX2OR512-LABEL: load_factorf64_1:
97 ; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0
98 ; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1
99 ; AVX2OR512-NEXT: vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1]
100 ; AVX2OR512-NEXT: vperm2f128 $32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1]
101 ; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
102 ; AVX2OR512-NEXT: vmulpd %ymm0, %ymm0, %ymm0
103 ; AVX2OR512-NEXT: retq
; IR under test: %strided.v3 deliberately repeats the mask of %strided.v0
; (the expected code above squares a single register), despite its ".v3" name.
104 %wide.vec = load <16 x double>, ptr %ptr, align 16
105 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
106 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
107 %mul = fmul <4 x double> %strided.v0, %strided.v3
108 ret <4 x double> %mul
; load_factori64_4: integer counterpart of the factor-4 double test. One
; <16 x i64> load (align 16) is split into the stride-4 sub-vectors
; {0,4,8,12}, {1,5,9,13}, {2,6,10,14} and {3,7,11,15}, which are summed with
; a chain of integer adds. In the expected AVX-only code the 64-bit adds are
; done on xmm halves (vextractf128 / vpaddq / vinsertf128), while the other
; configurations use ymm-wide vpaddq.
111 define <4 x i64> @load_factori64_4(ptr %ptr) nounwind {
112 ; AVX1-LABEL: load_factori64_4:
114 ; AVX1-NEXT: vmovups (%rdi), %ymm0
115 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
116 ; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
117 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3
118 ; AVX1-NEXT: vperm2f128 $49, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
119 ; AVX1-NEXT: vperm2f128 $49, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[2,3]
120 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
121 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
122 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
123 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
124 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
125 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
126 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
127 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
128 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
129 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
130 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
131 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
132 ; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
133 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
134 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
137 ; AVX2OR512-LABEL: load_factori64_4:
138 ; AVX2OR512: # %bb.0:
139 ; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
140 ; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1
141 ; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2
142 ; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3
143 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
144 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
145 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
146 ; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
147 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
148 ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
149 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
150 ; AVX2OR512-NEXT: vpaddq %ymm4, %ymm2, %ymm2
151 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
152 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
153 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
154 ; AVX2OR512-NEXT: retq
; IR under test: a single wide load feeds all four strided shuffles below.
155 %wide.vec = load <16 x i64>, ptr %ptr, align 16
156 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
157 %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
158 %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
159 %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; Chain of adds: ((v0 + v1) + v2) + v3.
160 %add1 = add <4 x i64> %strided.v0, %strided.v1
161 %add2 = add <4 x i64> %add1, %strided.v2
162 %add3 = add <4 x i64> %add2, %strided.v3
; store_factorf64_4: stride-4 interleaving store of four <4 x double> inputs.
; %v0/%v1 and %v2/%v3 are first concatenated into two <8 x double> values;
; the final shuffle mask (0,4,8,12, 1,5,9,13, ...) then places element i of
; %v0, %v1, %v2, %v3 at result lanes 4*i .. 4*i+3. The <16 x double> result
; is stored to %ptr with align 16. The expected code uses four ymm stores
; without AVX-512 and two zmm stores with it.
166 define void @store_factorf64_4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) nounwind {
167 ; AVX1OR2-LABEL: store_factorf64_4:
169 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
170 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
171 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
172 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
173 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
174 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
175 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
176 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
177 ; AVX1OR2-NEXT: vmovups %ymm0, 96(%rdi)
178 ; AVX1OR2-NEXT: vmovups %ymm3, 64(%rdi)
179 ; AVX1OR2-NEXT: vmovups %ymm4, 32(%rdi)
180 ; AVX1OR2-NEXT: vmovups %ymm2, (%rdi)
181 ; AVX1OR2-NEXT: vzeroupper
184 ; AVX512-LABEL: store_factorf64_4:
186 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
187 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
188 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
189 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
190 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
191 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
192 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
193 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
194 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
195 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
196 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
197 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
198 ; AVX512-NEXT: vzeroupper
; IR under test: %s0 = concat(%v0, %v1), %s1 = concat(%v2, %v3); the last
; shuffle interleaves the four original vectors element by element.
200 %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
201 %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
202 %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
203 store <16 x double> %interleaved.vec, ptr %ptr, align 16
; store_factori64_4: integer counterpart of the stride-4 double store test.
; Four <4 x i64> inputs are concatenated pairwise and then interleaved so
; that element i of %v0, %v1, %v2, %v3 lands at result lanes 4*i .. 4*i+3;
; the <16 x i64> result is stored to %ptr with align 16. The expected
; instruction sequences use the same floating-point-domain shuffles and
; stores (vinsertf128, vperm2f128, vunpcklpd/vunpckhpd, vmovups) as the
; double variant.
207 define void @store_factori64_4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) nounwind {
208 ; AVX1OR2-LABEL: store_factori64_4:
210 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
211 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
212 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
213 ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
214 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
215 ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
216 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
217 ; AVX1OR2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
218 ; AVX1OR2-NEXT: vmovups %ymm0, 96(%rdi)
219 ; AVX1OR2-NEXT: vmovups %ymm3, 64(%rdi)
220 ; AVX1OR2-NEXT: vmovups %ymm4, 32(%rdi)
221 ; AVX1OR2-NEXT: vmovups %ymm2, (%rdi)
222 ; AVX1OR2-NEXT: vzeroupper
225 ; AVX512-LABEL: store_factori64_4:
227 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
228 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
229 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
230 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
231 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
232 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
233 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
234 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
235 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
236 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
237 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
238 ; AVX512-NEXT: vmovups %zmm1, (%rdi)
239 ; AVX512-NEXT: vzeroupper
; IR under test: %s0 = concat(%v0, %v1), %s1 = concat(%v2, %v3); the last
; shuffle interleaves the four original vectors element by element.
241 %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
242 %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
243 %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
244 store <16 x i64> %interleaved.vec, ptr %ptr, align 16
; interleaved_store_vf32_i8_stride4: stride-4 interleaving store of four
; <32 x i8> inputs. %x1/%x2 and %x3/%x4 are concatenated into two <64 x i8>
; values; the final mask (0,32,64,96, 1,33,65,97, ...) places byte i of
; %x1, %x2, %x3, %x4 at result lanes 4*i .. 4*i+3. The <128 x i8> result is
; stored to %p; the store carries no explicit align operand. Note that the
; pointer is the last parameter here, unlike the factor-4 f64/i64 store tests.
; The expected code builds the interleave from byte unpacks followed by word
; unpacks (vpunpck{l,h}bw then vpunpck{l,h}wd).
249 define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, ptr %p) nounwind {
250 ; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
252 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
253 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
254 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
255 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
256 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
257 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
258 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
259 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
260 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
261 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
262 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
263 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
264 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
265 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
266 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
267 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
268 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
269 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
270 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
271 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
272 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm2
273 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
274 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm3
275 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
276 ; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
277 ; AVX1-NEXT: vmovaps %ymm3, 64(%rdi)
278 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
279 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
280 ; AVX1-NEXT: vzeroupper
283 ; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
285 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
286 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
287 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
288 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
289 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
290 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
291 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
292 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
293 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
294 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
295 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
296 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
297 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
298 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
299 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
300 ; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
301 ; AVX2-NEXT: vzeroupper
304 ; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
306 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
307 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
308 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
309 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
310 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
311 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
312 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
313 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
314 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
315 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
316 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
317 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
318 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
319 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
320 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
321 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
322 ; AVX512-NEXT: vzeroupper
; IR under test: %v1 = concat(%x1, %x2), %v2 = concat(%x3, %x4); the last
; shuffle interleaves the four original vectors byte by byte.
324 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
325 %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
326 %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
327 store <128 x i8> %interleaved.vec, ptr %p
; interleaved_store_vf16_i8_stride4: 16-lane version of the stride-4 byte
; interleaving store. %x1/%x2 and %x3/%x4 are concatenated into two
; <32 x i8> values; the final mask (0,16,32,48, 1,17,33,49, ...) places byte
; i of %x1, %x2, %x3, %x4 at result lanes 4*i .. 4*i+3. The <64 x i8> result
; is stored to %p; the store carries no explicit align operand. The expected
; code emits four xmm stores without AVX-512 and a single zmm store with it.
331 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, ptr %p) nounwind {
332 ; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride4:
334 ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
335 ; AVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
336 ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
337 ; AVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
338 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
339 ; AVX1OR2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
340 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
341 ; AVX1OR2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
342 ; AVX1OR2-NEXT: vmovdqa %xmm0, 48(%rdi)
343 ; AVX1OR2-NEXT: vmovdqa %xmm4, 32(%rdi)
344 ; AVX1OR2-NEXT: vmovdqa %xmm1, 16(%rdi)
345 ; AVX1OR2-NEXT: vmovdqa %xmm3, (%rdi)
348 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
350 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
351 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
352 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
353 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
354 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
355 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
356 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
357 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
358 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
359 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
360 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
361 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi)
362 ; AVX512-NEXT: vzeroupper
; IR under test: %v1 = concat(%x1, %x2), %v2 = concat(%x3, %x4); the last
; shuffle interleaves the four original vectors byte by byte.
364 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
365 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
366 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
367 store <64 x i8> %interleaved.vec, ptr %p
; interleaved_load_vf8_i8_stride4: stride-4 de-interleaving load on bytes.
; A <32 x i8> load (align 16) is split into four <8 x i8> sub-vectors:
; %v1 = lanes {0,4,...,28}, %v2 = {1,5,...,29}, %v3 = {2,6,...,30} and
; %v4 = {3,7,...,31}. The IR then computes (%v1 + %v2) * (%v4 + %v3).
371 define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
372 ; AVX1OR2-LABEL: interleaved_load_vf8_i8_stride4:
374 ; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm0
375 ; AVX1OR2-NEXT: vmovdqa 16(%rdi), %xmm1
376 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
377 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
378 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm2
379 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
380 ; AVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
381 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
382 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
383 ; AVX1OR2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
384 ; AVX1OR2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
385 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u>
386 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm4
387 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm3
388 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
389 ; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1
390 ; AVX1OR2-NEXT: vpsrld $16, %xmm0, %xmm0
391 ; AVX1OR2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
392 ; AVX1OR2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
393 ; AVX1OR2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
394 ; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
397 ; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
399 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
400 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
401 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u>
402 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3
403 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2
404 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
405 ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
406 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
407 ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
408 ; AVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0
409 ; AVX512-NEXT: vmovdqu (%rdi), %ymm1
410 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
411 ; AVX512-NEXT: vmovdqu (%rdi), %ymm2
412 ; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2
413 ; AVX512-NEXT: vpmovwb %zmm2, %ymm2
414 ; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1
415 ; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
416 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
417 ; AVX512-NEXT: vzeroupper
; IR under test: a single wide load feeds all four strided shuffles below.
419 %wide.vec = load <32 x i8>, ptr %ptr, align 16
420 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
421 %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
422 %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
423 %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
; Note: despite its name, %add3 is the product of the two partial sums.
425 %add1 = add <8 x i8> %v1, %v2
426 %add2 = add <8 x i8> %v4, %v3
427 %add3 = mul <8 x i8> %add1, %add2
; Stride-4 interleaved load of i8 with 16 results per lane.
; The IR loads one <64 x i8> vector from %ptr and splits it into four
; <16 x i8> lanes (elements 0,4,8,... / 1,5,9,... / 2,6,10,... / 3,7,11,...).
; Lanes 1 and 2 are compared for equality, lanes 3 and 4 likewise, and the two
; <16 x i1> results are then compared with each other (an element-wise XNOR).
; The assertions below are autogenerated; regenerate them rather than editing.
431 define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
432 ; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
434 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
435 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
436 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
437 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
438 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
439 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
440 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
441 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
442 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
443 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
444 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
445 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
446 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
447 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
448 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
449 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
450 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
451 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
452 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
453 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
454 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
455 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
456 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
457 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
458 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
459 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
460 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
461 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
462 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
463 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
464 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
465 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
466 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
467 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
468 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
469 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
470 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
471 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
472 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
473 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
475 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
476 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
477 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
480 ; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
482 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
483 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
484 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
485 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
486 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
487 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
488 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
489 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
490 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
491 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
492 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
493 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
494 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
495 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
496 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
497 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
498 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
499 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
500 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
501 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
502 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
503 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
504 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
505 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
506 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
507 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
508 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
509 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
510 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
511 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
512 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
513 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
514 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
515 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
516 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
517 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
518 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
519 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
520 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
521 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
522 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
523 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
524 ; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
525 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; The 512-bit run expects a single zmm load; each lane is obtained by a dword
; right shift (8/16/24 bits) followed by a vpmovdb truncation, and the two
; mask compares are merged with kxnorw.
528 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
530 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
531 ; AVX512-NEXT: vpmovdb %zmm0, %xmm1
532 ; AVX512-NEXT: vpsrld $8, %zmm0, %zmm2
533 ; AVX512-NEXT: vpmovdb %zmm2, %xmm2
534 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm3
535 ; AVX512-NEXT: vpmovdb %zmm3, %xmm3
536 ; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
537 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
538 ; AVX512-NEXT: vpcmpeqb %zmm2, %zmm1, %k0
539 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm3, %k1
540 ; AVX512-NEXT: kxnorw %k1, %k0, %k0
541 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
542 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
543 ; AVX512-NEXT: vzeroupper
; One wide load, then four strided extracts (start offsets 0..3, step 4).
545 %wide.vec = load <64 x i8>, ptr %ptr
546 %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
547 %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
548 %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
549 %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
; Pairwise lane compares; %res is true where both compares agree.
551 %cmp1 = icmp eq <16 x i8> %v1, %v2
552 %cmp2 = icmp eq <16 x i8> %v3, %v4
553 %res = icmp eq <16 x i1> %cmp1, %cmp2
; Stride-4 interleaved load of i8 with 32 results per lane.
; The IR loads one <128 x i8> vector from %ptr and splits it into four
; <32 x i8> lanes (elements 0,4,8,... / 1,5,9,... / 2,6,10,... / 3,7,11,...).
; Lanes 1 and 2 are compared for equality, lanes 3 and 4 likewise, and the two
; <32 x i1> results are then compared with each other (an element-wise XNOR).
; The assertions below are autogenerated; regenerate them rather than editing.
558 define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
559 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
561 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
562 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
563 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
564 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
565 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
566 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
567 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
568 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
569 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
570 ; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
571 ; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
572 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
573 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm4[4,5,6,7]
574 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4
575 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
576 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5
577 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6
578 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
579 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6
580 ; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm11
581 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm7
582 ; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
583 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
584 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
585 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
586 ; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
587 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
588 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
589 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
590 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
591 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
592 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
593 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4,5,6,7]
594 ; AVX1-NEXT: vpcmpeqb %xmm8, %xmm9, %xmm8
595 ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm9
596 ; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm11
597 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
598 ; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm11
599 ; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
600 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
601 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
602 ; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
603 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
604 ; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
605 ; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
606 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
607 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
608 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
609 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
610 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
611 ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7]
612 ; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm13
613 ; AVX1-NEXT: vpshufb %xmm10, %xmm5, %xmm10
614 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
615 ; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm13
616 ; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
617 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
618 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
619 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
620 ; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
621 ; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
622 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
623 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
624 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
625 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
626 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
627 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
628 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm11, %xmm0
629 ; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm1
630 ; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm2
631 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
632 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2
633 ; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3
634 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
635 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
636 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm10, %xmm1
637 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm2
638 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
639 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
640 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
643 ; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
645 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
646 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
647 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2
648 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
649 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
650 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
651 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
652 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
653 ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
654 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
655 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
656 ; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
657 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
658 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
659 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
660 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
661 ; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9
662 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
663 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
664 ; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8
665 ; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8
666 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
667 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
668 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
669 ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
670 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
671 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
672 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
673 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
674 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
675 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
676 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
677 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
678 ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
679 ; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
680 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
681 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
682 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
683 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
684 ; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
685 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
686 ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
687 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
688 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
689 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
690 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
691 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
692 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
693 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
694 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
695 ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
696 ; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
697 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
698 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
699 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
700 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
701 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
702 ; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
703 ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
704 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
705 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
706 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
707 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
708 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
709 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
710 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
711 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
712 ; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
713 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
714 ; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
715 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
716 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
717 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0
718 ; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0
719 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; The 512-bit run expects two zmm loads; each lane is built from a pair of
; zeroing vpshufb results merged with vporq and compacted by vpermd, and the
; two mask compares are merged with kxnord.
722 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
724 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13]
725 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
726 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
727 ; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u]
728 ; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
729 ; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
730 ; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3
731 ; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u]
732 ; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
733 ; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4
734 ; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4
735 ; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u]
736 ; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
737 ; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
738 ; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5
739 ; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u]
740 ; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
741 ; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1
742 ; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
743 ; AVX512-NEXT: vpcmpeqb %zmm4, %zmm3, %k0
744 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm5, %k1
745 ; AVX512-NEXT: kxnord %k1, %k0, %k0
746 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
747 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; One wide load, then four strided extracts (start offsets 0..3, step 4).
749 %wide.vec = load <128 x i8>, ptr %ptr
750 %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
752 %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
754 %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
756 %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
; Pairwise lane compares; %res is true where both compares agree.
758 %cmp1 = icmp eq <32 x i8> %v1, %v2
759 %cmp2 = icmp eq <32 x i8> %v3, %v4
760 %res = icmp eq <32 x i1> %cmp1, %cmp2
; Stride-4 interleaved store of four <8 x i8> inputs.
; %x1/%x2 and %x3/%x4 are first concatenated into two <16 x i8> vectors, then
; one shuffle interleaves them element by element (x1[i], x2[i], x3[i], x4[i])
; into a <32 x i8> vector that is stored to %p.
; A single shared check prefix covers all three RUN lines, and the expected
; lowering is two byte unpacks followed by low/high word unpacks.
765 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, ptr %p) nounwind {
766 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
768 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
769 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
770 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
771 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
772 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
773 ; AVX-NEXT: vmovdqa %xmm2, (%rdi)
; Identity masks over two operands: these two shuffles are plain concatenations.
775 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
776 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Mask indices 0..7 = x1, 8..15 = x2, 16..23 = x3, 24..31 = x4.
777 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
778 store <32 x i8> %interleaved.vec, ptr %p
; Stride-3 interleaved load of i8 with 32 results per lane.
; The IR loads one <96 x i8> vector from %ptr, extracts three <32 x i8> lanes
; (elements 0,3,6,... / 1,4,7,... / 2,5,8,...) and adds the three lanes
; together element-wise.
; The first RUN line has its own assertions; the two wider configurations
; share one set under the combined prefix.
782 define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
783 ; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
785 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
786 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
787 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
788 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
789 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
790 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
791 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
792 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
793 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
794 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
795 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
796 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
797 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
798 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
799 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
800 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
801 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
802 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
803 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
804 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
805 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
806 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
807 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
808 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
809 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
810 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
811 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
812 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
813 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
814 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
815 ; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
816 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
817 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
818 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
819 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
820 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
821 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
822 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
825 ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
826 ; AVX2OR512: # %bb.0:
827 ; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0
828 ; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1
829 ; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2
830 ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
831 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
832 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
833 ; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
834 ; AVX2OR512-NEXT: # ymm3 = mem[0,1,0,1]
835 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
836 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
837 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
838 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
839 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
840 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
841 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
842 ; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
843 ; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1]
844 ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
845 ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1
846 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
847 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
848 ; AVX2OR512-NEXT: vpaddb %ymm0, %ymm1, %ymm0
849 ; AVX2OR512-NEXT: retq
; One wide load, then three strided extracts (start offsets 0..2, step 3).
850 %wide.vec = load <96 x i8>, ptr %ptr
851 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
852 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
853 %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
; Sum of the three lanes: (%v1 + %v2) + %v3.
854 %add1 = add <32 x i8> %v1, %v2
855 %add2 = add <32 x i8> %v3, %add1
; Stride-3 interleaved load of i8 with 16 results per lane.
; The IR loads one <48 x i8> vector from %ptr, extracts three <16 x i8> lanes
; (elements 0,3,6,... / 1,4,7,... / 2,5,8,...) and adds the three lanes
; together element-wise.
; A single shared check prefix covers all three RUN lines.
859 define <16 x i8> @interleaved_load_vf16_i8_stride3(ptr %ptr){
860 ; AVX-LABEL: interleaved_load_vf16_i8_stride3:
862 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
863 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
864 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
865 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
866 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
867 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
868 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
869 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
870 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
871 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
872 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
873 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
874 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
875 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
876 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
877 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
878 ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; One wide load, then three strided extracts (start offsets 0..2, step 3).
880 %wide.vec = load <48 x i8>, ptr %ptr
881 %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>
882 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
883 %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
; Sum of the three lanes: (%v1 + %v2) + %v3.
884 %add1 = add <16 x i8> %v1, %v2
885 %add2 = add <16 x i8> %v3, %add1
; Stride-3 interleaved load of i8 with 8 results per lane.
; The IR loads one <24 x i8> vector from %ptr, extracts three <8 x i8> lanes
; (elements 0,3,6,... / 1,4,7,... / 2,5,8,...) and adds the three lanes
; together element-wise.
; The expected lowering builds each lane from two zeroing vpshufb results
; (one per 16-byte half of the load) merged with vpor.
889 define <8 x i8> @interleaved_load_vf8_i8_stride3(ptr %ptr){
890 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
892 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
893 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
894 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
895 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
896 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
897 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
898 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
899 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
900 ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
901 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
902 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
903 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
904 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; One wide load, then three strided extracts (start offsets 0..2, step 3).
906 %wide.vec = load <24 x i8>, ptr %ptr
907 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
908 %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
909 %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
; Sum of the three lanes: (%v1 + %v2) + %v3.
910 %add1 = add <8 x i8> %v1, %v2
911 %add2 = add <8 x i8> %v3, %add1
; Stride-3 interleaved store of three <8 x i8> inputs.
; %a and %b are concatenated into one <16 x i8> vector, %c is widened to
; <16 x i8> with undefined upper elements, and one shuffle interleaves them
; element by element (a[i], b[i], c[i]) into a <24 x i8> vector that is stored
; to %p with align 1.
; The expected code writes the 24 bytes as a 16-byte vmovdqu at offset 0 plus
; an 8-byte vmovq at offset 16.
915 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, ptr %p) nounwind {
916 ; AVX-LABEL: interleaved_store_vf8_i8_stride3:
918 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
919 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
920 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
921 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
922 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
923 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
924 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
925 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
926 ; AVX-NEXT: vmovdqu %xmm1, (%rdi)
; %1 = concat(%a, %b); %2 = %c in the low 8 elements, upper 8 undefined.
928 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
929 %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Mask indices 0..7 = a, 8..15 = b, 16..23 = c; only defined elements of %2 are used.
930 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
931 store <24 x i8> %interleaved.vec, ptr %p, align 1
; Stride-3 interleaved store, 16 elements per input vector.
; Takes three <16 x i8> values (%a, %b, %c) and writes the 48-byte sequence
; a0,b0,c0,...,a15,b15,c15 to %p with 1-byte alignment.
; Two sets of expectations follow:
;   - for the +avx and +avx2 runs, three 128-bit stores at offsets 0, 16, 32;
;   - for the +avx512f,+avx512bw run, the first two 128-bit results are
;     combined with vinserti32x4 and stored as one 256-bit vmovdqu at offset 0,
;     followed by a 128-bit store at offset 32 and a vzeroupper.
; Both variants rotate the inputs with vpalignr and finish each 16-byte piece
; with a vpshufb using the constant [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5].
935 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ptr %p) nounwind {
936 ; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
938 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
939 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
940 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
941 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
942 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
943 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
944 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
945 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
946 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
947 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
948 ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
949 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
950 ; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi)
951 ; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi)
952 ; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi)
955 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
957 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
958 ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
959 ; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
960 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
961 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
962 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
963 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
964 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
965 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
966 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
967 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
968 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
969 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
970 ; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
971 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
972 ; AVX512-NEXT: vzeroupper
; IR under test:
;   %1 is the concatenation of %a and %b (32 lanes).
;   %2 is %c widened to 32 lanes; its upper 16 lanes are undef and are never
;   selected by the final mask.
;   The final mask <0,16,32, 1,17,33, ...> picks a[i], b[i], c[i] in turn.
974 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
975 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
976 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
977 store <48 x i8> %interleaved.vec, ptr %p, align 1
981 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, ptr %p) nounwind {
982 ; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
984 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
985 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
986 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
987 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
988 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
989 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
990 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
991 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
992 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
993 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
994 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
995 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
996 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
997 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
998 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
999 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1000 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1001 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1002 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1003 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1004 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
1005 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1006 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1007 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
1008 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1009 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1010 ; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi)
1011 ; AVX1-NEXT: vmovdqu %xmm3, 64(%rdi)
1012 ; AVX1-NEXT: vmovdqu %xmm4, 48(%rdi)
1013 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
1014 ; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
1015 ; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
1016 ; AVX1-NEXT: vzeroupper
1019 ; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
1021 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1022 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1023 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1024 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1025 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1026 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1027 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1028 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1029 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1030 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1031 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1032 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1033 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1034 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1035 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1036 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1037 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1038 ; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi)
1039 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
1040 ; AVX2-NEXT: vzeroupper
1043 ; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
1045 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1046 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1047 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1048 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1049 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1050 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1051 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1052 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1053 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1054 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1055 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1056 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1057 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
1058 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1059 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1060 ; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
1061 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
1062 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
1063 ; AVX512-NEXT: vzeroupper
1065 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1066 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1067 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
1068 store <96 x i8> %interleaved.vec, ptr %p, align 1
1072 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind {
1073 ; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
1075 ; AVX1-NEXT: subq $24, %rsp
1076 ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1077 ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1078 ; AVX1-NEXT: vmovdqa %ymm2, %ymm4
1079 ; AVX1-NEXT: vmovdqa %ymm0, %ymm2
1080 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
1081 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
1082 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10>
1083 ; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6
1084 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128>
1085 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
1086 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7
1087 ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0
1088 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1089 ; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7
1090 ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8
1091 ; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0
1092 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1093 ; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8
1094 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15
1095 ; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10
1096 ; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0
1097 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1098 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
1099 ; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10
1100 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5
1101 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12
1102 ; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
1103 ; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1
1104 ; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2
1105 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
1106 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1107 ; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
1108 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1109 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15>
1110 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1111 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1112 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9
1113 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
1114 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8
1115 ; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
1116 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
1117 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7
1118 ; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
1119 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
1120 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
1121 ; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
1122 ; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13
1123 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
1124 ; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4]
1125 ; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11
1126 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1127 ; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4]
1128 ; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11
1129 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1130 ; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4]
1131 ; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15
1132 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1133 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
1134 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1135 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
1136 ; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1137 ; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1138 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
1139 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1140 ; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8
1141 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128]
1142 ; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12
1143 ; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8
1144 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1145 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12
1146 ; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9
1147 ; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9
1148 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1149 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12
1150 ; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14
1151 ; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12
1152 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1153 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7
1154 ; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
1155 ; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
1156 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1157 ; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4
1158 ; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
1159 ; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10
1160 ; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6
1161 ; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
1162 ; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5
1163 ; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11
1164 ; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
1165 ; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi)
1166 ; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi)
1167 ; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi)
1168 ; AVX1-NEXT: vmovdqu %xmm4, (%rdi)
1169 ; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi)
1170 ; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi)
1171 ; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi)
1172 ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi)
1173 ; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi)
1174 ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi)
1175 ; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi)
1176 ; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi)
1177 ; AVX1-NEXT: addq $24, %rsp
1178 ; AVX1-NEXT: vzeroupper
1181 ; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
1183 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1184 ; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20]
1185 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1186 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1187 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7
1188 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1189 ; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20]
1190 ; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10
1191 ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26]
1192 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0]
1193 ; AVX2-NEXT: # ymm11 = mem[0,1,0,1]
1194 ; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0
1195 ; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26]
1196 ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1
1197 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1198 ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm2, %ymm12
1199 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1200 ; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm3, %ymm11
1201 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1202 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1203 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1204 ; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25]
1205 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1
1206 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
1207 ; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25]
1208 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
1209 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
1210 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
1211 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
1212 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1213 ; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
1214 ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1215 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1216 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
1217 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
1218 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
1219 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5
1220 ; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
1221 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
1222 ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
1223 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
1224 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1225 ; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi)
1226 ; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
1227 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
1228 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1229 ; AVX2-NEXT: vmovdqu %ymm5, 96(%rdi)
1230 ; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
1231 ; AVX2-NEXT: vzeroupper
1234 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
1236 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
1237 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
1238 ; AVX512-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1239 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1240 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
1241 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1242 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1243 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
1244 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1245 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1246 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
1247 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1248 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1249 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
1250 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1251 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1252 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1253 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
1254 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1255 ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1256 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1257 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4
1258 ; AVX512-NEXT: vpshufb %zmm3, %zmm4, %zmm4
1259 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1260 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1261 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
1262 ; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi)
1263 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
1264 ; AVX512-NEXT: vzeroupper
1266 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1267 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1268 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
1269 store <192 x i8> %3, ptr %p, align 1
1273 define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
1274 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
1276 ; AVX1-NEXT: vmovdqu (%rdi), %xmm11
1277 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
1278 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13
1279 ; AVX1-NEXT: vmovups 64(%rdi), %xmm0
1280 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1281 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4
1282 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5
1283 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2
1284 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10
1285 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3
1286 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1287 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
1288 ; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6
1289 ; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7
1290 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8
1291 ; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9
1292 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
1293 ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5
1294 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
1295 ; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12
1296 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1297 ; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0
1298 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1299 ; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10
1300 ; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12
1301 ; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0
1302 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1303 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11
1304 ; AVX1-NEXT: vmovdqa %xmm1, %xmm0
1305 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1306 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12
1307 ; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1
1308 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1309 ; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11
1310 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1311 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13
1312 ; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
1313 ; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13
1314 ; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5
1315 ; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5
1316 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1
1317 ; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13
1318 ; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10
1319 ; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
1320 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13
1321 ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0
1322 ; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12
1323 ; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3
1324 ; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12
1325 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14
1326 ; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15
1327 ; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15
1328 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
1329 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12
1330 ; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12
1331 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
1332 ; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15
1333 ; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15
1334 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1335 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2
1336 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
1337 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
1338 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
1339 ; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0
1340 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1341 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
1342 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10
1343 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
1344 ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
1345 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
1346 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1347 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
1348 ; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1
1349 ; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1
1350 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1351 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
1352 ; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13
1353 ; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
1354 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1355 ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9
1356 ; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11
1357 ; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9
1358 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
1359 ; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
1360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
1361 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1362 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1363 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1364 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
1365 ; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
1366 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
1367 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1368 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1369 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
1370 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
1371 ; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1
1372 ; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2
1373 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1374 ; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
1375 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
1376 ; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2
1377 ; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2
1378 ; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3
1379 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1380 ; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4
1381 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
1382 ; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3
1383 ; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3
1384 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1385 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
1388 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
1390 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1391 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1392 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1393 ; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3
1394 ; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4
1395 ; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5
1396 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6
1397 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0
1398 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1399 ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1400 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm1
1401 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4
1402 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
1403 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
1404 ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7
1405 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
1406 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1407 ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7
1408 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5
1409 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
1410 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255]
1411 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1412 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3
1413 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
1414 ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm9
1415 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
1416 ; AVX2-NEXT: # ymm10 = mem[0,1,0,1]
1417 ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9
1418 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm8
1419 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8
1420 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26]
1421 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26]
1422 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128]
1423 ; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm6
1424 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30]
1425 ; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm2
1426 ; AVX2-NEXT: vpor %ymm2, %ymm6, %ymm2
1427 ; AVX2-NEXT: vpaddb %ymm2, %ymm9, %ymm2
1428 ; AVX2-NEXT: vpshufb %ymm10, %ymm3, %ymm3
1429 ; AVX2-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1430 ; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
1431 ; AVX2-NEXT: vpaddb %ymm3, %ymm8, %ymm3
1432 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1433 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
1434 ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1435 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
1436 ; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
1437 ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
1438 ; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0
1439 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1440 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
1441 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
1442 ; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1
1445 ; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
1447 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1448 ; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
1449 ; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2
1450 ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3
1451 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4
1452 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
1453 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1454 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1455 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1456 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
1457 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1458 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1459 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
1460 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1461 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1462 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1463 ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1464 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1465 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
1466 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1467 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
1468 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1469 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1470 ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
1471 ; AVX512-NEXT: kmovq %rax, %k1
1472 ; AVX512-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
1473 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1474 ; AVX512-NEXT: vpaddb %zmm2, %zmm1, %zmm1
1475 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
1476 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
1477 ; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
1479 %wide.vec = load <192 x i8>, ptr %ptr, align 1
1480 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
1481 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
1482 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
1483 %add1 = add <64 x i8> %v1, %v2
1484 %add2 = add <64 x i8> %v3, %add1
; Stride-4 interleaving store of i8 elements.
; The IR at the end of this function concatenates %a with %b and %c with %d
; into two <128 x i8> vectors, then shuffles them with the mask
; 0,64,128,192,1,65,129,193,... so the <256 x i8> result is laid out as
; a[0],b[0],c[0],d[0],a[1],b[1],c[1],d[1],... and stores it to %p.
; The store has no explicit align attribute, and the expected code for all
; three configurations writes the result with aligned vector stores
; (vmovaps / vmovdqa / vmovdqa64).
; The per-target instruction sequences that follow are autogenerated by
; utils/update_llc_test_checks.py (see the first line of this file), so
; regenerate them with that script rather than editing them by hand.
1488 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, ptr %p) nounwind {
1489 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
1491 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1492 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
1493 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12
1494 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1495 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1496 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
1497 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
1498 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1499 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1500 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1501 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
1502 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1503 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
1504 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
1505 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
1506 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13
1507 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
1508 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
1509 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1510 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
1511 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13
1512 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
1513 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1514 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
1515 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
1516 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
1517 ; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1518 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1519 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1520 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
1521 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
1522 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
1523 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
1524 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1525 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
1526 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
1527 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1528 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
1529 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
1530 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
1531 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1532 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
1533 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1534 ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8
1535 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6
1536 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
1537 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
1538 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4
1539 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
1540 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1541 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
1542 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3
1543 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
1544 ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
1545 ; AVX1-NEXT: vmovaps %ymm1, 160(%rdi)
1546 ; AVX1-NEXT: vmovaps %ymm4, 128(%rdi)
1547 ; AVX1-NEXT: vmovaps %ymm2, 96(%rdi)
1548 ; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
1549 ; AVX1-NEXT: vmovaps %ymm6, 32(%rdi)
1550 ; AVX1-NEXT: vmovaps %ymm8, (%rdi)
1551 ; AVX1-NEXT: vzeroupper
1554 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
1556 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1557 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
1558 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1559 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
1560 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1561 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
1562 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1563 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
1564 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
1565 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
1566 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
1567 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
1568 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1569 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1570 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1571 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1572 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4
1573 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5
1574 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
1575 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
1576 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7
1577 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9
1578 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
1579 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
1580 ; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi)
1581 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi)
1582 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
1583 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1584 ; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi)
1585 ; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi)
1586 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
1587 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1588 ; AVX2-NEXT: vzeroupper
1591 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
1593 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
1594 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
1595 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
1596 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
1597 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
1598 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
1599 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
1600 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
1601 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
1602 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
1603 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm6
1604 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm7
1605 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
1606 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm8
1607 ; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm9
1608 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
1609 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1610 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4
1611 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3
1612 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7]
1613 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4
1614 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7]
1615 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7]
1616 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
1617 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
1618 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
1619 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
1620 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
1621 ; AVX512-NEXT: vzeroupper
1623 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1624 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1625 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, 
i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
1626 store <256 x i8> %interleaved, ptr %p
; splat2 of <4 x double>: load four doubles from %s and store eight doubles to
; %d in which every source element appears twice in a row
; (x0,x0,x1,x1,x2,x2,x3,x3). The checks below record the expected lowering per
; feature level: +avx builds each half with a 128-bit lane permute/broadcast
; followed by vshufpd, +avx2 uses two vpermpd, and +avx512f uses one vpermpd
; with the index vector [0,0,1,1,2,2,3,3] and a single 64-byte store.
1630 define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind {
1631 ; AVX1-LABEL: splat2_v4f64_load_store:
1633 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1634 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1635 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1636 ; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1637 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1638 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1639 ; AVX1-NEXT: vzeroupper
1642 ; AVX2-LABEL: splat2_v4f64_load_store:
1644 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1645 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1646 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1647 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1648 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1649 ; AVX2-NEXT: vzeroupper
1652 ; AVX512-LABEL: splat2_v4f64_load_store:
1654 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1655 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1656 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1657 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1658 ; AVX512-NEXT: vzeroupper
1660 %x = load <4 x double>, ptr %s, align 8
; Concatenate the loaded vector with itself: x2[i] = x[i mod 4].
1661 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Interleave the two copies so each element is immediately followed by its duplicate.
1662 %r = shufflevector <8 x double> %x2, <8 x double> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1663 store <8 x double> %r, ptr %d, align 8
; splat2 of <4 x i64>: load four 64-bit integers from %s and store eight to %d
; with every source element duplicated in place (x0,x0,x1,x1,x2,x2,x3,x3).
; Although the element type is integer, the checks below expect
; floating-point-domain shuffles and moves (vperm2f128, vshufpd, vpermpd,
; vmovups/vmovupd) at every feature level.
1667 define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind {
1668 ; AVX1-LABEL: splat2_v4i64_load_store:
1670 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1671 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1672 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1673 ; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1674 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1675 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1676 ; AVX1-NEXT: vzeroupper
1679 ; AVX2-LABEL: splat2_v4i64_load_store:
1681 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1682 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1683 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1684 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1685 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1686 ; AVX2-NEXT: vzeroupper
1689 ; AVX512-LABEL: splat2_v4i64_load_store:
1691 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1692 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1693 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1694 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1695 ; AVX512-NEXT: vzeroupper
1697 %x = load <4 x i64>, ptr %s, align 8
; Concatenate the loaded vector with itself: x2[i] = x[i mod 4].
1698 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Interleave the two copies so each element is immediately followed by its duplicate.
1699 %r = shufflevector <8 x i64> %x2, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1700 store <8 x i64> %r, ptr %d, align 8
; splat4 of <8 x float>: load eight floats from %s and store thirty-two floats
; to %d in which every source element appears four times in a row
; (x0,x0,x0,x0,x1,x1,x1,x1,...). The IR builds four concatenated copies of the
; input and then interleaves them with stride 8.
; Expected lowering in the checks below: +avx broadcasts each scalar straight
; from memory with vbroadcastss into an xmm register and issues eight 16-byte
; stores; +avx2 combines vshufps with vpermpd to form four 32-byte results;
; +avx512f broadcasts the 256-bit input to both halves of a zmm register and
; applies two vpermd index vectors, giving two 64-byte stores.
1704 define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind {
1705 ; AVX1-LABEL: splat4_v8f32_load_store:
1707 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
1708 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
1709 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
1710 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
1711 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
1712 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
1713 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
1714 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
1715 ; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
1716 ; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
1717 ; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
1718 ; AVX1-NEXT: vmovups %xmm4, (%rsi)
1719 ; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
1720 ; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
1721 ; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
1722 ; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
1725 ; AVX2-LABEL: splat4_v8f32_load_store:
1727 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1728 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1729 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1730 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1731 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1732 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1733 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1734 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1735 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1736 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1737 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1738 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1739 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1740 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1741 ; AVX2-NEXT: vzeroupper
1744 ; AVX512-LABEL: splat4_v8f32_load_store:
1746 ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1747 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1748 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1749 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1750 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1751 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1752 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1753 ; AVX512-NEXT: vzeroupper
1755 %x = load <8 x float>, ptr %s, align 4
; Two copies of the input: x2[i] = x[i mod 8].
1756 %x2 = shufflevector <8 x float> %x, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Four copies of the input: x4[i] = x[i mod 8].
1757 %x4 = shufflevector <16 x float> %x2, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Stride-8 interleave of the four copies: r[4*k + j] = x[k] for j in 0..3.
1758 %r = shufflevector <32 x float> %x4, <32 x float> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1759 store <32 x float> %r, ptr %d, align 4
; splat4 of <8 x i32>: load eight 32-bit integers from %s and store thirty-two
; to %d in which every source element appears four times in a row. The IR
; builds four concatenated copies of the input and interleaves them with
; stride 8.
; Expected lowering in the checks below: +avx broadcasts each scalar from
; memory with vbroadcastss, pairs the xmm results with vinsertf128 and issues
; four 32-byte stores; +avx2 combines vshufps with vpermpd; +avx512f uses
; vbroadcasti64x4 followed by two vpermd index vectors and two 64-byte stores.
1763 define void @splat4_v8i32_load_store(ptr %s, ptr %d) nounwind {
1764 ; AVX1-LABEL: splat4_v8i32_load_store:
1766 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
1767 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
1768 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1769 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1
1770 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2
1771 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1772 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2
1773 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3
1774 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1775 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3
1776 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4
1777 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1778 ; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
1779 ; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
1780 ; AVX1-NEXT: vmovups %ymm1, 32(%rsi)
1781 ; AVX1-NEXT: vmovups %ymm0, (%rsi)
1782 ; AVX1-NEXT: vzeroupper
1785 ; AVX2-LABEL: splat4_v8i32_load_store:
1787 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1788 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1789 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1790 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1791 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1792 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1793 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1794 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1795 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1796 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1797 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1798 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1799 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1800 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1801 ; AVX2-NEXT: vzeroupper
1804 ; AVX512-LABEL: splat4_v8i32_load_store:
1806 ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1807 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1808 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1809 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1810 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1811 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1812 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1813 ; AVX512-NEXT: vzeroupper
1815 %x = load <8 x i32>, ptr %s, align 4
; Two copies of the input: x2[i] = x[i mod 8].
1816 %x2 = shufflevector <8 x i32> %x, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Four copies of the input: x4[i] = x[i mod 8].
1817 %x4 = shufflevector <16 x i32> %x2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Stride-8 interleave of the four copies: r[4*k + j] = x[k] for j in 0..3.
1818 %r = shufflevector <32 x i32> %x4, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1819 store <32 x i32> %r, ptr %d, align 4
; splat4 of <4 x double>: load four doubles from %s and store sixteen doubles
; to %d in which every source element appears four times in a row. The IR
; builds four concatenated copies of the input and interleaves them with
; stride 4.
; Expected lowering in the checks below: each element is broadcast directly
; from memory with vbroadcastsd into a ymm register. The +avx and +avx2 runs
; share one check prefix and store the four ymm results separately; the
; +avx512f run pairs them with vinsertf64x4 and issues two 64-byte stores.
1823 define void @splat4_v4f64_load_store(ptr %s, ptr %d) nounwind {
1824 ; AVX1OR2-LABEL: splat4_v4f64_load_store:
1826 ; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0
1827 ; AVX1OR2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1828 ; AVX1OR2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1829 ; AVX1OR2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1830 ; AVX1OR2-NEXT: vmovups %ymm3, 96(%rsi)
1831 ; AVX1OR2-NEXT: vmovups %ymm1, 64(%rsi)
1832 ; AVX1OR2-NEXT: vmovups %ymm2, 32(%rsi)
1833 ; AVX1OR2-NEXT: vmovups %ymm0, (%rsi)
1834 ; AVX1OR2-NEXT: vzeroupper
1835 ; AVX1OR2-NEXT: retq
1837 ; AVX512-LABEL: splat4_v4f64_load_store:
1839 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1840 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1841 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1842 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1843 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1844 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1845 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1846 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1847 ; AVX512-NEXT: vzeroupper
1849 %x = load <4 x double>, ptr %s, align 8
; Two copies of the input: x2[i] = x[i mod 4].
1850 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Four copies of the input: x4[i] = x[i mod 4].
1851 %x4 = shufflevector <8 x double> %x2, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Stride-4 interleave of the four copies: r[4*k + j] = x[k] for j in 0..3.
1852 %r = shufflevector <16 x double> %x4, <16 x double> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1853 store <16 x double> %r, ptr %d, align 8
; splat4 of <4 x i64>: load four 64-bit integers from %s and store sixteen to
; %d in which every source element appears four times in a row. The IR builds
; four concatenated copies of the input and interleaves them with stride 4.
; Expected lowering in the checks below: each element is broadcast directly
; from memory with vbroadcastsd (a floating-point-domain instruction, even
; though the element type is integer). The +avx and +avx2 runs share one check
; prefix and store four ymm results; the +avx512f run pairs them with
; vinsertf64x4 and issues two 64-byte stores.
1857 define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
1858 ; AVX1OR2-LABEL: splat4_v4i64_load_store:
1860 ; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0
1861 ; AVX1OR2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1862 ; AVX1OR2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1863 ; AVX1OR2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1864 ; AVX1OR2-NEXT: vmovups %ymm3, 96(%rsi)
1865 ; AVX1OR2-NEXT: vmovups %ymm1, 64(%rsi)
1866 ; AVX1OR2-NEXT: vmovups %ymm2, 32(%rsi)
1867 ; AVX1OR2-NEXT: vmovups %ymm0, (%rsi)
1868 ; AVX1OR2-NEXT: vzeroupper
1869 ; AVX1OR2-NEXT: retq
1871 ; AVX512-LABEL: splat4_v4i64_load_store:
1873 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1874 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1875 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1876 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1877 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1878 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1879 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1880 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1881 ; AVX512-NEXT: vzeroupper
1883 %x = load <4 x i64>, ptr %s, align 8
; Two copies of the input: x2[i] = x[i mod 4].
1884 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Four copies of the input: x4[i] = x[i mod 4].
1885 %x4 = shufflevector <8 x i64> %x2, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Stride-4 interleave of the four copies: r[4*k + j] = x[k] for j in 0..3.
1886 %r = shufflevector <16 x i64> %x4, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1887 store <16 x i64> %r, ptr %d, align 8
; Loads a 128-byte-aligned <16 x i64> from %a0 and returns a <2 x i64> made of
; elements 2 and 6 of it. Presumably a regression test for the bug report
; named in the function name; the report itself is not visible here.
; Expected lowering in the checks below: the +avx run reads only the 16-byte
; chunks at offsets 16 and 48 and merges their low halves with vunpcklpd; the
; +avx2 and +avx512f runs share one check prefix, unpack two 32-byte loads and
; then extract the upper 128 bits of the result.
1891 define <2 x i64> @PR37616(ptr %a0) nounwind {
1892 ; AVX1-LABEL: PR37616:
1894 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
1895 ; AVX1-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
1898 ; AVX2OR512-LABEL: PR37616:
1899 ; AVX2OR512: # %bb.0:
1900 ; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
1901 ; AVX2OR512-NEXT: vunpcklpd 32(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
1902 ; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
1903 ; AVX2OR512-NEXT: vzeroupper
1904 ; AVX2OR512-NEXT: retq
1905 %load = load <16 x i64>, ptr %a0, align 128
; Pick elements 2 and 6 (byte offsets 16 and 48) of the wide load.
1906 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
1907 ret <2 x i64> %shuffle