; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
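
; These tests check the lowering of interleaved (strided) vector memory accesses
; into shuffle sequences: factor-4 f64/i64 interleave groups and stride-3/stride-4
; i8 interleave groups, for AVX, AVX2 and AVX512.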

define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-LABEL: load_factorf64_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %add1 = fadd <4 x double> %strided.v0, %strided.v1
  %add2 = fadd <4 x double> %add1, %strided.v2
  %add3 = fadd <4 x double> %add2, %strided.v3
  ret <4 x double> %add3
}

define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-LABEL: load_factorf64_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX-NEXT: retq
  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}

define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX-LABEL: load_factorf64_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1]
; AVX-NEXT: vperm2f128 $32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %mul = fmul <4 x double> %strided.v0, %strided.v3
  ret <4 x double> %mul
}

define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-LABEL: load_factori64_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: vmovups 64(%rdi), %ymm2
; AVX1-NEXT: vmovups 96(%rdi), %ymm3
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512-LABEL: load_factori64_4:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2
; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2OR512-NEXT: retq
  %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
  %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %add1 = add <4 x i64> %strided.v0, %strided.v1
  %add2 = add <4 x i64> %add1, %strided.v2
  %add3 = add <4 x i64> %add2, %strided.v3
  ret <4 x i64> %add3
}

define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
; AVX1-LABEL: store_factorf64_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_factorf64_4:
; AVX2: # %bb.0:
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
; AVX2-NEXT: vmovups %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_factorf64_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
  ret void
}

define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
; AVX1-LABEL: store_factori64_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_factori64_4:
; AVX2: # %bb.0:
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
; AVX2-NEXT: vmovups %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_factori64_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512-NEXT: vmovups %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
  ret void
}

define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
; AVX1: # %bb.0:
250 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
251 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
252 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
253 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
254 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
255 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
256 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
257 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
258 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
259 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
260 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
261 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
262 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
263 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
264 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
265 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
266 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
267 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
268 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
269 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
270 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm2
271 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
272 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
273 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
274 ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi)
275 ; AVX1-NEXT: vmovaps %ymm3, 64(%rdi)
276 ; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
277 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
; AVX2: # %bb.0:
283 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
284 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
285 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
286 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
287 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
288 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
289 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
290 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
291 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
292 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
293 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
294 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
295 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
296 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
297 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
298 ; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
; AVX512: # %bb.0:
304 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
305 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
306 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
307 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
308 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
309 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
310 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
311 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
312 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
313 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
314 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
315 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
316 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
317 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
318 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
319 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
322 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
323 %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
324 %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
  store <128 x i8> %interleaved.vec, <128 x i8>* %p
  ret void
}

define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
; AVX1: # %bb.0:
332 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
333 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
334 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
335 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
336 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
337 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
338 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
339 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
340 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi)
341 ; AVX1-NEXT: vmovdqa %xmm4, 32(%rdi)
342 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
; AVX2: # %bb.0:
348 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
349 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
350 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
351 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
352 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
353 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
354 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
355 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
356 ; AVX2-NEXT: vmovdqa %xmm0, 48(%rdi)
357 ; AVX2-NEXT: vmovdqa %xmm4, 32(%rdi)
358 ; AVX2-NEXT: vmovdqa %xmm1, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
; AVX512: # %bb.0:
364 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
365 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
366 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
367 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
368 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
369 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
370 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
371 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
372 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
373 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
374 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
375 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
378 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
379 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
380 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
  store <64 x i8> %interleaved.vec, <64 x i8>* %p
  ret void
}

define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
; AVX1: # %bb.0:
388 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
389 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
390 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
391 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
392 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
393 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
394 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
395 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
396 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
397 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
398 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
399 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
400 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
401 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
402 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
403 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
404 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
405 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
406 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
407 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
408 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
409 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
410 ; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf8_i8_stride4:
; AVX2: # %bb.0:
416 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
417 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
418 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
419 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
420 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm2
421 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
422 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
423 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
424 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
425 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
426 ; AVX2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
427 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
428 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
429 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
430 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
431 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
432 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
433 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
434 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
435 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
436 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
437 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
438 ; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
; AVX512: # %bb.0:
444 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
445 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
446 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
447 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3
448 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,3,2,4,5,6,7]
449 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2
450 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,0,3,2,4,5,6,7]
451 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
452 ; AVX512-NEXT: vprold $8, %zmm3, %zmm3
453 ; AVX512-NEXT: vprold $8, %zmm2, %zmm2
454 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
455 ; AVX512-NEXT: vpaddb %xmm4, %xmm2, %xmm2
456 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
457 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
458 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
459 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
460 ; AVX512-NEXT: vmovdqu (%rdi), %ymm1
461 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
462 ; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
463 ; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm0
464 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
467 %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
468 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
469 %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
470 %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
471 %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
473 %add1 = add <8 x i8> %v1, %v2
474 %add2 = add <8 x i8> %v4, %v3
  %add3 = mul <8 x i8> %add1, %add2
  ret <8 x i8> %add3
}

define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1: # %bb.0:
482 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
483 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
484 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
485 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
486 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
487 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
488 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
489 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
490 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
491 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
492 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
493 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
494 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
495 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
496 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
497 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
498 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
499 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
500 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
501 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
502 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
504 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
505 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
506 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
507 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
508 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
509 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
510 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
511 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
512 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
513 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
514 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
515 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
516 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
517 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
518 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
519 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
520 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
521 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
522 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
523 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
524 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
; AVX2: # %bb.0:
530 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
531 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
532 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
533 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
534 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
535 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
536 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
537 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
538 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
539 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
540 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
541 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
542 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
543 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
544 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
545 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
546 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
547 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
548 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
549 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
550 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
551 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
552 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
553 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
554 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
555 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
556 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
557 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
558 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
559 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
560 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
561 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
562 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
563 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
564 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
565 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
566 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
567 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
568 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
569 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
570 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
571 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
572 ; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
; AVX512: # %bb.0:
578 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
579 ; AVX512-NEXT: vpmovdb %zmm0, %xmm8
580 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
581 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
582 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
583 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
584 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
585 ; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
586 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
587 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
588 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
589 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm7
590 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
591 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
592 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
593 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
594 ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
595 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6
596 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
597 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
598 ; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0
599 ; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
600 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
601 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
602 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
603 ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
604 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
605 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
606 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
607 ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
608 ; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
609 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
610 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
611 ; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
612 ; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
613 ; AVX512-NEXT: kxnorw %k1, %k0, %k0
614 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
615 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
618 %wide.vec = load <64 x i8>, <64 x i8>* %ptr
619 %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
620 %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
621 %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
622 %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
624 %cmp1 = icmp eq <16 x i8> %v1, %v2
625 %cmp2 = icmp eq <16 x i8> %v3, %v4
  %res = icmp eq <16 x i1> %cmp1, %cmp2
  ret <16 x i1> %res
}

define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
634 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
635 ; AVX1-NEXT: vmovdqa (%rdi), %xmm10
636 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm11
637 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm12
638 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13
639 ; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm4
640 ; AVX1-NEXT: vpshufb %xmm6, %xmm12, %xmm5
641 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
642 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
643 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm5
644 ; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm7
645 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
646 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm4[4,5,6,7]
647 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm14
648 ; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm7
649 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5
650 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6
651 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
652 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6
653 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm2
654 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm7
655 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
656 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
657 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4,5,6,7]
658 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
659 ; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm2
660 ; AVX1-NEXT: vpshufb %xmm1, %xmm12, %xmm0
661 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
662 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
663 ; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3
664 ; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4
665 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
666 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
667 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0
668 ; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
669 ; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm0
670 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
671 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
672 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm1
673 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
674 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
675 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
676 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm9, %xmm9
677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
678 ; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm1
679 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
680 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
681 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
682 ; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3
683 ; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4
684 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
685 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm1[4,5,6,7]
686 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3
687 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
688 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
689 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm3
690 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
691 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
692 ; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3],xmm0[4,5,6,7]
693 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
694 ; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3
695 ; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm4
696 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
697 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
698 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm0
699 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm1
700 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
701 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
702 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0
703 ; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm1
704 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
705 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
706 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm2
707 ; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm3
708 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
709 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
710 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm15, %xmm1
711 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
712 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
713 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
714 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
; AVX2: # %bb.0:
720 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
721 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm9
722 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm1
723 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm10
724 ; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm3
725 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
726 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
727 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
728 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm12
729 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4
730 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5
731 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm6
732 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
733 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
734 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm1[6,7]
735 ; AVX2-NEXT: vmovdqa (%rdi), %xmm11
736 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm13
737 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6
738 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7
739 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm1
740 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0
741 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
742 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1
743 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2
744 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
745 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
746 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
747 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
748 ; AVX2-NEXT: vpshufb %xmm1, %xmm9, %xmm2
749 ; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm0
750 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
751 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
752 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
753 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3
754 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm4
755 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
756 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
757 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
758 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm3
759 ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1
760 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
761 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3
762 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2
763 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
764 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
765 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
766 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8
767 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
768 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm1
769 ; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm2
770 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
771 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
772 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
773 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3
774 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm4
775 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
776 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
777 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
778 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm3
779 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0
780 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
781 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3
782 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2
783 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
784 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
785 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
786 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
787 ; AVX2-NEXT: vpshufb %xmm1, %xmm9, %xmm2
788 ; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm3
789 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
790 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
791 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
792 ; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm4
793 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
794 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
795 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
796 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
797 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm4
798 ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1
799 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
800 ; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm4
801 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm3
802 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
803 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
804 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
805 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
806 ; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
812 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
813 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
814 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
815 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
816 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
817 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
818 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
819 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
820 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
821 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
822 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
823 ; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0
824 ; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm4
825 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
826 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
827 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
828 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm4
829 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
830 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
831 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
832 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm0[6,7]
833 ; AVX512-NEXT: vmovdqa (%rdi), %xmm13
834 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
835 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
836 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
837 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
838 ; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
839 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
840 ; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
841 ; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
842 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
843 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
844 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
845 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
846 ; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
847 ; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
848 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
849 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
850 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
851 ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
852 ; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm5
853 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
854 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
855 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
856 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
857 ; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
858 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
859 ; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
860 ; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
861 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
862 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
863 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5,6,7]
864 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
865 ; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm3
866 ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4
867 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
868 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
869 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
870 ; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
871 ; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm1
872 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
873 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
874 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
875 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
876 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
877 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
878 ; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm2
879 ; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
880 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
881 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
882 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
883 ; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
884 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
885 ; AVX512-NEXT: kxnord %k1, %k0, %k0
886 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
889 %wide.vec = load <128 x i8>, <128 x i8>* %ptr
890 %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
892 %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
894 %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
896 %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
898 %cmp1 = icmp eq <32 x i8> %v1, %v2
899 %cmp2 = icmp eq <32 x i8> %v3, %v4
900 %res = icmp eq <32 x i1> %cmp1, %cmp2
  ret <32 x i1> %res
}
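; Interleaved store, stride 4: four <8 x i8> operands are interleaved into a single <32 x i8> vector (x1[0], x2[0], x3[0], x4[0], x1[1], ...) and stored to %p.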
905 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
906 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
908 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
909 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
910 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
911 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
912 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
913 ; AVX-NEXT: vmovdqa %xmm2, (%rdi)
915 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
916 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
917 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
918 store <32 x i8> %interleaved.vec, <32 x i8>* %p
  ret void
}
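; Interleaved load, stride 3: a <96 x i8> wide load is deinterleaved into three <32 x i8> vectors (indices 0,3,6,..., 1,4,7,..., 2,5,8,...) whose sum is returned.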
922 define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
923 ; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
925 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
926 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
927 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
928 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
929 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
930 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
931 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
932 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
933 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
934 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
935 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
936 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
937 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
938 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
939 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
940 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
941 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
942 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
943 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
944 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
945 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
946 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
947 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
948 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
949 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
950 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
951 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
952 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
953 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
954 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
955 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
956 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
957 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
958 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
959 ; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
960 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
961 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
962 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
965 ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
966 ; AVX2OR512: # %bb.0:
967 ; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0
968 ; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1
969 ; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2
970 ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
971 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
972 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
973 ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
974 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
975 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
976 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
977 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
978 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
979 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
980 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
981 ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
982 ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
983 ; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1
984 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
985 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
986 ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
987 ; AVX2OR512-NEXT: retq
988 %wide.vec = load <96 x i8>, <96 x i8>* %ptr
989 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
990 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
991 %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
992 %add1 = add <32 x i8> %v1, %v2
993 %add2 = add <32 x i8> %v3, %add1
  ret <32 x i8> %add2
}
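; Same stride-3 deinterleaving load for a <48 x i8> source split into three <16 x i8> vectors that are summed.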
997 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
998 ; AVX-LABEL: interleaved_load_vf16_i8_stride3:
1000 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1001 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1002 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
1003 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1004 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1005 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1006 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1007 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1008 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1009 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1010 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1011 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1012 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
1013 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
1014 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1015 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1016 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1018 %wide.vec = load <48 x i8>, <48 x i8>* %ptr
1019 %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
1020 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
1021 %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
1022 %add1 = add <16 x i8> %v1, %v2
1023 %add2 = add <16 x i8> %v3, %add1
  ret <16 x i8> %add2
}
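; Stride-3 deinterleaving load at the narrowest width: a <24 x i8> source split into three <8 x i8> vectors that are summed.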
1027 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
1028 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
1030 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1031 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1032 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
1033 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
1034 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
1035 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
1036 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
1037 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
1038 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
1039 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
1040 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1041 ; AVX-NEXT: vpaddb %xmm0, %xmm3, %xmm0
1042 ; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0
1044 %wide.vec = load <24 x i8>, <24 x i8>* %ptr
1045 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
1046 %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
1047 %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
1048 %add1 = add <8 x i8> %v1, %v2
1049 %add2 = add <8 x i8> %v3, %add1
  ret <8 x i8> %add2
}
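; Interleaved store, stride 3: three <8 x i8> operands are interleaved into a <24 x i8> vector (a[0], b[0], c[0], a[1], ...) and stored to %p.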
1053 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
1054 ; AVX-LABEL: interleaved_store_vf8_i8_stride3:
1056 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1057 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
1058 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
1059 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
1060 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
1061 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
1062 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
1063 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
1064 ; AVX-NEXT: vmovdqu %xmm1, (%rdi)
1066 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1067 %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1068 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1069 store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
  ret void
}
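; Stride-3 interleaved store of three <16 x i8> operands into a <48 x i8> vector.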
1073 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
1074 ; AVX-LABEL: interleaved_store_vf16_i8_stride3:
1076 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1077 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1078 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1079 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1080 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1081 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1082 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1083 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1084 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1085 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1086 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1087 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1088 ; AVX-NEXT: vmovdqu %xmm0, 16(%rdi)
1089 ; AVX-NEXT: vmovdqu %xmm1, (%rdi)
1090 ; AVX-NEXT: vmovdqu %xmm2, 32(%rdi)
1092 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1093 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1094 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1095 store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
  ret void
}
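; Stride-3 interleaved store of three <32 x i8> operands into a <96 x i8> vector; code generation differs for AVX1, AVX2 and AVX512, so each prefix is checked separately.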
1099 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
1100 ; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
1102 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1103 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1104 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1105 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
1106 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1107 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1108 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1109 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1110 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1111 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1112 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1113 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1114 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1115 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1116 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1117 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1118 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1119 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1120 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1121 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1122 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1123 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
1124 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
1125 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1126 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1127 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
1128 ; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi)
1129 ; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
1130 ; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi)
1131 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
1132 ; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
1133 ; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
1134 ; AVX1-NEXT: vzeroupper
1137 ; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
1139 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1140 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1141 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1142 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1143 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1144 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1145 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1146 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1147 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1148 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1149 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1150 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1151 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1152 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1153 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1154 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1155 ; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi)
1156 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
1157 ; AVX2-NEXT: vzeroupper
1160 ; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
1162 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1163 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1164 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1165 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1166 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1167 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1168 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1169 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1170 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1171 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1172 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1173 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1174 ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1175 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1176 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1177 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1178 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
1179 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
1180 ; AVX512-NEXT: vzeroupper
1182 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1183 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1184 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
1185 store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
  ret void
}
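; Stride-3 interleaved store at the widest width: three <64 x i8> operands interleaved into a <192 x i8> vector.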
1189 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
1190 ; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
1192 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
1193 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1194 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1195 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
1196 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1197 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1198 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10
1199 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm10[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1200 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1201 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
1202 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1203 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1204 ; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1205 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
1206 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1207 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1208 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1209 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1210 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
1211 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1212 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1213 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1214 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1215 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1216 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1217 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1218 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
1219 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
1220 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1221 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1222 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1223 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1224 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1225 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
1226 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1227 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1228 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1229 ; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
1230 ; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1231 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
1232 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1233 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1234 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm14
1235 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm12
1236 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1237 ; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7
1238 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm1
1239 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1240 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm11
1241 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm9
1242 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm2
1243 ; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm3
1244 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm4
1245 ; AVX1-NEXT: vmovdqu %xmm1, 80(%rdi)
1246 ; AVX1-NEXT: vmovdqu %xmm7, 64(%rdi)
1247 ; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
1248 ; AVX1-NEXT: vmovdqu %xmm14, (%rdi)
1249 ; AVX1-NEXT: vmovdqu %xmm0, 48(%rdi)
1250 ; AVX1-NEXT: vmovdqu %xmm12, 32(%rdi)
1251 ; AVX1-NEXT: vmovdqu %xmm4, 176(%rdi)
1252 ; AVX1-NEXT: vmovdqu %xmm3, 160(%rdi)
1253 ; AVX1-NEXT: vmovdqu %xmm5, 112(%rdi)
1254 ; AVX1-NEXT: vmovdqu %xmm11, 96(%rdi)
1255 ; AVX1-NEXT: vmovdqu %xmm2, 144(%rdi)
1256 ; AVX1-NEXT: vmovdqu %xmm9, 128(%rdi)
1257 ; AVX1-NEXT: vzeroupper
1260 ; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
1262 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1263 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1264 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1265 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1266 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1267 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
1268 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1269 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1270 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
1271 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
1272 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1273 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1274 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
1275 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1276 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
1277 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20]
1278 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
1279 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1280 ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1281 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
1282 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
1283 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
1284 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
1285 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm4
1286 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
1287 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
1288 ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
1289 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
1290 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
1291 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
1292 ; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi)
1293 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
1294 ; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
1295 ; AVX2-NEXT: vmovdqu %ymm4, 96(%rdi)
1296 ; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
1297 ; AVX2-NEXT: vzeroupper
1300 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
1302 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
1303 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
1304 ; AVX512-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1305 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1306 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
1307 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1308 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1309 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
1310 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
1311 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1312 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
1313 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1314 ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
1315 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
1316 ; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
1317 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1318 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1319 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7
1320 ; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
1321 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1322 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1323 ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1324 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1325 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1326 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm2
1327 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1328 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1329 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
1330 ; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi)
1331 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
1332 ; AVX512-NEXT: vzeroupper
1334 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1335 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1336 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
1337 store <192 x i8> %3, <192 x i8>* %p, align 1
  ret void
}
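; Stride-3 deinterleaving load of a <192 x i8> source into three <64 x i8> vectors that are summed.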
1341 define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
1342 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
1344 ; AVX1-NEXT: vmovdqu (%rdi), %xmm11
1345 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm10
1346 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8
1347 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
1348 ; AVX1-NEXT: vmovdqu 64(%rdi), %xmm12
1349 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm9
1350 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6
1351 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm14
1352 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm13
1353 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm5
1354 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1
1355 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm15
1356 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1357 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1358 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1359 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm11
1360 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
1361 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm10
1362 ; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12
1363 ; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14
1364 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
1365 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm2
1366 ; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm0
1367 ; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm7
1368 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4
1369 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1370 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
1371 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm0[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1372 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
1373 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1374 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
1375 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1376 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm1
1377 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
1378 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
1379 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1380 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14
1381 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1382 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1383 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm10
1384 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1385 ; AVX1-NEXT: # ymm12 = mem[0,1,0,1]
1386 ; AVX1-NEXT: vandnps %ymm10, %ymm12, %ymm10
1387 ; AVX1-NEXT: vandps %ymm12, %ymm14, %ymm14
1388 ; AVX1-NEXT: vorps %ymm10, %ymm14, %ymm10
1389 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm14
1390 ; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14
1391 ; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1
1392 ; AVX1-NEXT: vorps %ymm1, %ymm14, %ymm1
1393 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1394 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
1395 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm15[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1396 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
1397 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1398 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10]
1399 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm8[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1400 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
1401 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1402 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
1403 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1404 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
1405 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
1406 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
1407 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1408 ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
1409 ; AVX1-NEXT: vpaddb %xmm12, %xmm10, %xmm3
1410 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1411 ; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
1412 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
1413 ; AVX1-NEXT: vpaddb %xmm1, %xmm9, %xmm1
1414 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1415 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
1416 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1419 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
1421 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1422 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1423 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1424 ; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3
1425 ; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4
1426 ; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5
1427 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1428 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1429 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1430 ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1431 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
1432 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
1433 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1434 ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
1435 ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1436 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
1437 ; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
1438 ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5
1439 ; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
1440 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1441 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1442 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
1443 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1444 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1445 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
1446 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1447 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1448 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1449 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
1450 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
1451 ; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1
1452 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
1453 ; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2
1454 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
1455 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1456 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1457 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
1458 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1459 ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
1462 ; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
1464 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1465 ; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
1466 ; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2
1467 ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3
1468 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4
1469 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
1470 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1471 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1472 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1473 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
1474 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1475 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1476 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
1477 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1478 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1479 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1480 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
1481 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
1482 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1483 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
1484 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1485 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1486 ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
1487 ; AVX512-NEXT: kmovq %rax, %k1
1488 ; AVX512-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
1489 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1490 ; AVX512-NEXT: vpaddb %zmm1, %zmm2, %zmm1
1491 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
1492 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
1493 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1495 %wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
1496 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
1497 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
1498 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
1499 %add1 = add <64 x i8> %v1, %v2
1500 %add2 = add <64 x i8> %v3, %add1
  ret <64 x i8> %add2
}
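; Interleaved store, stride 4: four <64 x i8> operands are interleaved into a <256 x i8> vector and stored to %p.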
1504 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d, <256 x i8>* %p) {
1505 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
1507 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1508 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm10
1509 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
1510 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1511 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1512 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
1513 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
1514 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1515 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1516 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1517 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
1518 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1519 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
1520 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
1521 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
1522 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
1523 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1524 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
1525 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1526 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
1527 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
1528 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
1529 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1530 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1531 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
1532 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
1533 ; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1534 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1535 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3]
1536 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
1537 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
1538 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1539 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
1540 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
1541 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
1542 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
1543 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1544 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
1545 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
1546 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
1547 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1548 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1549 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1550 ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
1551 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4
1552 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
1553 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
1554 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6
1555 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7
1556 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1557 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
1558 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3
1559 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
1560 ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
1561 ; AVX1-NEXT: vmovaps %ymm7, 160(%rdi)
1562 ; AVX1-NEXT: vmovaps %ymm6, 128(%rdi)
1563 ; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
1564 ; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
1565 ; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
1566 ; AVX1-NEXT: vmovaps %ymm8, (%rdi)
1567 ; AVX1-NEXT: vzeroupper
1568 ; AVX1-NEXT: retq
1569 ;
1570 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
1571 ; AVX2: # %bb.0:
1572 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1573 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
1574 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1575 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
1576 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1577 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
1578 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1579 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
1580 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
1581 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
1582 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
1583 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
1584 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1585 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1586 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1587 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1588 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4
1589 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5
1590 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
1591 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
1592 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7
1593 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9
1594 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
1595 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
1596 ; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi)
1597 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi)
1598 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
1599 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1600 ; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi)
1601 ; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi)
1602 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
1603 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1604 ; AVX2-NEXT: vzeroupper
1605 ; AVX2-NEXT: retq
1606 ;
1607 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
1608 ; AVX512: # %bb.0:
1609 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
1610 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
1611 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
1612 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
1613 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
1614 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
1615 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
1616 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
1617 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
1618 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
1619 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
1620 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
1621 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm3
1622 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1623 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8
1624 ; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm4
1625 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1626 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm9
1627 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
1628 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
1629 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1630 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1631 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm4
1632 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1633 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
1634 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
1635 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
1636 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
1637 ; AVX512-NEXT: vzeroupper
1638 ; AVX512-NEXT: retq
1639 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1640 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1641 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
1642 store <256 x i8> %interleaved, <256 x i8>* %p
1643 ret void
1644 }
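; Splat each element of a loaded <4 x double> twice and store the interleaved <8 x double>.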
1646 define void @splat2_v4f64_load_store(<4 x double>* %s, <8 x double>* %d) {
1647 ; AVX1-LABEL: splat2_v4f64_load_store:
1648 ; AVX1: # %bb.0:
1649 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1650 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1651 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1652 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1653 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1654 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1655 ; AVX1-NEXT: vzeroupper
1656 ; AVX1-NEXT: retq
1657 ;
1658 ; AVX2-LABEL: splat2_v4f64_load_store:
1659 ; AVX2: # %bb.0:
1660 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1661 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1662 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1663 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1664 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1665 ; AVX2-NEXT: vzeroupper
1666 ; AVX2-NEXT: retq
1667 ;
1668 ; AVX512-LABEL: splat2_v4f64_load_store:
1669 ; AVX512: # %bb.0:
1670 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1671 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1672 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1673 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1674 ; AVX512-NEXT: vzeroupper
1675 ; AVX512-NEXT: retq
1676 %x = load <4 x double>, <4 x double>* %s, align 8
1677 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1678 %r = shufflevector <8 x double> %x2, <8 x double> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1679 store <8 x double> %r, <8 x double>* %d, align 8
1680 ret void
1681 }
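; Splat each element of a loaded <4 x i64> twice and store the interleaved <8 x i64>.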
1683 define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) {
1684 ; AVX1-LABEL: splat2_v4i64_load_store:
1685 ; AVX1: # %bb.0:
1686 ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
1687 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
1688 ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
1689 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
1690 ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
1691 ; AVX1-NEXT: vmovupd %ymm1, (%rsi)
1692 ; AVX1-NEXT: vzeroupper
1693 ; AVX1-NEXT: retq
1694 ;
1695 ; AVX2-LABEL: splat2_v4i64_load_store:
1696 ; AVX2: # %bb.0:
1697 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1698 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
1699 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1700 ; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
1701 ; AVX2-NEXT: vmovups %ymm1, (%rsi)
1702 ; AVX2-NEXT: vzeroupper
1703 ; AVX2-NEXT: retq
1704 ;
1705 ; AVX512-LABEL: splat2_v4i64_load_store:
1706 ; AVX512: # %bb.0:
1707 ; AVX512-NEXT: vmovups (%rdi), %ymm0
1708 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
1709 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
1710 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1711 ; AVX512-NEXT: vzeroupper
1712 ; AVX512-NEXT: retq
1713 %x = load <4 x i64>, <4 x i64>* %s, align 8
1714 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1715 %r = shufflevector <8 x i64> %x2, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1716 store <8 x i64> %r, <8 x i64>* %d, align 8
1717 ret void
1718 }
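; Splat each element of a loaded <8 x float> four times and store the interleaved <32 x float>.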
1720 define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
1721 ; AVX1-LABEL: splat4_v8f32_load_store:
1722 ; AVX1: # %bb.0:
1723 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
1724 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
1725 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
1726 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
1727 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
1728 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
1729 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
1730 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
1731 ; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
1732 ; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
1733 ; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
1734 ; AVX1-NEXT: vmovups %xmm4, (%rsi)
1735 ; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
1736 ; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
1737 ; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
1738 ; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
1739 ; AVX1-NEXT: retq
1740 ;
1741 ; AVX2-LABEL: splat4_v8f32_load_store:
1742 ; AVX2: # %bb.0:
1743 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1744 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1745 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1746 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1747 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1748 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1749 ; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1750 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1751 ; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1752 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1753 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1754 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1755 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1756 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1757 ; AVX2-NEXT: vzeroupper
1758 ; AVX2-NEXT: retq
1759 ;
1760 ; AVX512-LABEL: splat4_v8f32_load_store:
1761 ; AVX512: # %bb.0:
1762 ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1763 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1764 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1765 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1766 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1767 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1768 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1769 ; AVX512-NEXT: vzeroupper
1770 ; AVX512-NEXT: retq
1771 %x = load <8 x float>, <8 x float>* %s, align 4
1772 %x2 = shufflevector <8 x float> %x, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1773 %x4 = shufflevector <16 x float> %x2, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1774 %r = shufflevector <32 x float> %x4, <32 x float> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1775 store <32 x float> %r, <32 x float>* %d, align 4
1776 ret void
1777 }
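; Splat each element of a loaded <8 x i32> four times and store the interleaved <32 x i32>.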
1779 define void @splat4_v8i32_load_store(<8 x i32>* %s, <32 x i32>* %d) {
1780 ; AVX1-LABEL: splat4_v8i32_load_store:
1781 ; AVX1: # %bb.0:
1782 ; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
1783 ; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
1784 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1785 ; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1
1786 ; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2
1787 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1788 ; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2
1789 ; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3
1790 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1791 ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3
1792 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4
1793 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1794 ; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
1795 ; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
1796 ; AVX1-NEXT: vmovups %ymm1, 32(%rsi)
1797 ; AVX1-NEXT: vmovups %ymm0, (%rsi)
1798 ; AVX1-NEXT: vzeroupper
1799 ; AVX1-NEXT: retq
1800 ;
1801 ; AVX2-LABEL: splat4_v8i32_load_store:
1802 ; AVX2: # %bb.0:
1803 ; AVX2-NEXT: vmovups (%rdi), %ymm0
1804 ; AVX2-NEXT: vmovups (%rdi), %xmm1
1805 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
1806 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
1807 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
1808 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
1809 ; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
1810 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
1811 ; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
1812 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
1813 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
1814 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
1815 ; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
1816 ; AVX2-NEXT: vmovups %ymm2, (%rsi)
1817 ; AVX2-NEXT: vzeroupper
1818 ; AVX2-NEXT: retq
1819 ;
1820 ; AVX512-LABEL: splat4_v8i32_load_store:
1821 ; AVX512: # %bb.0:
1822 ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3]
1823 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
1824 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
1825 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
1826 ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
1827 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
1828 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
1829 ; AVX512-NEXT: vzeroupper
1830 ; AVX512-NEXT: retq
1831 %x = load <8 x i32>, <8 x i32>* %s, align 4
1832 %x2 = shufflevector <8 x i32> %x, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1833 %x4 = shufflevector <16 x i32> %x2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1834 %r = shufflevector <32 x i32> %x4, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1835 store <32 x i32> %r, <32 x i32>* %d, align 4
1836 ret void
1837 }
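; Splat each element of a loaded <4 x double> four times and store the interleaved <16 x double>.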
1839 define void @splat4_v4f64_load_store(<4 x double>* %s, <16 x double>* %d) {
1840 ; AVX1-LABEL: splat4_v4f64_load_store:
1841 ; AVX1: # %bb.0:
1842 ; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
1843 ; AVX1-NEXT: vbroadcastsd 16(%rdi), %ymm1
1844 ; AVX1-NEXT: vbroadcastsd 8(%rdi), %ymm2
1845 ; AVX1-NEXT: vbroadcastsd 24(%rdi), %ymm3
1846 ; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
1847 ; AVX1-NEXT: vmovups %ymm1, 64(%rsi)
1848 ; AVX1-NEXT: vmovups %ymm2, 32(%rsi)
1849 ; AVX1-NEXT: vmovups %ymm0, (%rsi)
1850 ; AVX1-NEXT: vzeroupper
1851 ; AVX1-NEXT: retq
1852 ;
1853 ; AVX2-LABEL: splat4_v4f64_load_store:
1854 ; AVX2: # %bb.0:
1855 ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
1856 ; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1857 ; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1858 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1859 ; AVX2-NEXT: vmovups %ymm3, 96(%rsi)
1860 ; AVX2-NEXT: vmovups %ymm1, 64(%rsi)
1861 ; AVX2-NEXT: vmovups %ymm2, 32(%rsi)
1862 ; AVX2-NEXT: vmovups %ymm0, (%rsi)
1863 ; AVX2-NEXT: vzeroupper
1864 ; AVX2-NEXT: retq
1865 ;
1866 ; AVX512-LABEL: splat4_v4f64_load_store:
1867 ; AVX512: # %bb.0:
1868 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1869 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1870 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1871 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1872 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1873 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1874 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1875 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1876 ; AVX512-NEXT: vzeroupper
1877 ; AVX512-NEXT: retq
1878 %x = load <4 x double>, <4 x double>* %s, align 8
1879 %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1880 %x4 = shufflevector <8 x double> %x2, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1881 %r = shufflevector <16 x double> %x4, <16 x double> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1882 store <16 x double> %r, <16 x double>* %d, align 8
1883 ret void
1884 }
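; Splat each element of a loaded <4 x i64> four times and store the interleaved <16 x i64>.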
1886 define void @splat4_v4i64_load_store(<4 x i64>* %s, <16 x i64>* %d) {
1887 ; AVX1-LABEL: splat4_v4i64_load_store:
1888 ; AVX1: # %bb.0:
1889 ; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
1890 ; AVX1-NEXT: vbroadcastsd 16(%rdi), %ymm1
1891 ; AVX1-NEXT: vbroadcastsd 8(%rdi), %ymm2
1892 ; AVX1-NEXT: vbroadcastsd 24(%rdi), %ymm3
1893 ; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
1894 ; AVX1-NEXT: vmovups %ymm1, 64(%rsi)
1895 ; AVX1-NEXT: vmovups %ymm2, 32(%rsi)
1896 ; AVX1-NEXT: vmovups %ymm0, (%rsi)
1897 ; AVX1-NEXT: vzeroupper
1898 ; AVX1-NEXT: retq
1899 ;
1900 ; AVX2-LABEL: splat4_v4i64_load_store:
1901 ; AVX2: # %bb.0:
1902 ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
1903 ; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1
1904 ; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2
1905 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3
1906 ; AVX2-NEXT: vmovups %ymm3, 96(%rsi)
1907 ; AVX2-NEXT: vmovups %ymm1, 64(%rsi)
1908 ; AVX2-NEXT: vmovups %ymm2, 32(%rsi)
1909 ; AVX2-NEXT: vmovups %ymm0, (%rsi)
1910 ; AVX2-NEXT: vzeroupper
1911 ; AVX2-NEXT: retq
1912 ;
1913 ; AVX512-LABEL: splat4_v4i64_load_store:
1914 ; AVX512: # %bb.0:
1915 ; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
1916 ; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
1917 ; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2
1918 ; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3
1919 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
1920 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
1921 ; AVX512-NEXT: vmovups %zmm1, 64(%rsi)
1922 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
1923 ; AVX512-NEXT: vzeroupper
1924 ; AVX512-NEXT: retq
1925 %x = load <4 x i64>, <4 x i64>* %s, align 8
1926 %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1927 %x4 = shufflevector <8 x i64> %x2, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1928 %r = shufflevector <16 x i64> %x4, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1929 store <16 x i64> %r, <16 x i64>* %d, align 8
1930 ret void
1931 }
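; PR37616: extract elements 2 and 6 of a <16 x i64> load as a <2 x i64>.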
1933 define <2 x i64> @PR37616(<16 x i64>* %a0) {
1934 ; AVX1-LABEL: PR37616:
1935 ; AVX1: # %bb.0:
1936 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
1937 ; AVX1-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
1938 ; AVX1-NEXT: retq
1939 ;
1940 ; AVX2OR512-LABEL: PR37616:
1941 ; AVX2OR512: # %bb.0:
1942 ; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
1943 ; AVX2OR512-NEXT: vunpcklpd 32(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
1944 ; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
1945 ; AVX2OR512-NEXT: vzeroupper
1946 ; AVX2OR512-NEXT: retq
1947 %load = load <16 x i64>, <16 x i64>* %a0, align 128
1948 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
1949 ret <2 x i64> %shuffle
1950 }