; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512

; These patterns are produced by LoopVectorizer for interleaved stores.
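;
; For reference, the IR below corresponds roughly to a scalar loop of the
; following shape (illustrative only; names are made up):
;
;   for (i = 0; i < vf; ++i) {
;     out[3*i+0] = a[i];
;     out[3*i+1] = b[i];
;     out[3*i+2] = c[i];
;   }
;
; Each test loads three vf-element vectors, concatenates them, and applies the
; stride-3 interleaving mask <0, vf, 2*vf, 1, vf+1, 2*vf+1, ...> before the
; wide store.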
define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride3_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: movapd (%rsi), %xmm1
; SSE-NEXT: movapd (%rdx), %xmm2
; SSE-NEXT: movapd %xmm0, %xmm3
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, 16(%rcx)
; SSE-NEXT: movapd %xmm1, 32(%rcx)
; SSE-NEXT: movapd %xmm3, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride3_vf2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovaps (%rsi), %xmm1
; AVX1-NEXT: vmovaps (%rdx), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX1-NEXT: vmovaps %xmm1, 32(%rcx)
; AVX1-NEXT: vmovapd %ymm0, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i64_stride3_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps (%rsi), %xmm1
; AVX2-NEXT: vmovaps (%rdx), %xmm2
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX2-NEXT: vmovaps %xmm1, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_i64_stride3_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vmovaps (%rdx), %xmm1
; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,1,3,5,u,u>
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
; AVX512-NEXT: vmovaps %ymm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 32
  %in.vec1 = load <2 x i64>, ptr %in.vecptr1, align 32
  %in.vec2 = load <2 x i64>, ptr %in.vecptr2, align 32

  %concat01 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat2u = shufflevector <2 x i64> %in.vec2, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %concat012 = shufflevector <4 x i64> %concat01, <4 x i64> %concat2u, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
  %interleaved.vec = shufflevector <6 x i64> %concat012, <6 x i64> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>

  store <6 x i64> %interleaved.vec, ptr %out.vec, align 32

  ret void
}

define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride3_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps (%rsi), %xmm2
; SSE-NEXT: movaps 16(%rsi), %xmm3
; SSE-NEXT: movaps (%rdx), %xmm4
; SSE-NEXT: movaps 16(%rdx), %xmm5
; SSE-NEXT: movaps %xmm3, %xmm6
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: movaps %xmm0, (%rcx)
; SSE-NEXT: movaps %xmm4, 16(%rcx)
; SSE-NEXT: movaps %xmm3, 32(%rcx)
; SSE-NEXT: movaps %xmm1, 48(%rcx)
; SSE-NEXT: movaps %xmm5, 64(%rcx)
; SSE-NEXT: movaps %xmm6, 80(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride3_vf4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd (%rdi), %ymm0
; AVX1-NEXT: vmovapd (%rsi), %ymm1
; AVX1-NEXT: vmovapd (%rdx), %ymm2
; AVX1-NEXT: vmovapd 16(%rdi), %xmm3
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
; AVX1-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm4
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, (%rdi), %ymm5, %ymm5
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
; AVX1-NEXT: vmovaps %ymm4, (%rcx)
; AVX1-NEXT: vmovapd %ymm3, 64(%rcx)
; AVX1-NEXT: vmovapd %ymm0, 32(%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i64_stride3_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vmovaps (%rdx), %ymm2
; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vbroadcastsd (%rdx), %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_i64_stride3_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa (%rdx), %ymm1
; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, (%rcx)
; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 32
  %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 32
  %in.vec2 = load <4 x i64>, ptr %in.vecptr2, align 32

  %concat01 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat2u = shufflevector <4 x i64> %in.vec2, <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat012 = shufflevector <8 x i64> %concat01, <8 x i64> %concat2u, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %interleaved.vec = shufflevector <12 x i64> %concat012, <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>

  store <12 x i64> %interleaved.vec, ptr %out.vec, align 32

  ret void
}

define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride3_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm3
; SSE-NEXT: movaps 16(%rdi), %xmm2
; SSE-NEXT: movaps 32(%rdi), %xmm13
; SSE-NEXT: movaps 48(%rdi), %xmm12
; SSE-NEXT: movaps (%rsi), %xmm8
; SSE-NEXT: movaps 16(%rsi), %xmm9
; SSE-NEXT: movaps 32(%rsi), %xmm11
; SSE-NEXT: movaps 48(%rsi), %xmm4
; SSE-NEXT: movaps (%rdx), %xmm7
; SSE-NEXT: movaps 16(%rdx), %xmm0
; SSE-NEXT: movaps 32(%rdx), %xmm6
; SSE-NEXT: movaps 48(%rdx), %xmm5
; SSE-NEXT: movaps %xmm4, %xmm10
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0]
; SSE-NEXT: movaps %xmm11, %xmm14
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0]
; SSE-NEXT: movaps %xmm9, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
; SSE-NEXT: movaps %xmm8, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3]
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0]
; SSE-NEXT: movaps %xmm3, (%rcx)
; SSE-NEXT: movaps %xmm7, 16(%rcx)
; SSE-NEXT: movaps %xmm4, 32(%rcx)
; SSE-NEXT: movaps %xmm2, 48(%rcx)
; SSE-NEXT: movaps %xmm0, 64(%rcx)
; SSE-NEXT: movaps %xmm1, 80(%rcx)
; SSE-NEXT: movaps %xmm13, 96(%rcx)
; SSE-NEXT: movaps %xmm6, 112(%rcx)
; SSE-NEXT: movaps %xmm14, 128(%rcx)
; SSE-NEXT: movaps %xmm12, 144(%rcx)
; SSE-NEXT: movaps %xmm5, 160(%rcx)
; SSE-NEXT: movaps %xmm10, 176(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride3_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd 32(%rdi), %ymm0
; AVX1-NEXT: vmovapd (%rdi), %ymm1
; AVX1-NEXT: vmovapd (%rsi), %ymm2
; AVX1-NEXT: vmovapd 32(%rsi), %ymm3
; AVX1-NEXT: vmovapd (%rdx), %ymm4
; AVX1-NEXT: vmovapd 32(%rdx), %ymm5
; AVX1-NEXT: vinsertf128 $1, (%rdx), %ymm1, %ymm6
; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, (%rdi), %ymm7, %ymm7
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
; AVX1-NEXT: vmovapd 48(%rdi), %xmm7
; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm5[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm8 = ymm3[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3],ymm8[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3]
; AVX1-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm8
; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, 32(%rdi), %ymm9, %ymm9
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
; AVX1-NEXT: vmovapd 16(%rdi), %xmm9
; AVX1-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm4[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm10 = ymm2[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm10[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3]
; AVX1-NEXT: vmovapd %ymm1, 32(%rcx)
; AVX1-NEXT: vmovapd %ymm9, 64(%rcx)
; AVX1-NEXT: vmovaps %ymm8, 96(%rcx)
; AVX1-NEXT: vmovapd %ymm7, 160(%rcx)
; AVX1-NEXT: vmovapd %ymm0, 128(%rcx)
; AVX1-NEXT: vmovaps %ymm6, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i64_stride3_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps (%rsi), %ymm2
; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
; AVX2-NEXT: vmovaps (%rdx), %ymm4
; AVX2-NEXT: vmovaps 32(%rdx), %ymm5
; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vbroadcastsd (%rdx), %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm7 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm5, 64(%rcx)
; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-NEXT: vmovaps %ymm3, 160(%rcx)
; AVX2-NEXT: vmovaps %ymm7, 128(%rcx)
; AVX2-NEXT: vmovaps %ymm6, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_i64_stride3_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 (%rsi), %zmm1
; AVX512-NEXT: vmovdqu64 (%rdx), %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,8,u,1,9,u,2,10>
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,3,11,u,4,12,u,5>
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <5,u,14,6,u,15,7,u>
; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm5, 64(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm4, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 32
  %in.vec1 = load <8 x i64>, ptr %in.vecptr1, align 32
  %in.vec2 = load <8 x i64>, ptr %in.vecptr2, align 32

  %concat01 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat2u = shufflevector <8 x i64> %in.vec2, <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat012 = shufflevector <16 x i64> %concat01, <16 x i64> %concat2u, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %interleaved.vec = shufflevector <24 x i64> %concat012, <24 x i64> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>

  store <24 x i64> %interleaved.vec, ptr %out.vec, align 32

  ret void
}

define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride3_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: movapd 64(%rdi), %xmm9
; SSE-NEXT: movapd (%rdi), %xmm3
; SSE-NEXT: movapd 16(%rdi), %xmm13
; SSE-NEXT: movapd 32(%rdi), %xmm8
; SSE-NEXT: movapd 48(%rdi), %xmm10
; SSE-NEXT: movapd 64(%rsi), %xmm12
; SSE-NEXT: movapd (%rsi), %xmm7
; SSE-NEXT: movapd 16(%rsi), %xmm14
; SSE-NEXT: movapd 32(%rsi), %xmm15
; SSE-NEXT: movapd 48(%rsi), %xmm11
; SSE-NEXT: movapd 64(%rdx), %xmm6
; SSE-NEXT: movapd (%rdx), %xmm2
; SSE-NEXT: movapd 16(%rdx), %xmm4
; SSE-NEXT: movapd 32(%rdx), %xmm5
; SSE-NEXT: movapd 48(%rdx), %xmm0
; SSE-NEXT: movapd %xmm3, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0]
; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm13, %xmm3
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm14[0]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm4[0],xmm13[1]
; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1]
; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm8, %xmm13
; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm15[0]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1]
; SSE-NEXT: movapd %xmm10, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
; SSE-NEXT: movapd %xmm9, %xmm14
; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm12[0]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1]
; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
; SSE-NEXT: movapd 80(%rdi), %xmm8
; SSE-NEXT: movapd 80(%rsi), %xmm6
; SSE-NEXT: movapd %xmm8, %xmm9
; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0]
; SSE-NEXT: movapd 80(%rdx), %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
; SSE-NEXT: movapd 96(%rdi), %xmm5
; SSE-NEXT: movapd 96(%rsi), %xmm1
; SSE-NEXT: movapd %xmm5, %xmm7
; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: movapd 96(%rdx), %xmm2
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movapd 112(%rdi), %xmm2
; SSE-NEXT: movapd 112(%rsi), %xmm0
; SSE-NEXT: movapd %xmm2, %xmm3
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movapd 112(%rdx), %xmm4
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE-NEXT: movapd %xmm0, 368(%rcx)
; SSE-NEXT: movapd %xmm2, 352(%rcx)
; SSE-NEXT: movapd %xmm3, 336(%rcx)
; SSE-NEXT: movapd %xmm1, 320(%rcx)
; SSE-NEXT: movapd %xmm5, 304(%rcx)
; SSE-NEXT: movapd %xmm7, 288(%rcx)
; SSE-NEXT: movapd %xmm6, 272(%rcx)
; SSE-NEXT: movapd %xmm8, 256(%rcx)
; SSE-NEXT: movapd %xmm9, 240(%rcx)
; SSE-NEXT: movapd %xmm12, 224(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rcx)
; SSE-NEXT: movapd %xmm14, 192(%rcx)
; SSE-NEXT: movapd %xmm11, 176(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rcx)
; SSE-NEXT: movapd %xmm10, 144(%rcx)
; SSE-NEXT: movapd %xmm15, 128(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rcx)
; SSE-NEXT: movapd %xmm13, 96(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rcx)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rcx)
; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride3_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $40, %rsp
; AVX1-NEXT: vmovapd 32(%rdi), %ymm2
; AVX1-NEXT: vmovapd 64(%rdi), %ymm4
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovapd 32(%rsi), %ymm6
; AVX1-NEXT: vmovapd 64(%rsi), %ymm9
; AVX1-NEXT: vmovapd 96(%rsi), %ymm5
; AVX1-NEXT: vmovapd 32(%rdx), %ymm8
; AVX1-NEXT: vmovapd 64(%rdx), %ymm11
; AVX1-NEXT: vmovapd 96(%rdx), %ymm7
; AVX1-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, (%rdi), %ymm3, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vmovapd 80(%rdi), %xmm3
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm10 = ymm9[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0],ymm3[1],ymm10[2],ymm3[3]
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm10
; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovapd 48(%rdi), %xmm12
; AVX1-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = ymm6[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3],ymm13[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2],ymm12[3]
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm13
; AVX1-NEXT: vmovapd %ymm2, %ymm12
; AVX1-NEXT: vpermilps {{.*#+}} xmm14 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, 32(%rdi), %ymm14, %ymm14
; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
; AVX1-NEXT: vmovapd 112(%rdi), %xmm14
; AVX1-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm7[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm15 = ymm5[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm15[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm15, %ymm15
; AVX1-NEXT: vmovapd 96(%rdi), %ymm2
; AVX1-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7]
; AVX1-NEXT: vmovapd (%rdx), %ymm15
; AVX1-NEXT: vmovapd 16(%rdi), %xmm3
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3]
; AVX1-NEXT: vmovapd (%rsi), %ymm0
; AVX1-NEXT: vpermilpd {{.*#+}} ymm10 = ymm0[0,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm15[2,3],ymm10[2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm9 = ymm9[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[1,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vblendpd $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3]
; AVX1-NEXT: vmovapd %ymm0, 32(%rcx)
; AVX1-NEXT: vmovapd %ymm3, 64(%rcx)
; AVX1-NEXT: vmovaps %ymm1, 288(%rcx)
; AVX1-NEXT: vmovapd %ymm14, 352(%rcx)
; AVX1-NEXT: vmovapd %ymm2, 320(%rcx)
; AVX1-NEXT: vmovaps %ymm13, 96(%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 160(%rcx)
; AVX1-NEXT: vmovapd %ymm6, 128(%rcx)
; AVX1-NEXT: vmovapd %ymm4, 224(%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 192(%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 256(%rcx)
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, (%rcx)
; AVX1-NEXT: addq $40, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i64_stride3_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm9
; AVX2-NEXT: vmovaps 64(%rdi), %ymm7
; AVX2-NEXT: vmovaps 96(%rdi), %ymm5
; AVX2-NEXT: vmovaps (%rsi), %ymm2
; AVX2-NEXT: vmovaps 32(%rsi), %ymm12
; AVX2-NEXT: vmovaps 64(%rsi), %ymm11
; AVX2-NEXT: vmovaps 96(%rsi), %ymm8
; AVX2-NEXT: vmovaps (%rdx), %ymm3
; AVX2-NEXT: vmovaps 32(%rdx), %ymm13
; AVX2-NEXT: vmovaps 64(%rdx), %ymm14
; AVX2-NEXT: vmovaps 96(%rdx), %ymm10
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vbroadcastsd (%rdx), %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3],ymm15[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 64(%rdx), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5],ymm11[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm9[4,5],ymm11[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm12[1],ymm9[3],ymm12[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm13
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5],ymm9[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm13 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm5[4,5],ymm13[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm5[1],ymm8[1],ymm5[3],ymm8[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 96(%rdx), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[2,1,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5],ymm14[6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm10, 64(%rcx)
; AVX2-NEXT: vmovaps %ymm5, 288(%rcx)
; AVX2-NEXT: vmovaps %ymm8, 352(%rcx)
; AVX2-NEXT: vmovaps %ymm13, 320(%rcx)
; AVX2-NEXT: vmovaps %ymm9, 96(%rcx)
; AVX2-NEXT: vmovaps %ymm12, 160(%rcx)
; AVX2-NEXT: vmovaps %ymm11, 128(%rcx)
; AVX2-NEXT: vmovaps %ymm7, 224(%rcx)
; AVX2-NEXT: vmovaps %ymm6, 192(%rcx)
; AVX2-NEXT: vmovaps %ymm4, 256(%rcx)
; AVX2-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_i64_stride3_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqu64 (%rsi), %zmm2
; AVX512-NEXT: vmovdqu64 64(%rsi), %zmm3
; AVX512-NEXT: vmovdqu64 (%rdx), %zmm4
; AVX512-NEXT: vmovdqu64 64(%rdx), %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,u,1,9,u,2,10>
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7]
; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,14,6,u,15,7,u>
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15]
; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,3,11,u,4,12,u,5>
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7]
; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm13
; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1
; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm9
; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm9
; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0
; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm9, 128(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm1, 192(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm13, 256(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm10, 320(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm7, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 32
  %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 32
  %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 32

  %concat01 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat2u = shufflevector <16 x i64> %in.vec2, <16 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat012 = shufflevector <32 x i64> %concat01, <32 x i64> %concat2u, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
  %interleaved.vec = shufflevector <48 x i64> %concat012, <48 x i64> poison, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>

  store <48 x i64> %interleaved.vec, ptr %out.vec, align 32

  ret void
}