1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9
; Swap the two i64 lanes of %a (mask <1, 0>) and concatenate the swapped
; vector (low half) with the unswapped %a (high half), then store the
; <4 x i64> result to %dst. Resulting lane order is [a1, a0, a0, a1].
; The AVX2 and AVX512 expectations below fold both shuffles into a single
; vpermpd with mask [1,0,0,1].
13 define void @concat_a_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
14 ; SSE-LABEL: concat_a_to_shuf_of_a:
16 ; SSE-NEXT: movdqa (%rdi), %xmm0
17 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
18 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
19 ; SSE-NEXT: movdqa %xmm1, (%rsi)
22 ; AVX-LABEL: concat_a_to_shuf_of_a:
24 ; AVX-NEXT: vmovaps (%rdi), %xmm0
25 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
26 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
27 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
28 ; AVX-NEXT: vzeroupper
31 ; AVX2-LABEL: concat_a_to_shuf_of_a:
33 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
34 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
35 ; AVX2-NEXT: vmovaps %ymm0, (%rsi)
36 ; AVX2-NEXT: vzeroupper
39 ; AVX512F-LABEL: concat_a_to_shuf_of_a:
41 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
42 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
43 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
44 ; AVX512F-NEXT: vzeroupper
47 ; AVX512BW-LABEL: concat_a_to_shuf_of_a:
49 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
50 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
51 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
52 ; AVX512BW-NEXT: vzeroupper
54 %a = load <2 x i64>, ptr %a.ptr, align 64
55 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
56 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
57 store <4 x i64> %concat, ptr %dst, align 64
; Mirror of the previous case: the unswapped %a forms the low half and the
; lane-swapped %a (mask <1, 0>) forms the high half of the <4 x i64> stored
; to %dst. Resulting lane order is [a0, a1, a1, a0].
60 define void @concat_shuf_of_a_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
61 ; SSE-LABEL: concat_shuf_of_a_to_a:
63 ; SSE-NEXT: movdqa (%rdi), %xmm0
64 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
65 ; SSE-NEXT: movdqa %xmm0, (%rdx)
66 ; SSE-NEXT: movdqa %xmm1, 16(%rdx)
69 ; AVX-LABEL: concat_shuf_of_a_to_a:
71 ; AVX-NEXT: vmovaps (%rdi), %xmm0
72 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
73 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
74 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
75 ; AVX-NEXT: vzeroupper
78 ; AVX2-LABEL: concat_shuf_of_a_to_a:
80 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
81 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
82 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
83 ; AVX2-NEXT: vzeroupper
86 ; AVX512F-LABEL: concat_shuf_of_a_to_a:
88 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
89 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
90 ; AVX512F-NEXT: vmovaps %ymm0, (%rdx)
91 ; AVX512F-NEXT: vzeroupper
94 ; AVX512BW-LABEL: concat_shuf_of_a_to_a:
96 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
97 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
98 ; AVX512BW-NEXT: vmovaps %ymm0, (%rdx)
99 ; AVX512BW-NEXT: vzeroupper
100 ; AVX512BW-NEXT: retq
101 %a = load <2 x i64>, ptr %a.ptr, align 64
; %b has no use in the visible body; the expected assembly never reads %rsi.
102 %b = load <2 x i64>, ptr %b.ptr, align 64
103 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
104 %concat = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
105 store <4 x i64> %concat, ptr %dst, align 64
; Same [a1, a0, a0, a1] concatenation as concat_a_to_shuf_of_a, but the
; lane-swapped vector is additionally stored to %shuf.escape.ptr, giving it a
; second use. Every expectation below keeps a separate 128-bit shuffle and
; two 128-bit stores to %dst instead of one wide permute.
109 define void @concat_a_to_shuf_of_a_extrause_of_shuf(ptr %a.ptr, ptr %dst, ptr %shuf.escape.ptr) {
110 ; SSE-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
112 ; SSE-NEXT: movdqa (%rdi), %xmm0
113 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
114 ; SSE-NEXT: movdqa %xmm1, (%rdx)
115 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
116 ; SSE-NEXT: movdqa %xmm1, (%rsi)
119 ; AVX-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
121 ; AVX-NEXT: vmovaps (%rdi), %xmm0
122 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
123 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
124 ; AVX-NEXT: vmovaps %xmm0, 16(%rsi)
125 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
128 ; AVX2-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
130 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
131 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
132 ; AVX2-NEXT: vmovaps %xmm1, (%rdx)
133 ; AVX2-NEXT: vmovaps %xmm0, 16(%rsi)
134 ; AVX2-NEXT: vmovaps %xmm1, (%rsi)
137 ; AVX512F-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
139 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
140 ; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
141 ; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
142 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rsi)
143 ; AVX512F-NEXT: vmovaps %xmm1, (%rsi)
146 ; AVX512BW-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
148 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
149 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
150 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
151 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rsi)
152 ; AVX512BW-NEXT: vmovaps %xmm1, (%rsi)
153 ; AVX512BW-NEXT: retq
154 %a = load <2 x i64>, ptr %a.ptr, align 64
155 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; Extra use of %shuffle besides the concatenation below.
156 store <2 x i64> %shuffle, ptr %shuf.escape.ptr, align 64
157 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
158 store <4 x i64> %concat, ptr %dst, align 64
; Two-input shuffle case: %shuffle takes lane 0 of %a and lane 1 of %b
; (mask <0, 3>), giving [a0, b1]. It becomes the low half and %a the high
; half of the <4 x i64> stored to %dst, i.e. [a0, b1, a0, a1].
162 define void @concat_a_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
163 ; SSE2-LABEL: concat_a_to_shuf_of_ab:
165 ; SSE2-NEXT: movapd (%rdi), %xmm0
166 ; SSE2-NEXT: movapd (%rsi), %xmm1
167 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
168 ; SSE2-NEXT: movapd %xmm0, 16(%rdx)
169 ; SSE2-NEXT: movapd %xmm1, (%rdx)
172 ; SSE42-LABEL: concat_a_to_shuf_of_ab:
174 ; SSE42-NEXT: movaps (%rdi), %xmm0
175 ; SSE42-NEXT: movaps (%rsi), %xmm1
176 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
177 ; SSE42-NEXT: movaps %xmm0, 16(%rdx)
178 ; SSE42-NEXT: movaps %xmm1, (%rdx)
181 ; AVX-LABEL: concat_a_to_shuf_of_ab:
183 ; AVX-NEXT: vmovaps (%rdi), %xmm0
184 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
185 ; AVX-NEXT: vmovaps %xmm0, 16(%rdx)
186 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
189 ; AVX2-LABEL: concat_a_to_shuf_of_ab:
191 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
192 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
193 ; AVX2-NEXT: vmovaps %xmm0, 16(%rdx)
194 ; AVX2-NEXT: vmovaps %xmm1, (%rdx)
197 ; AVX512F-LABEL: concat_a_to_shuf_of_ab:
199 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
200 ; AVX512F-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
201 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rdx)
202 ; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
205 ; AVX512BW-LABEL: concat_a_to_shuf_of_ab:
207 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
208 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
209 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rdx)
210 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
211 ; AVX512BW-NEXT: retq
212 %a = load <2 x i64>, ptr %a.ptr, align 64
213 %b = load <2 x i64>, ptr %b.ptr, align 64
214 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
215 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
216 store <4 x i64> %concat, ptr %dst, align 64
; Like concat_a_to_shuf_of_ab, but the high half is %b rather than %a.
; %shuffle is [a0, b1] (mask <0, 3> over %a, %b); the stored <4 x i64> is
; [a0, b1, b0, b1].
219 define void @concat_b_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
220 ; SSE2-LABEL: concat_b_to_shuf_of_ab:
222 ; SSE2-NEXT: movaps (%rsi), %xmm0
223 ; SSE2-NEXT: movaps %xmm0, %xmm1
224 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
225 ; SSE2-NEXT: movaps %xmm0, 16(%rdx)
226 ; SSE2-NEXT: movaps %xmm1, (%rdx)
229 ; SSE42-LABEL: concat_b_to_shuf_of_ab:
231 ; SSE42-NEXT: movaps (%rsi), %xmm0
232 ; SSE42-NEXT: movaps (%rdi), %xmm1
233 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
234 ; SSE42-NEXT: movaps %xmm0, 16(%rdx)
235 ; SSE42-NEXT: movaps %xmm1, (%rdx)
238 ; AVX-LABEL: concat_b_to_shuf_of_ab:
240 ; AVX-NEXT: vmovaps (%rsi), %xmm0
241 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
242 ; AVX-NEXT: vmovaps %xmm0, 16(%rdx)
243 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
246 ; AVX2-LABEL: concat_b_to_shuf_of_ab:
248 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
249 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
250 ; AVX2-NEXT: vmovaps %xmm0, 16(%rdx)
251 ; AVX2-NEXT: vmovaps %xmm1, (%rdx)
254 ; AVX512F-LABEL: concat_b_to_shuf_of_ab:
256 ; AVX512F-NEXT: vmovaps (%rsi), %xmm0
257 ; AVX512F-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
258 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rdx)
259 ; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
262 ; AVX512BW-LABEL: concat_b_to_shuf_of_ab:
264 ; AVX512BW-NEXT: vmovaps (%rsi), %xmm0
265 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
266 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rdx)
267 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
268 ; AVX512BW-NEXT: retq
269 %a = load <2 x i64>, ptr %a.ptr, align 64
270 %b = load <2 x i64>, ptr %b.ptr, align 64
271 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
272 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
273 store <4 x i64> %concat, ptr %dst, align 64
; Operand order flipped relative to concat_a_to_shuf_of_ab: %a is the low
; half and the two-input shuffle [a0, b1] (mask <0, 3> over %a, %b) is the
; high half. The stored <4 x i64> is [a0, a1, a0, b1].
277 define void @concat_shuf_of_ab_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
278 ; SSE2-LABEL: concat_shuf_of_ab_to_a:
280 ; SSE2-NEXT: movapd (%rdi), %xmm0
281 ; SSE2-NEXT: movapd (%rsi), %xmm1
282 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
283 ; SSE2-NEXT: movapd %xmm0, (%rdx)
284 ; SSE2-NEXT: movapd %xmm1, 16(%rdx)
287 ; SSE42-LABEL: concat_shuf_of_ab_to_a:
289 ; SSE42-NEXT: movaps (%rdi), %xmm0
290 ; SSE42-NEXT: movaps (%rsi), %xmm1
291 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
292 ; SSE42-NEXT: movaps %xmm1, 16(%rdx)
293 ; SSE42-NEXT: movaps %xmm0, (%rdx)
296 ; AVX-LABEL: concat_shuf_of_ab_to_a:
298 ; AVX-NEXT: vmovaps (%rdi), %xmm0
299 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
300 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
301 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
304 ; AVX2-LABEL: concat_shuf_of_ab_to_a:
306 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
307 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
308 ; AVX2-NEXT: vmovaps %xmm1, 16(%rdx)
309 ; AVX2-NEXT: vmovaps %xmm0, (%rdx)
312 ; AVX512F-LABEL: concat_shuf_of_ab_to_a:
314 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
315 ; AVX512F-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
316 ; AVX512F-NEXT: vmovaps %xmm1, 16(%rdx)
317 ; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
320 ; AVX512BW-LABEL: concat_shuf_of_ab_to_a:
322 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
323 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
324 ; AVX512BW-NEXT: vmovaps %xmm1, 16(%rdx)
325 ; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
326 ; AVX512BW-NEXT: retq
327 %a = load <2 x i64>, ptr %a.ptr, align 64
328 %b = load <2 x i64>, ptr %b.ptr, align 64
329 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
330 %concat = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
331 store <4 x i64> %concat, ptr %dst, align 64
; %b is the low half and the two-input shuffle [a0, b1] (mask <0, 3> over
; %a, %b) is the high half. The stored <4 x i64> is [b0, b1, a0, b1].
334 define void @concat_shuf_of_ab_to_b(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
335 ; SSE2-LABEL: concat_shuf_of_ab_to_b:
337 ; SSE2-NEXT: movaps (%rsi), %xmm0
338 ; SSE2-NEXT: movaps %xmm0, %xmm1
339 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
340 ; SSE2-NEXT: movaps %xmm1, 16(%rdx)
341 ; SSE2-NEXT: movaps %xmm0, (%rdx)
344 ; SSE42-LABEL: concat_shuf_of_ab_to_b:
346 ; SSE42-NEXT: movaps (%rsi), %xmm0
347 ; SSE42-NEXT: movaps (%rdi), %xmm1
348 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
349 ; SSE42-NEXT: movaps %xmm1, 16(%rdx)
350 ; SSE42-NEXT: movaps %xmm0, (%rdx)
353 ; AVX-LABEL: concat_shuf_of_ab_to_b:
355 ; AVX-NEXT: vmovaps (%rsi), %xmm0
356 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
357 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
358 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
361 ; AVX2-LABEL: concat_shuf_of_ab_to_b:
363 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
364 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
365 ; AVX2-NEXT: vmovaps %xmm1, 16(%rdx)
366 ; AVX2-NEXT: vmovaps %xmm0, (%rdx)
369 ; AVX512F-LABEL: concat_shuf_of_ab_to_b:
371 ; AVX512F-NEXT: vmovaps (%rsi), %xmm0
372 ; AVX512F-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
373 ; AVX512F-NEXT: vmovaps %xmm1, 16(%rdx)
374 ; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
377 ; AVX512BW-LABEL: concat_shuf_of_ab_to_b:
379 ; AVX512BW-NEXT: vmovaps (%rsi), %xmm0
380 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
381 ; AVX512BW-NEXT: vmovaps %xmm1, 16(%rdx)
382 ; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
383 ; AVX512BW-NEXT: retq
384 %a = load <2 x i64>, ptr %a.ptr, align 64
385 %b = load <2 x i64>, ptr %b.ptr, align 64
386 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
387 %concat = shufflevector <2 x i64> %b, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
388 store <4 x i64> %concat, ptr %dst, align 64
; The lane-swapped %a (mask <1, 0>) is the low half and an unrelated vector
; %b is the high half of the <4 x i64> stored to %dst: [a1, a0, b0, b1].
; All expectations below use two independent 128-bit stores.
392 define void @concat_b_to_shuf_of_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
393 ; SSE-LABEL: concat_b_to_shuf_of_a:
395 ; SSE-NEXT: movaps (%rsi), %xmm0
396 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = mem[2,3,0,1]
397 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
398 ; SSE-NEXT: movdqa %xmm1, (%rdx)
401 ; AVX-LABEL: concat_b_to_shuf_of_a:
403 ; AVX-NEXT: vmovaps (%rsi), %xmm0
404 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
405 ; AVX-NEXT: vmovaps %xmm0, 16(%rdx)
406 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
409 ; AVX2-LABEL: concat_b_to_shuf_of_a:
411 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
412 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
413 ; AVX2-NEXT: vmovaps %xmm0, 16(%rdx)
414 ; AVX2-NEXT: vmovaps %xmm1, (%rdx)
417 ; AVX512F-LABEL: concat_b_to_shuf_of_a:
419 ; AVX512F-NEXT: vmovaps (%rsi), %xmm0
420 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
421 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rdx)
422 ; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
425 ; AVX512BW-LABEL: concat_b_to_shuf_of_a:
427 ; AVX512BW-NEXT: vmovaps (%rsi), %xmm0
428 ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
429 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rdx)
430 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
431 ; AVX512BW-NEXT: retq
432 %a = load <2 x i64>, ptr %a.ptr, align 64
433 %b = load <2 x i64>, ptr %b.ptr, align 64
434 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
435 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
436 store <4 x i64> %concat, ptr %dst, align 64
; An unrelated vector %b is the low half and the lane-swapped %a
; (mask <1, 0>) is the high half of the <4 x i64> stored to %dst:
; [b0, b1, a1, a0]. All expectations below use two 128-bit stores.
439 define void @concat_shuf_of_a_to_b(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
440 ; SSE-LABEL: concat_shuf_of_a_to_b:
442 ; SSE-NEXT: movaps (%rsi), %xmm0
443 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = mem[2,3,0,1]
444 ; SSE-NEXT: movdqa %xmm1, 16(%rdx)
445 ; SSE-NEXT: movaps %xmm0, (%rdx)
448 ; AVX-LABEL: concat_shuf_of_a_to_b:
450 ; AVX-NEXT: vmovaps (%rsi), %xmm0
451 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
452 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
453 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
456 ; AVX2-LABEL: concat_shuf_of_a_to_b:
458 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
459 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
460 ; AVX2-NEXT: vmovaps %xmm1, 16(%rdx)
461 ; AVX2-NEXT: vmovaps %xmm0, (%rdx)
464 ; AVX512F-LABEL: concat_shuf_of_a_to_b:
466 ; AVX512F-NEXT: vmovaps (%rsi), %xmm0
467 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
468 ; AVX512F-NEXT: vmovaps %xmm1, 16(%rdx)
469 ; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
472 ; AVX512BW-LABEL: concat_shuf_of_a_to_b:
474 ; AVX512BW-NEXT: vmovaps (%rsi), %xmm0
475 ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
476 ; AVX512BW-NEXT: vmovaps %xmm1, 16(%rdx)
477 ; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
478 ; AVX512BW-NEXT: retq
479 %a = load <2 x i64>, ptr %a.ptr, align 64
480 %b = load <2 x i64>, ptr %b.ptr, align 64
481 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
482 %concat = shufflevector <2 x i64> %b, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
483 store <4 x i64> %concat, ptr %dst, align 64
; The lane-swapped %a (mask <1, 0>) is the low half and poison is the high
; half of the <4 x i64> stored to %dst. Every expectation below writes only
; the low 16 bytes at (%rsi); nothing is stored for the poison half.
487 define void @concat_poison_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
488 ; SSE-LABEL: concat_poison_to_shuf_of_a:
490 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
491 ; SSE-NEXT: movdqa %xmm0, (%rsi)
494 ; AVX-LABEL: concat_poison_to_shuf_of_a:
496 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
497 ; AVX-NEXT: vmovaps %xmm0, (%rsi)
500 ; AVX2-LABEL: concat_poison_to_shuf_of_a:
502 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
503 ; AVX2-NEXT: vmovaps %xmm0, (%rsi)
506 ; AVX512F-LABEL: concat_poison_to_shuf_of_a:
508 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
509 ; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
512 ; AVX512BW-LABEL: concat_poison_to_shuf_of_a:
514 ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
515 ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
516 ; AVX512BW-NEXT: retq
517 %a = load <2 x i64>, ptr %a.ptr, align 64
518 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
519 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
520 store <4 x i64> %concat, ptr %dst, align 64
; Poison is the low half and the lane-swapped %a (mask <1, 0>) is the high
; half of the <4 x i64> stored to %dst. Every expectation below writes only
; the 16 bytes at offset 16 of the destination.
523 define void @concat_shuf_of_a_to_poison(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
524 ; SSE-LABEL: concat_shuf_of_a_to_poison:
526 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
527 ; SSE-NEXT: movdqa %xmm0, 16(%rdx)
530 ; AVX-LABEL: concat_shuf_of_a_to_poison:
532 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
533 ; AVX-NEXT: vmovaps %xmm0, 16(%rdx)
536 ; AVX2-LABEL: concat_shuf_of_a_to_poison:
538 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
539 ; AVX2-NEXT: vmovaps %xmm0, 16(%rdx)
542 ; AVX512F-LABEL: concat_shuf_of_a_to_poison:
544 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
545 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rdx)
548 ; AVX512BW-LABEL: concat_shuf_of_a_to_poison:
550 ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
551 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rdx)
552 ; AVX512BW-NEXT: retq
553 %a = load <2 x i64>, ptr %a.ptr, align 64
; %b has no use in the visible body.
554 %b = load <2 x i64>, ptr %b.ptr, align 64
555 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
556 %concat = shufflevector <2 x i64> poison, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
557 store <4 x i64> %concat, ptr %dst, align 64
; The lane-swapped %a (mask <1, 0>) is used for both halves of the
; <4 x i64> stored to %dst, giving [a1, a0, a1, a0]. The AVX2 and AVX512
; expectations below use a single vpermpd with mask [1,0,1,0].
561 define void @concat_shuf_of_a_to_itself(ptr %a.ptr, ptr %dst) {
562 ; SSE-LABEL: concat_shuf_of_a_to_itself:
564 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
565 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
566 ; SSE-NEXT: movdqa %xmm0, (%rsi)
569 ; AVX-LABEL: concat_shuf_of_a_to_itself:
571 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
572 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
573 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
574 ; AVX-NEXT: vzeroupper
577 ; AVX2-LABEL: concat_shuf_of_a_to_itself:
579 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
580 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
581 ; AVX2-NEXT: vmovaps %ymm0, (%rsi)
582 ; AVX2-NEXT: vzeroupper
585 ; AVX512F-LABEL: concat_shuf_of_a_to_itself:
587 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
588 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
589 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
590 ; AVX512F-NEXT: vzeroupper
593 ; AVX512BW-LABEL: concat_shuf_of_a_to_itself:
595 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
596 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
597 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
598 ; AVX512BW-NEXT: vzeroupper
599 ; AVX512BW-NEXT: retq
600 %a = load <2 x i64>, ptr %a.ptr, align 64
601 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
602 %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
603 store <4 x i64> %concat, ptr %dst, align 64
; Four-way concatenation into <8 x i64>: the lane-swapped %a (mask <1, 0>)
; is the lowest 128-bit chunk, followed by three copies of the unswapped %a.
; %concat01 = [a1, a0, a0, a1], %concat23 = [a0, a1, a0, a1], and the value
; stored to %dst is %concat01 followed by %concat23.
607 define void @concat_aaa_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
608 ; SSE-LABEL: concat_aaa_to_shuf_of_a:
610 ; SSE-NEXT: movdqa (%rdi), %xmm0
611 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
612 ; SSE-NEXT: movdqa %xmm0, 32(%rsi)
613 ; SSE-NEXT: movdqa %xmm0, 48(%rsi)
614 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
615 ; SSE-NEXT: movdqa %xmm1, (%rsi)
618 ; AVX-LABEL: concat_aaa_to_shuf_of_a:
620 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
621 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
622 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
623 ; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
624 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
625 ; AVX-NEXT: vzeroupper
628 ; AVX2-LABEL: concat_aaa_to_shuf_of_a:
630 ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
631 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,1]
632 ; AVX2-NEXT: vmovaps %ymm0, 32(%rsi)
633 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
634 ; AVX2-NEXT: vzeroupper
637 ; AVX512F-LABEL: concat_aaa_to_shuf_of_a:
639 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
640 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
642 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
643 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi)
644 ; AVX512F-NEXT: vzeroupper
647 ; AVX512BW-LABEL: concat_aaa_to_shuf_of_a:
649 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
650 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
651 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
652 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
653 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
654 ; AVX512BW-NEXT: vzeroupper
655 ; AVX512BW-NEXT: retq
656 %a = load <2 x i64>, ptr %a.ptr, align 64
657 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
658 %concat01 = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
659 %concat23 = shufflevector <2 x i64> %a, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
660 %concat = shufflevector <4 x i64> %concat01, <4 x i64> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
661 store <8 x i64> %concat, ptr %dst, align 64
; Four-way concatenation into <8 x i64>: three copies of the unswapped %a
; followed by the lane-swapped %a (mask <1, 0>) as the highest 128-bit chunk.
; %concat01 = [a0, a1, a0, a1], %concat23 = [a0, a1, a1, a0], and the value
; stored to %dst is %concat01 followed by %concat23.
664 define void @concat_shuf_of_a_to_aaa(ptr %a.ptr, ptr %dst) {
665 ; SSE-LABEL: concat_shuf_of_a_to_aaa:
667 ; SSE-NEXT: movdqa (%rdi), %xmm0
668 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
669 ; SSE-NEXT: movdqa %xmm0, 32(%rsi)
670 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
671 ; SSE-NEXT: movdqa %xmm0, (%rsi)
672 ; SSE-NEXT: movdqa %xmm1, 48(%rsi)
675 ; AVX-LABEL: concat_shuf_of_a_to_aaa:
677 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
678 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
679 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
680 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
681 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
682 ; AVX-NEXT: vzeroupper
685 ; AVX2-LABEL: concat_shuf_of_a_to_aaa:
687 ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
688 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,1,1,0]
689 ; AVX2-NEXT: vmovaps %ymm0, (%rsi)
690 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
691 ; AVX2-NEXT: vzeroupper
694 ; AVX512F-LABEL: concat_shuf_of_a_to_aaa:
696 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
697 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
698 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
699 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
700 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi)
701 ; AVX512F-NEXT: vzeroupper
704 ; AVX512BW-LABEL: concat_shuf_of_a_to_aaa:
706 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
707 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
708 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
709 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
710 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
711 ; AVX512BW-NEXT: vzeroupper
712 ; AVX512BW-NEXT: retq
713 %a = load <2 x i64>, ptr %a.ptr, align 64
714 %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
715 %concat01 = shufflevector <2 x i64> %a, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
716 %concat23 = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
717 %concat = shufflevector <4 x i64> %concat01, <4 x i64> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
718 store <8 x i64> %concat, ptr %dst, align 64
721 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
724 ; AVX2-FAST-PERLANE: {{.*}}
726 ; AVX512BW-FAST: {{.*}}
727 ; AVX512BW-SLOW: {{.*}}
728 ; AVX512F-FAST: {{.*}}
729 ; AVX512F-SLOW: {{.*}}