; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
; Convert <4 x float> to <4 x i8> via fptosi and a two-step truncation
; (i32 -> i16 -> i8) routed through widening/narrowing shuffles, then
; overwrite lane 3 with -1 (0xFF) before the 4-byte store through %out.
define void @foo(<4 x float> %in, ptr %out) {
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; SSE2-NEXT: movl %eax, (%rdi)
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
  %t0 = fptosi <4 x float> %in to <4 x i32>
  ; First narrowing step; widen to 8 lanes (upper lanes undef) so the
  ; next trunc operates on <8 x i16>.
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Second narrowing step; shrink back to the 4 lanes actually stored.
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Force the top byte of the stored dword to 0xFF.
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, ptr %out
  ret void
}
; Two self-concats of <4 x i64> followed by an interleaving shuffle; the
; net effect broadcasts each input element into 4 adjacent lanes of the
; <16 x i64> result, which the backend should lower to broadcasts/permutes.
define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm1, 112(%rdi)
; SSE-NEXT: movdqa %xmm1, 96(%rdi)
; SSE-NEXT: movdqa %xmm3, 80(%rdi)
; SSE-NEXT: movdqa %xmm3, 64(%rdi)
; SSE-NEXT: movdqa %xmm0, 48(%rdi)
; SSE-NEXT: movdqa %xmm0, 32(%rdi)
; SSE-NEXT: movdqa %xmm2, 16(%rdi)
; SSE-NEXT: movdqa %xmm2, (%rdi)
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2]
; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
; AVX1-NEXT: vmovaps %ymm4, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512F-LABEL: catcat:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT: vmovaps %zmm2, %zmm0
  ; x ++ x, then (x ++ x) ++ (x ++ x): every source lane appears 4 times.
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Interleave so the 4 copies of each element become adjacent.
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}
; Same lane-broadcast pattern as the function above, but the source vector
; is loaded from memory, so the backend can fold the shuffles into
; element broadcasts directly from the load address.
define <16 x i64> @load_catcat(ptr %p) {
; SSE-LABEL: load_catcat:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movdqa (%rsi), %xmm0
; SSE-NEXT: movdqa 16(%rsi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm1, 112(%rdi)
; SSE-NEXT: movdqa %xmm1, 96(%rdi)
; SSE-NEXT: movdqa %xmm3, 80(%rdi)
; SSE-NEXT: movdqa %xmm3, 64(%rdi)
; SSE-NEXT: movdqa %xmm0, 48(%rdi)
; SSE-NEXT: movdqa %xmm0, 32(%rdi)
; SSE-NEXT: movdqa %xmm2, 16(%rdi)
; SSE-NEXT: movdqa %xmm2, (%rdi)
; AVX1-LABEL: load_catcat:
; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT: vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT: vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT: vbroadcastsd 24(%rdi), %ymm3
; AVX2-LABEL: load_catcat:
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3
; AVX512F-LABEL: load_catcat:
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
  %x = load <4 x i64>, ptr %p
  ; x ++ x, then a second self-concat, then interleave: each loaded
  ; element ends up replicated into 4 adjacent result lanes.
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}
; Use weird types to make sure we do not miscompile a case where
; the size of the source ops is not an even multiple of the result size.
158 define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
159 ; SSE-LABEL: cat_ext_straddle:
161 ; SSE-NEXT: movaps 16(%rdi), %xmm0
162 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
165 ; AVX-LABEL: cat_ext_straddle:
167 ; AVX-NEXT: vmovaps 16(%rdi), %xmm0
168 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
170 %x = load <6 x i32>, ptr %px
171 %y = load <6 x i32>, ptr %py
172 %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
173 %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>