; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
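
; fptosi of <4 x float>, truncated to <4 x i8> through concat/extract
; shuffles, with the final element replaced by -1 before the store.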
define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2:       # %bb.0:
; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: foo:
; SSE42:       # %bb.0:
; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movl $255, %eax
; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
; SSE42-NEXT:    movd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: foo:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    movl $255, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %t0 = fptosi <4 x float> %in to <4 x i32>
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, ptr %out
  ret void
}
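
; Concatenating a <4 x i64> with itself twice, then applying an interleaving
; mask, splats each source element into 4 adjacent lanes of the <16 x i64>
; result.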
define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-LABEL: catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
; AVX1-NEXT:    vmovaps %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
; AVX512F-NEXT:    retq
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}
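
; Same splat pattern as @catcat, but the source is loaded from memory, so
; the splats can be lowered as broadcast loads on AVX.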
define <16 x i64> @load_catcat(ptr %p) {
; SSE-LABEL: load_catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
  %x = load <4 x i64>, ptr %p
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

; Use weird types to make sure we do not miscompile a case where
; the source ops are not an even multiple size of the result.
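; The extracted <4 x i32> is x[4..5] followed by y[0..1], straddling the
; boundary between the two 6-element sources.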

define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
; SSE-LABEL: cat_ext_straddle:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 16(%rdi), %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: cat_ext_straddle:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %x = load <6 x i32>, ptr %px
  %y = load <6 x i32>, ptr %py
  %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %ext
}