; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
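
; Zero-extension through PSHUFB: the mask keeps bytes 0-3, zeroes bytes 4-7 and
; 12-15 (index -1) and leaves bytes 8-11 undef, so the shuffle should combine
; to a single (v)pmovzxdq.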
define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movzx:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_movzx:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 undef, i8 undef, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %res0
}
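
; Blend of two permuted vectors: both <2 x i64> inputs have their halves
; swapped before a <4 x i32> blend; the permutes should fold so that only the
; blend plus one shuffle of the result (a single vpermt2d on AVX512) remains.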
define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: combine_blend_of_permutes_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_blend_of_permutes_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_blend_of_permutes_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_blend_of_permutes_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,19,0,17]
; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %x0 = bitcast <2 x i64> %s0 to <4 x i32>
  %x1 = bitcast <2 x i64> %s1 to <4 x i32>
  %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %r
}
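
; PR50049: multiply of two <16 x i8> vectors built by taking every third byte
; of two <48 x i8> loads.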
define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; SSE-LABEL: PR50049:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa (%rsi), %xmm4
; SSE-NEXT: movdqa 16(%rsi), %xmm5
; SSE-NEXT: movdqa 32(%rsi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u]
; SSE-NEXT: pshufb %xmm6, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u]
; SSE-NEXT: pshufb %xmm7, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
; SSE-NEXT: pshufb %xmm3, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,1,4,7,10,13]
; SSE-NEXT: pshufb %xmm8, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufb %xmm6, %xmm5
; SSE-NEXT: pshufb %xmm7, %xmm4
; SSE-NEXT: por %xmm5, %xmm4
; SSE-NEXT: pshufb %xmm3, %xmm4
; SSE-NEXT: pshufb %xmm8, %xmm1
; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmaddubsw %xmm3, %xmm4
; SSE-NEXT: pand %xmm2, %xmm4
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pmaddubsw %xmm2, %xmm0
; SSE-NEXT: psllw $8, %xmm0
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: PR50049:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR50049:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rsi), %xmm5
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR50049:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa (%rsi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm5
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %x1 = load <48 x i8>, ptr %p1, align 16
  %x2 = load <48 x i8>, ptr %p2, align 16
  %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i8> %x2, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %r = mul <16 x i8> %s1, %s2
  ret <16 x i8> %r
}