; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

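; With an immediate mask of zero, every lane comes from the first operand, so
; each of these calls should fold to %a0 and the functions compile to a bare retq.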
define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
  ret <2 x double> %1
}

define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
  ret <4 x float> %1
}

define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
  ret <8 x i16> %1
}

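; With an all-ones immediate, every lane comes from the second operand, so
; these calls fold to a plain register move of %a1.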
define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
  ret <2 x double> %1
}

define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
  ret <4 x float> %1
}

define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test2_x86_sse41_pblend_w:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_pblend_w:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
  ret <8 x i16> %1
}

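; Both operands are the same value here, so the blend returns %a0 regardless
; of the immediate.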
define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
  ret <2 x double> %1
}

define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
  ret <4 x float> %1
}

define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
; CHECK-LABEL: test3_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
  ret <8 x i16> %1
}

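; Only lane 0 of each result is ultimately used, so the splat shuffles feeding
; the blendv are redundant and demanded-elements analysis should remove them,
; leaving just the blendv (plus, for pblendvb, the final splat of the result).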
define double @demandedelts_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: demandedelts_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  %2 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %3 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %3)
  %5 = extractelement <2 x double> %4, i32 0
  ret double %5
}

define float @demandedelts_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: demandedelts_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %3 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %3)
  %5 = extractelement <4 x float> %4, i32 0
  ret float %5
}

define <16 x i8> @demandedelts_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: demandedelts_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedelts_pblendvb:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedelts_pblendvb:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer
  %3 = shufflevector <16 x i8> %a2, <16 x i8> undef, <16 x i32> zeroinitializer
  %4 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %3)
  %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %5
}

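; blendvps tests only the sign bit of each mask lane, and sitofp preserves the
; sign of its input, so only the sign bits of %a2 are demanded through the
; conversion.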
define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_sitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_sitofp_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

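; uitofp never produces a negative result, so the mask's sign bits are known
; zero and the blend should in principle always select %a0; the checks show
; that codegen currently still emits the full uitofp expansion and the blend.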
define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_uitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE-NEXT:    psrld $16, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedbits_uitofp_blendvps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedbits_uitofp_blendvps:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; AVX2-NEXT:    vsubps %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

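; The final lshr by 11 discards the low bits of each lane, so the bits set by
; the two ors (bits 0 and 2) are never demanded by the result.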
define <2 x i64> @demandedbits_blendvpd(i64 %a0, i64 %a2, <2 x double> %a3) {
; SSE-LABEL: demandedbits_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    orq $1, %rax
; SSE-NEXT:    orq $4, %rdi
; SSE-NEXT:    movq %rax, %xmm1
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
; SSE-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE-NEXT:    psrlq $11, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    orq $1, %rax
; AVX-NEXT:    orq $4, %rdi
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vmovq %rdi, %xmm2
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    vpsrlq $11, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or i64 %a0, 1
  %2 = or i64 %a0, 4
  %3 = bitcast i64 %1 to double
  %4 = bitcast i64 %2 to double
  %5 = insertelement <2 x double> zeroinitializer, double %3, i32 0
  %6 = insertelement <2 x double> zeroinitializer, double %4, i32 0
  %7 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %5, <2 x double> %6, <2 x double> %a3)
  %8 = bitcast <2 x double> %7 to <2 x i64>
  %9 = lshr <2 x i64> %8, <i64 11, i64 11>
  ret <2 x i64> %9
}

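; NOT of the mask flips every sign bit, so rather than materializing the xor,
; the blend can simply commute its two data operands.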
define <16 x i8> @xor_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: xor_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_pblendvb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = xor <16 x i8> %a2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %2 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %1)
  ret <16 x i8> %2
}

define <4 x float> @xor_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: xor_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <4 x float>
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %3)
  ret <4 x float> %4
}

define <2 x double> @xor_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: xor_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <2 x double> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <2 x double>
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %3)
  ret <2 x double> %4
}

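; PR47404: "icmp sgt %2, -1" is true exactly when the sign bit of %2 is clear,
; so the select is a sign-bit select and lowers to a single (v)pblendvb.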
define <16 x i8> @PR47404(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SSE-LABEL: PR47404:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR47404:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %4 = icmp sgt <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = select <16 x i1> %4, <16 x i8> %0, <16 x i8> %1
  ret <16 x i8> %5
}

declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32)

declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)