1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
11 ; Ensure that the backend no longer emits unnecessary vector insert
12 ; instructions immediately after SSE scalar fp instructions
13 ; like addss or mulss.
; Scalar fadd of lane 0 of %a and lane 0 of %b, with the sum written back
; into lane 0 of %a. The checks expect one addss (vaddss on AVX targets)
; immediately followed by the return.
15 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
16 ; SSE-LABEL: test_add_ss:
18 ; SSE-NEXT: addss %xmm1, %xmm0
19 ; SSE-NEXT: ret{{[l|q]}}
21 ; AVX-LABEL: test_add_ss:
23 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
24 ; AVX-NEXT: ret{{[l|q]}}
25 %1 = extractelement <4 x float> %b, i32 0
26 %2 = extractelement <4 x float> %a, i32 0
27 %add = fadd float %2, %1
28 %3 = insertelement <4 x float> %a, float %add, i32 0
; Scalar fsub (lane 0 of %a minus lane 0 of %b), with the difference written
; back into lane 0 of %a. The checks expect one subss (vsubss on AVX targets)
; immediately followed by the return.
32 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
33 ; SSE-LABEL: test_sub_ss:
35 ; SSE-NEXT: subss %xmm1, %xmm0
36 ; SSE-NEXT: ret{{[l|q]}}
38 ; AVX-LABEL: test_sub_ss:
40 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
41 ; AVX-NEXT: ret{{[l|q]}}
42 %1 = extractelement <4 x float> %b, i32 0
43 %2 = extractelement <4 x float> %a, i32 0
44 %sub = fsub float %2, %1
45 %3 = insertelement <4 x float> %a, float %sub, i32 0
; Scalar fmul of lane 0 of %a and lane 0 of %b, with the product written back
; into lane 0 of %a. The checks expect one mulss (vmulss on AVX targets)
; immediately followed by the return.
49 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
50 ; SSE-LABEL: test_mul_ss:
52 ; SSE-NEXT: mulss %xmm1, %xmm0
53 ; SSE-NEXT: ret{{[l|q]}}
55 ; AVX-LABEL: test_mul_ss:
57 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
58 ; AVX-NEXT: ret{{[l|q]}}
59 %1 = extractelement <4 x float> %b, i32 0
60 %2 = extractelement <4 x float> %a, i32 0
61 %mul = fmul float %2, %1
62 %3 = insertelement <4 x float> %a, float %mul, i32 0
; Scalar fdiv (lane 0 of %a divided by lane 0 of %b), with the quotient
; written back into lane 0 of %a. The checks expect one divss (vdivss on AVX
; targets) immediately followed by the return.
66 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
67 ; SSE-LABEL: test_div_ss:
69 ; SSE-NEXT: divss %xmm1, %xmm0
70 ; SSE-NEXT: ret{{[l|q]}}
72 ; AVX-LABEL: test_div_ss:
74 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
75 ; AVX-NEXT: ret{{[l|q]}}
76 %1 = extractelement <4 x float> %b, i32 0
77 %2 = extractelement <4 x float> %a, i32 0
78 %div = fdiv float %2, %1
79 %3 = insertelement <4 x float> %a, float %div, i32 0
; Calls llvm.sqrt.f32 on lane 0 of %a and writes the result back into lane 0
; of %a. The checks expect one sqrtss (vsqrtss on AVX targets) immediately
; followed by the return.
83 define <4 x float> @test_sqrt_ss(<4 x float> %a) {
84 ; SSE-LABEL: test_sqrt_ss:
86 ; SSE-NEXT: sqrtss %xmm0, %xmm0
87 ; SSE-NEXT: ret{{[l|q]}}
89 ; AVX-LABEL: test_sqrt_ss:
91 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
92 ; AVX-NEXT: ret{{[l|q]}}
93 %1 = extractelement <4 x float> %a, i32 0
94 %2 = call float @llvm.sqrt.f32(float %1)
95 %3 = insertelement <4 x float> %a, float %2, i32 0
; Declaration of the float square-root intrinsic called by test_sqrt_ss.
98 declare float @llvm.sqrt.f32(float)
; Scalar fadd of lane 0 of %a and lane 0 of %b (double), with the sum written
; back into lane 0 of %a. The checks expect one addsd (vaddsd on AVX targets)
; immediately followed by the return.
100 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
101 ; SSE-LABEL: test_add_sd:
103 ; SSE-NEXT: addsd %xmm1, %xmm0
104 ; SSE-NEXT: ret{{[l|q]}}
106 ; AVX-LABEL: test_add_sd:
108 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
109 ; AVX-NEXT: ret{{[l|q]}}
110 %1 = extractelement <2 x double> %b, i32 0
111 %2 = extractelement <2 x double> %a, i32 0
112 %add = fadd double %2, %1
113 %3 = insertelement <2 x double> %a, double %add, i32 0
; Scalar fsub (lane 0 of %a minus lane 0 of %b, double), with the difference
; written back into lane 0 of %a. The checks expect one subsd (vsubsd on AVX
; targets) immediately followed by the return.
117 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
118 ; SSE-LABEL: test_sub_sd:
120 ; SSE-NEXT: subsd %xmm1, %xmm0
121 ; SSE-NEXT: ret{{[l|q]}}
123 ; AVX-LABEL: test_sub_sd:
125 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
126 ; AVX-NEXT: ret{{[l|q]}}
127 %1 = extractelement <2 x double> %b, i32 0
128 %2 = extractelement <2 x double> %a, i32 0
129 %sub = fsub double %2, %1
130 %3 = insertelement <2 x double> %a, double %sub, i32 0
; Scalar fmul of lane 0 of %a and lane 0 of %b (double), with the product
; written back into lane 0 of %a. The checks expect one mulsd (vmulsd on AVX
; targets) immediately followed by the return.
134 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
135 ; SSE-LABEL: test_mul_sd:
137 ; SSE-NEXT: mulsd %xmm1, %xmm0
138 ; SSE-NEXT: ret{{[l|q]}}
140 ; AVX-LABEL: test_mul_sd:
142 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
143 ; AVX-NEXT: ret{{[l|q]}}
144 %1 = extractelement <2 x double> %b, i32 0
145 %2 = extractelement <2 x double> %a, i32 0
146 %mul = fmul double %2, %1
147 %3 = insertelement <2 x double> %a, double %mul, i32 0
; Scalar fdiv (lane 0 of %a divided by lane 0 of %b, double), with the
; quotient written back into lane 0 of %a. The checks expect one divsd
; (vdivsd on AVX targets) immediately followed by the return.
151 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
152 ; SSE-LABEL: test_div_sd:
154 ; SSE-NEXT: divsd %xmm1, %xmm0
155 ; SSE-NEXT: ret{{[l|q]}}
157 ; AVX-LABEL: test_div_sd:
159 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
160 ; AVX-NEXT: ret{{[l|q]}}
161 %1 = extractelement <2 x double> %b, i32 0
162 %2 = extractelement <2 x double> %a, i32 0
163 %div = fdiv double %2, %1
164 %3 = insertelement <2 x double> %a, double %div, i32 0
; Calls llvm.sqrt.f64 on lane 0 of %a and writes the result back into lane 0
; of %a. The checks expect one sqrtsd (vsqrtsd on AVX targets) immediately
; followed by the return.
168 define <2 x double> @test_sqrt_sd(<2 x double> %a) {
169 ; SSE-LABEL: test_sqrt_sd:
171 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
172 ; SSE-NEXT: ret{{[l|q]}}
174 ; AVX-LABEL: test_sqrt_sd:
176 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
177 ; AVX-NEXT: ret{{[l|q]}}
178 %1 = extractelement <2 x double> %a, i32 0
179 %2 = call double @llvm.sqrt.f64(double %1)
180 %3 = insertelement <2 x double> %a, double %2, i32 0
; Declaration of the double square-root intrinsic called by test_sqrt_sd.
183 declare double @llvm.sqrt.f64(double)
; Scalar fadd of lane 0 of %a and lane 0 of %b, with the sum inserted into
; lane 0 of %b (the second argument). The SSE checks expect addss targeting
; %xmm1 followed by a movaps copy into %xmm0; the AVX checks expect a single
; three-operand vaddss.
185 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
186 ; SSE-LABEL: test2_add_ss:
188 ; SSE-NEXT: addss %xmm0, %xmm1
189 ; SSE-NEXT: movaps %xmm1, %xmm0
190 ; SSE-NEXT: ret{{[l|q]}}
192 ; AVX-LABEL: test2_add_ss:
194 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
195 ; AVX-NEXT: ret{{[l|q]}}
196 %1 = extractelement <4 x float> %a, i32 0
197 %2 = extractelement <4 x float> %b, i32 0
198 %add = fadd float %1, %2
199 %3 = insertelement <4 x float> %b, float %add, i32 0
; Scalar fsub (lane 0 of %b minus lane 0 of %a), with the difference inserted
; into lane 0 of %b. The SSE checks expect subss targeting %xmm1 followed by a
; movaps copy into %xmm0; the AVX checks expect a single three-operand vsubss.
203 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
204 ; SSE-LABEL: test2_sub_ss:
206 ; SSE-NEXT: subss %xmm0, %xmm1
207 ; SSE-NEXT: movaps %xmm1, %xmm0
208 ; SSE-NEXT: ret{{[l|q]}}
210 ; AVX-LABEL: test2_sub_ss:
212 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
213 ; AVX-NEXT: ret{{[l|q]}}
214 %1 = extractelement <4 x float> %a, i32 0
215 %2 = extractelement <4 x float> %b, i32 0
216 %sub = fsub float %2, %1
217 %3 = insertelement <4 x float> %b, float %sub, i32 0
; Scalar fmul of lane 0 of %a and lane 0 of %b, with the product inserted into
; lane 0 of %b. The SSE checks expect mulss targeting %xmm1 followed by a
; movaps copy into %xmm0; the AVX checks expect a single three-operand vmulss.
221 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
222 ; SSE-LABEL: test2_mul_ss:
224 ; SSE-NEXT: mulss %xmm0, %xmm1
225 ; SSE-NEXT: movaps %xmm1, %xmm0
226 ; SSE-NEXT: ret{{[l|q]}}
228 ; AVX-LABEL: test2_mul_ss:
230 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
231 ; AVX-NEXT: ret{{[l|q]}}
232 %1 = extractelement <4 x float> %a, i32 0
233 %2 = extractelement <4 x float> %b, i32 0
234 %mul = fmul float %1, %2
235 %3 = insertelement <4 x float> %b, float %mul, i32 0
; Scalar fdiv (lane 0 of %b divided by lane 0 of %a), with the quotient
; inserted into lane 0 of %b. The SSE checks expect divss targeting %xmm1
; followed by a movaps copy into %xmm0; the AVX checks expect a single
; three-operand vdivss.
239 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
240 ; SSE-LABEL: test2_div_ss:
242 ; SSE-NEXT: divss %xmm0, %xmm1
243 ; SSE-NEXT: movaps %xmm1, %xmm0
244 ; SSE-NEXT: ret{{[l|q]}}
246 ; AVX-LABEL: test2_div_ss:
248 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
249 ; AVX-NEXT: ret{{[l|q]}}
250 %1 = extractelement <4 x float> %a, i32 0
251 %2 = extractelement <4 x float> %b, i32 0
252 %div = fdiv float %2, %1
253 %3 = insertelement <4 x float> %b, float %div, i32 0
; Scalar fadd of lane 0 of %a and lane 0 of %b (double), with the sum inserted
; into lane 0 of %b. The SSE checks expect addsd targeting %xmm1 followed by a
; movapd copy into %xmm0; the AVX checks expect a single three-operand vaddsd.
257 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
258 ; SSE-LABEL: test2_add_sd:
260 ; SSE-NEXT: addsd %xmm0, %xmm1
261 ; SSE-NEXT: movapd %xmm1, %xmm0
262 ; SSE-NEXT: ret{{[l|q]}}
264 ; AVX-LABEL: test2_add_sd:
266 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
267 ; AVX-NEXT: ret{{[l|q]}}
268 %1 = extractelement <2 x double> %a, i32 0
269 %2 = extractelement <2 x double> %b, i32 0
270 %add = fadd double %1, %2
271 %3 = insertelement <2 x double> %b, double %add, i32 0
; Scalar fsub (lane 0 of %b minus lane 0 of %a, double), with the difference
; inserted into lane 0 of %b. The SSE checks expect subsd targeting %xmm1
; followed by a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vsubsd.
275 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
276 ; SSE-LABEL: test2_sub_sd:
278 ; SSE-NEXT: subsd %xmm0, %xmm1
279 ; SSE-NEXT: movapd %xmm1, %xmm0
280 ; SSE-NEXT: ret{{[l|q]}}
282 ; AVX-LABEL: test2_sub_sd:
284 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
285 ; AVX-NEXT: ret{{[l|q]}}
286 %1 = extractelement <2 x double> %a, i32 0
287 %2 = extractelement <2 x double> %b, i32 0
288 %sub = fsub double %2, %1
289 %3 = insertelement <2 x double> %b, double %sub, i32 0
; Scalar fmul of lane 0 of %a and lane 0 of %b (double), with the product
; inserted into lane 0 of %b. The SSE checks expect mulsd targeting %xmm1
; followed by a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vmulsd.
293 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
294 ; SSE-LABEL: test2_mul_sd:
296 ; SSE-NEXT: mulsd %xmm0, %xmm1
297 ; SSE-NEXT: movapd %xmm1, %xmm0
298 ; SSE-NEXT: ret{{[l|q]}}
300 ; AVX-LABEL: test2_mul_sd:
302 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
303 ; AVX-NEXT: ret{{[l|q]}}
304 %1 = extractelement <2 x double> %a, i32 0
305 %2 = extractelement <2 x double> %b, i32 0
306 %mul = fmul double %1, %2
307 %3 = insertelement <2 x double> %b, double %mul, i32 0
; Scalar fdiv (lane 0 of %b divided by lane 0 of %a, double), with the
; quotient inserted into lane 0 of %b. The SSE checks expect divsd targeting
; %xmm1 followed by a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vdivsd.
311 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
312 ; SSE-LABEL: test2_div_sd:
314 ; SSE-NEXT: divsd %xmm0, %xmm1
315 ; SSE-NEXT: movapd %xmm1, %xmm0
316 ; SSE-NEXT: ret{{[l|q]}}
318 ; AVX-LABEL: test2_div_sd:
320 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
321 ; AVX-NEXT: ret{{[l|q]}}
322 %1 = extractelement <2 x double> %a, i32 0
323 %2 = extractelement <2 x double> %b, i32 0
324 %div = fdiv double %2, %1
325 %3 = insertelement <2 x double> %b, double %div, i32 0
; Two dependent scalar fadds, a0 + (a0 + b0), where a0/b0 are lane 0 of %a/%b;
; the final value is inserted into lane 0 of %a. The checks expect exactly two
; addss (vaddss on AVX targets) and no separate insert or blend instruction.
329 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
330 ; SSE-LABEL: test_multiple_add_ss:
332 ; SSE-NEXT: addss %xmm0, %xmm1
333 ; SSE-NEXT: addss %xmm1, %xmm0
334 ; SSE-NEXT: ret{{[l|q]}}
336 ; AVX-LABEL: test_multiple_add_ss:
338 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
339 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
340 ; AVX-NEXT: ret{{[l|q]}}
341 %1 = extractelement <4 x float> %b, i32 0
342 %2 = extractelement <4 x float> %a, i32 0
343 %add = fadd float %2, %1
344 %add2 = fadd float %2, %add
345 %3 = insertelement <4 x float> %a, float %add2, i32 0
; Two dependent scalar fsubs, a0 - (a0 - b0), where a0/b0 are lane 0 of %a/%b;
; the final value is inserted into lane 0 of %a. The SSE checks expect a
; movaps copy of %xmm0 into %xmm2 before the two subss; the AVX checks expect
; two vsubss. Neither expects a separate insert or blend instruction.
349 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
350 ; SSE-LABEL: test_multiple_sub_ss:
352 ; SSE-NEXT: movaps %xmm0, %xmm2
353 ; SSE-NEXT: subss %xmm1, %xmm2
354 ; SSE-NEXT: subss %xmm2, %xmm0
355 ; SSE-NEXT: ret{{[l|q]}}
357 ; AVX-LABEL: test_multiple_sub_ss:
359 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1
360 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
361 ; AVX-NEXT: ret{{[l|q]}}
362 %1 = extractelement <4 x float> %b, i32 0
363 %2 = extractelement <4 x float> %a, i32 0
364 %sub = fsub float %2, %1
365 %sub2 = fsub float %2, %sub
366 %3 = insertelement <4 x float> %a, float %sub2, i32 0
; Two dependent scalar fmuls, a0 * (a0 * b0), where a0/b0 are lane 0 of %a/%b;
; the final value is inserted into lane 0 of %a. The checks expect exactly two
; mulss (vmulss on AVX targets) and no separate insert or blend instruction.
370 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
371 ; SSE-LABEL: test_multiple_mul_ss:
373 ; SSE-NEXT: mulss %xmm0, %xmm1
374 ; SSE-NEXT: mulss %xmm1, %xmm0
375 ; SSE-NEXT: ret{{[l|q]}}
377 ; AVX-LABEL: test_multiple_mul_ss:
379 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
380 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
381 ; AVX-NEXT: ret{{[l|q]}}
382 %1 = extractelement <4 x float> %b, i32 0
383 %2 = extractelement <4 x float> %a, i32 0
384 %mul = fmul float %2, %1
385 %mul2 = fmul float %2, %mul
386 %3 = insertelement <4 x float> %a, float %mul2, i32 0
; Two dependent scalar fdivs, a0 / (a0 / b0), where a0/b0 are lane 0 of %a/%b;
; the final value is inserted into lane 0 of %a. The SSE checks expect a
; movaps copy of %xmm0 into %xmm2 before the two divss; the AVX checks expect
; two vdivss. Neither expects a separate insert or blend instruction.
390 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
391 ; SSE-LABEL: test_multiple_div_ss:
393 ; SSE-NEXT: movaps %xmm0, %xmm2
394 ; SSE-NEXT: divss %xmm1, %xmm2
395 ; SSE-NEXT: divss %xmm2, %xmm0
396 ; SSE-NEXT: ret{{[l|q]}}
398 ; AVX-LABEL: test_multiple_div_ss:
400 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1
401 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
402 ; AVX-NEXT: ret{{[l|q]}}
403 %1 = extractelement <4 x float> %b, i32 0
404 %2 = extractelement <4 x float> %a, i32 0
405 %div = fdiv float %2, %1
406 %div2 = fdiv float %2, %div
407 %3 = insertelement <4 x float> %a, float %div2, i32 0
411 ; With SSE4.1 or greater, the shuffles in the following tests may
412 ; be lowered to X86Blendi nodes.
; Adds the scalar argument %b to lane 0 of %a (written as %b + lane 0), puts
; the sum in lane 0 of an undef vector, then shuffles so lanes 1-3 come from
; %a. The checks expect a scalar addss/vaddss: on 32-bit targets the scalar
; operand is read from the stack via (%esp), on 64-bit targets from %xmm1.
414 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
415 ; X86-SSE-LABEL: blend_add_ss:
417 ; X86-SSE-NEXT: addss {{[0-9]+}}(%esp), %xmm0
420 ; X86-AVX-LABEL: blend_add_ss:
422 ; X86-AVX-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
425 ; X64-SSE-LABEL: blend_add_ss:
427 ; X64-SSE-NEXT: addss %xmm1, %xmm0
430 ; X64-AVX-LABEL: blend_add_ss:
432 ; X64-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
435 %ext = extractelement <4 x float> %a, i32 0
436 %op = fadd float %b, %ext
437 %ins = insertelement <4 x float> undef, float %op, i32 0
438 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
439 ret <4 x float> %shuf
; Subtracts the scalar argument %b from lane 0 of %a, puts the difference in
; lane 0 of an undef vector, then shuffles so lanes 1-3 come from %a. The
; checks expect a scalar subss/vsubss: on 32-bit targets the scalar operand is
; read from the stack via (%esp), on 64-bit targets from %xmm1.
442 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
443 ; X86-SSE-LABEL: blend_sub_ss:
445 ; X86-SSE-NEXT: subss {{[0-9]+}}(%esp), %xmm0
448 ; X86-AVX-LABEL: blend_sub_ss:
450 ; X86-AVX-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
453 ; X64-SSE-LABEL: blend_sub_ss:
455 ; X64-SSE-NEXT: subss %xmm1, %xmm0
458 ; X64-AVX-LABEL: blend_sub_ss:
460 ; X64-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
463 %ext = extractelement <4 x float> %a, i32 0
464 %op = fsub float %ext, %b
465 %ins = insertelement <4 x float> undef, float %op, i32 0
466 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
467 ret <4 x float> %shuf
; Multiplies the scalar argument %b by lane 0 of %a (written as %b * lane 0),
; puts the product in lane 0 of an undef vector, then shuffles so lanes 1-3
; come from %a. The checks expect a scalar mulss/vmulss: on 32-bit targets the
; scalar operand is read from the stack via (%esp), on 64-bit targets from
; %xmm1.
470 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
471 ; X86-SSE-LABEL: blend_mul_ss:
473 ; X86-SSE-NEXT: mulss {{[0-9]+}}(%esp), %xmm0
476 ; X86-AVX-LABEL: blend_mul_ss:
478 ; X86-AVX-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
481 ; X64-SSE-LABEL: blend_mul_ss:
483 ; X64-SSE-NEXT: mulss %xmm1, %xmm0
486 ; X64-AVX-LABEL: blend_mul_ss:
488 ; X64-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
491 %ext = extractelement <4 x float> %a, i32 0
492 %op = fmul float %b, %ext
493 %ins = insertelement <4 x float> undef, float %op, i32 0
494 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
495 ret <4 x float> %shuf
; Divides lane 0 of %a by the scalar argument %b, puts the quotient in lane 0
; of an undef vector, then shuffles so lanes 1-3 come from %a. The checks
; expect a scalar divss/vdivss: on 32-bit targets the scalar operand is read
; from the stack via (%esp), on 64-bit targets from %xmm1.
498 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
499 ; X86-SSE-LABEL: blend_div_ss:
501 ; X86-SSE-NEXT: divss {{[0-9]+}}(%esp), %xmm0
504 ; X86-AVX-LABEL: blend_div_ss:
506 ; X86-AVX-NEXT: vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
509 ; X64-SSE-LABEL: blend_div_ss:
511 ; X64-SSE-NEXT: divss %xmm1, %xmm0
514 ; X64-AVX-LABEL: blend_div_ss:
516 ; X64-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
519 %ext = extractelement <4 x float> %a, i32 0
520 %op = fdiv float %ext, %b
521 %ins = insertelement <4 x float> undef, float %op, i32 0
522 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
523 ret <4 x float> %shuf
; Adds the scalar double argument %b to lane 0 of %a (written as %b + lane 0),
; puts the sum in lane 0 of an undef vector, then shuffles so lane 1 comes
; from %a. The checks expect a scalar addsd/vaddsd: on 32-bit targets the
; scalar operand is read from the stack via (%esp), on 64-bit targets from
; %xmm1.
526 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
527 ; X86-SSE-LABEL: blend_add_sd:
529 ; X86-SSE-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
532 ; X86-AVX-LABEL: blend_add_sd:
534 ; X86-AVX-NEXT: vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
537 ; X64-SSE-LABEL: blend_add_sd:
539 ; X64-SSE-NEXT: addsd %xmm1, %xmm0
542 ; X64-AVX-LABEL: blend_add_sd:
544 ; X64-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
547 %ext = extractelement <2 x double> %a, i32 0
548 %op = fadd double %b, %ext
549 %ins = insertelement <2 x double> undef, double %op, i32 0
550 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
551 ret <2 x double> %shuf
; Subtracts the scalar double argument %b from lane 0 of %a, puts the
; difference in lane 0 of an undef vector, then shuffles so lane 1 comes from
; %a. The checks expect a scalar subsd/vsubsd: on 32-bit targets the scalar
; operand is read from the stack via (%esp), on 64-bit targets from %xmm1.
554 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
555 ; X86-SSE-LABEL: blend_sub_sd:
557 ; X86-SSE-NEXT: subsd {{[0-9]+}}(%esp), %xmm0
560 ; X86-AVX-LABEL: blend_sub_sd:
562 ; X86-AVX-NEXT: vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
565 ; X64-SSE-LABEL: blend_sub_sd:
567 ; X64-SSE-NEXT: subsd %xmm1, %xmm0
570 ; X64-AVX-LABEL: blend_sub_sd:
572 ; X64-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
575 %ext = extractelement <2 x double> %a, i32 0
576 %op = fsub double %ext, %b
577 %ins = insertelement <2 x double> undef, double %op, i32 0
578 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
579 ret <2 x double> %shuf
; Multiplies the scalar double argument %b by lane 0 of %a (written as
; %b * lane 0), puts the product in lane 0 of an undef vector, then shuffles
; so lane 1 comes from %a. The checks expect a scalar mulsd/vmulsd: on 32-bit
; targets the scalar operand is read from the stack via (%esp), on 64-bit
; targets from %xmm1.
582 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
583 ; X86-SSE-LABEL: blend_mul_sd:
585 ; X86-SSE-NEXT: mulsd {{[0-9]+}}(%esp), %xmm0
588 ; X86-AVX-LABEL: blend_mul_sd:
590 ; X86-AVX-NEXT: vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
593 ; X64-SSE-LABEL: blend_mul_sd:
595 ; X64-SSE-NEXT: mulsd %xmm1, %xmm0
598 ; X64-AVX-LABEL: blend_mul_sd:
600 ; X64-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
603 %ext = extractelement <2 x double> %a, i32 0
604 %op = fmul double %b, %ext
605 %ins = insertelement <2 x double> undef, double %op, i32 0
606 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
607 ret <2 x double> %shuf
; Divides lane 0 of %a by the scalar double argument %b, puts the quotient in
; lane 0 of an undef vector, then shuffles so lane 1 comes from %a. The checks
; expect a scalar divsd/vdivsd: on 32-bit targets the scalar operand is read
; from the stack via (%esp), on 64-bit targets from %xmm1.
610 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
611 ; X86-SSE-LABEL: blend_div_sd:
613 ; X86-SSE-NEXT: divsd {{[0-9]+}}(%esp), %xmm0
616 ; X86-AVX-LABEL: blend_div_sd:
618 ; X86-AVX-NEXT: vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
621 ; X64-SSE-LABEL: blend_div_sd:
623 ; X64-SSE-NEXT: divsd %xmm1, %xmm0
626 ; X64-AVX-LABEL: blend_div_sd:
628 ; X64-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
631 %ext = extractelement <2 x double> %a, i32 0
632 %op = fdiv double %ext, %b
633 %ins = insertelement <2 x double> undef, double %op, i32 0
634 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
635 ret <2 x double> %shuf
638 ; Ensure that the backend selects SSE/AVX scalar fp instructions
639 ; from a packed fp instruction plus a vector insert.
; Packed fadd of %a and %b followed by a shuffle that keeps lane 0 of the sum
; and takes lanes 1-3 from %a. The checks expect this to become one scalar
; addss (vaddss on AVX targets) immediately followed by the return.
641 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
642 ; SSE-LABEL: insert_test_add_ss:
644 ; SSE-NEXT: addss %xmm1, %xmm0
645 ; SSE-NEXT: ret{{[l|q]}}
647 ; AVX-LABEL: insert_test_add_ss:
649 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
650 ; AVX-NEXT: ret{{[l|q]}}
651 %1 = fadd <4 x float> %a, %b
652 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub (%a - %b) followed by a shuffle that keeps lane 0 of the
; difference and takes lanes 1-3 from %a. The checks expect this to become one
; scalar subss (vsubss on AVX targets) immediately followed by the return.
656 define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
657 ; SSE-LABEL: insert_test_sub_ss:
659 ; SSE-NEXT: subss %xmm1, %xmm0
660 ; SSE-NEXT: ret{{[l|q]}}
662 ; AVX-LABEL: insert_test_sub_ss:
664 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
665 ; AVX-NEXT: ret{{[l|q]}}
666 %1 = fsub <4 x float> %a, %b
667 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul of %a and %b followed by a shuffle that keeps lane 0 of the
; product and takes lanes 1-3 from %a. The checks expect this to become one
; scalar mulss (vmulss on AVX targets) immediately followed by the return.
671 define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
672 ; SSE-LABEL: insert_test_mul_ss:
674 ; SSE-NEXT: mulss %xmm1, %xmm0
675 ; SSE-NEXT: ret{{[l|q]}}
677 ; AVX-LABEL: insert_test_mul_ss:
679 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
680 ; AVX-NEXT: ret{{[l|q]}}
681 %1 = fmul <4 x float> %a, %b
682 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv (%a / %b) followed by a shuffle that keeps lane 0 of the
; quotient and takes lanes 1-3 from %a. The checks expect this to become one
; scalar divss (vdivss on AVX targets) immediately followed by the return.
686 define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
687 ; SSE-LABEL: insert_test_div_ss:
689 ; SSE-NEXT: divss %xmm1, %xmm0
690 ; SSE-NEXT: ret{{[l|q]}}
692 ; AVX-LABEL: insert_test_div_ss:
694 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
695 ; AVX-NEXT: ret{{[l|q]}}
696 %1 = fdiv <4 x float> %a, %b
697 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed double fadd of %a and %b followed by a shuffle that keeps lane 0 of
; the sum and takes lane 1 from %a. The checks expect this to become one
; scalar addsd (vaddsd on AVX targets) immediately followed by the return.
701 define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
702 ; SSE-LABEL: insert_test_add_sd:
704 ; SSE-NEXT: addsd %xmm1, %xmm0
705 ; SSE-NEXT: ret{{[l|q]}}
707 ; AVX-LABEL: insert_test_add_sd:
709 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
710 ; AVX-NEXT: ret{{[l|q]}}
711 %1 = fadd <2 x double> %a, %b
712 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fsub (%a - %b) followed by a shuffle that keeps lane 0 of the
; difference and takes lane 1 from %a. The checks expect this to become one
; scalar subsd (vsubsd on AVX targets) immediately followed by the return.
716 define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
717 ; SSE-LABEL: insert_test_sub_sd:
719 ; SSE-NEXT: subsd %xmm1, %xmm0
720 ; SSE-NEXT: ret{{[l|q]}}
722 ; AVX-LABEL: insert_test_sub_sd:
724 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
725 ; AVX-NEXT: ret{{[l|q]}}
726 %1 = fsub <2 x double> %a, %b
727 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fmul of %a and %b followed by a shuffle that keeps lane 0 of
; the product and takes lane 1 from %a. The checks expect this to become one
; scalar mulsd (vmulsd on AVX targets) immediately followed by the return.
731 define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
732 ; SSE-LABEL: insert_test_mul_sd:
734 ; SSE-NEXT: mulsd %xmm1, %xmm0
735 ; SSE-NEXT: ret{{[l|q]}}
737 ; AVX-LABEL: insert_test_mul_sd:
739 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
740 ; AVX-NEXT: ret{{[l|q]}}
741 %1 = fmul <2 x double> %a, %b
742 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fdiv (%a / %b) followed by a shuffle that keeps lane 0 of the
; quotient and takes lane 1 from %a. The checks expect this to become one
; scalar divsd (vdivsd on AVX targets) immediately followed by the return.
746 define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
747 ; SSE-LABEL: insert_test_div_sd:
749 ; SSE-NEXT: divsd %xmm1, %xmm0
750 ; SSE-NEXT: ret{{[l|q]}}
752 ; AVX-LABEL: insert_test_div_sd:
754 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
755 ; AVX-NEXT: ret{{[l|q]}}
756 %1 = fdiv <2 x double> %a, %b
757 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed fadd (%b + %a) followed by a shuffle that keeps lane 0 of the sum and
; takes lanes 1-3 from %b. The SSE checks expect addss targeting %xmm1 plus a
; movaps copy into %xmm0; the AVX checks expect a single three-operand vaddss.
761 define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
762 ; SSE-LABEL: insert_test2_add_ss:
764 ; SSE-NEXT: addss %xmm0, %xmm1
765 ; SSE-NEXT: movaps %xmm1, %xmm0
766 ; SSE-NEXT: ret{{[l|q]}}
768 ; AVX-LABEL: insert_test2_add_ss:
770 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
771 ; AVX-NEXT: ret{{[l|q]}}
772 %1 = fadd <4 x float> %b, %a
773 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub (%b - %a) followed by a shuffle that keeps lane 0 of the
; difference and takes lanes 1-3 from %b. The SSE checks expect subss
; targeting %xmm1 plus a movaps copy into %xmm0; the AVX checks expect a
; single three-operand vsubss.
777 define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
778 ; SSE-LABEL: insert_test2_sub_ss:
780 ; SSE-NEXT: subss %xmm0, %xmm1
781 ; SSE-NEXT: movaps %xmm1, %xmm0
782 ; SSE-NEXT: ret{{[l|q]}}
784 ; AVX-LABEL: insert_test2_sub_ss:
786 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
787 ; AVX-NEXT: ret{{[l|q]}}
788 %1 = fsub <4 x float> %b, %a
789 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul (%b * %a) followed by a shuffle that keeps lane 0 of the product
; and takes lanes 1-3 from %b. The SSE checks expect mulss targeting %xmm1
; plus a movaps copy into %xmm0; the AVX checks expect a single three-operand
; vmulss.
793 define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
794 ; SSE-LABEL: insert_test2_mul_ss:
796 ; SSE-NEXT: mulss %xmm0, %xmm1
797 ; SSE-NEXT: movaps %xmm1, %xmm0
798 ; SSE-NEXT: ret{{[l|q]}}
800 ; AVX-LABEL: insert_test2_mul_ss:
802 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
803 ; AVX-NEXT: ret{{[l|q]}}
804 %1 = fmul <4 x float> %b, %a
805 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv (%b / %a) followed by a shuffle that keeps lane 0 of the
; quotient and takes lanes 1-3 from %b. The SSE checks expect divss targeting
; %xmm1 plus a movaps copy into %xmm0; the AVX checks expect a single
; three-operand vdivss.
809 define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
810 ; SSE-LABEL: insert_test2_div_ss:
812 ; SSE-NEXT: divss %xmm0, %xmm1
813 ; SSE-NEXT: movaps %xmm1, %xmm0
814 ; SSE-NEXT: ret{{[l|q]}}
816 ; AVX-LABEL: insert_test2_div_ss:
818 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
819 ; AVX-NEXT: ret{{[l|q]}}
820 %1 = fdiv <4 x float> %b, %a
821 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed double fadd (%b + %a) followed by a shuffle that keeps lane 0 of the
; sum and takes lane 1 from %b. The SSE checks expect addsd targeting %xmm1
; plus a movapd copy into %xmm0; the AVX checks expect a single three-operand
; vaddsd.
825 define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
826 ; SSE-LABEL: insert_test2_add_sd:
828 ; SSE-NEXT: addsd %xmm0, %xmm1
829 ; SSE-NEXT: movapd %xmm1, %xmm0
830 ; SSE-NEXT: ret{{[l|q]}}
832 ; AVX-LABEL: insert_test2_add_sd:
834 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
835 ; AVX-NEXT: ret{{[l|q]}}
836 %1 = fadd <2 x double> %b, %a
837 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fsub (%b - %a) followed by a shuffle that keeps lane 0 of the
; difference and takes lane 1 from %b. The SSE checks expect subsd targeting
; %xmm1 plus a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vsubsd.
841 define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
842 ; SSE-LABEL: insert_test2_sub_sd:
844 ; SSE-NEXT: subsd %xmm0, %xmm1
845 ; SSE-NEXT: movapd %xmm1, %xmm0
846 ; SSE-NEXT: ret{{[l|q]}}
848 ; AVX-LABEL: insert_test2_sub_sd:
850 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
851 ; AVX-NEXT: ret{{[l|q]}}
852 %1 = fsub <2 x double> %b, %a
853 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fmul (%b * %a) followed by a shuffle that keeps lane 0 of the
; product and takes lane 1 from %b. The SSE checks expect mulsd targeting
; %xmm1 plus a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vmulsd.
857 define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
858 ; SSE-LABEL: insert_test2_mul_sd:
860 ; SSE-NEXT: mulsd %xmm0, %xmm1
861 ; SSE-NEXT: movapd %xmm1, %xmm0
862 ; SSE-NEXT: ret{{[l|q]}}
864 ; AVX-LABEL: insert_test2_mul_sd:
866 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
867 ; AVX-NEXT: ret{{[l|q]}}
868 %1 = fmul <2 x double> %b, %a
869 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fdiv (%b / %a) followed by a shuffle that keeps lane 0 of the
; quotient and takes lane 1 from %b. The SSE checks expect divsd targeting
; %xmm1 plus a movapd copy into %xmm0; the AVX checks expect a single
; three-operand vdivsd.
873 define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
874 ; SSE-LABEL: insert_test2_div_sd:
876 ; SSE-NEXT: divsd %xmm0, %xmm1
877 ; SSE-NEXT: movapd %xmm1, %xmm0
878 ; SSE-NEXT: ret{{[l|q]}}
880 ; AVX-LABEL: insert_test2_div_sd:
882 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
883 ; AVX-NEXT: ret{{[l|q]}}
884 %1 = fdiv <2 x double> %b, %a
885 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed fadd of %a and %b merged with %a through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the sum and lanes 1-3 take %a.
; The checks expect one scalar addss (vaddss on AVX targets) immediately
; followed by the return.
889 define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
890 ; SSE-LABEL: insert_test3_add_ss:
892 ; SSE-NEXT: addss %xmm1, %xmm0
893 ; SSE-NEXT: ret{{[l|q]}}
895 ; AVX-LABEL: insert_test3_add_ss:
897 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
898 ; AVX-NEXT: ret{{[l|q]}}
899 %1 = fadd <4 x float> %a, %b
900 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fsub (%a - %b) merged with %a through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the difference and lanes 1-3
; take %a. The checks expect one scalar subss (vsubss on AVX targets)
; immediately followed by the return.
904 define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
905 ; SSE-LABEL: insert_test3_sub_ss:
907 ; SSE-NEXT: subss %xmm1, %xmm0
908 ; SSE-NEXT: ret{{[l|q]}}
910 ; AVX-LABEL: insert_test3_sub_ss:
912 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
913 ; AVX-NEXT: ret{{[l|q]}}
914 %1 = fsub <4 x float> %a, %b
915 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fmul of %a and %b merged with %a through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the product and lanes 1-3 take
; %a. The checks expect one scalar mulss (vmulss on AVX targets) immediately
; followed by the return.
919 define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
920 ; SSE-LABEL: insert_test3_mul_ss:
922 ; SSE-NEXT: mulss %xmm1, %xmm0
923 ; SSE-NEXT: ret{{[l|q]}}
925 ; AVX-LABEL: insert_test3_mul_ss:
927 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
928 ; AVX-NEXT: ret{{[l|q]}}
929 %1 = fmul <4 x float> %a, %b
930 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fdiv (%a / %b) merged with %a through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the quotient and lanes 1-3
; take %a. The checks expect one scalar divss (vdivss on AVX targets)
; immediately followed by the return.
934 define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
935 ; SSE-LABEL: insert_test3_div_ss:
937 ; SSE-NEXT: divss %xmm1, %xmm0
938 ; SSE-NEXT: ret{{[l|q]}}
940 ; AVX-LABEL: insert_test3_div_ss:
942 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
943 ; AVX-NEXT: ret{{[l|q]}}
944 %1 = fdiv <4 x float> %a, %b
945 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed double fadd of %a and %b merged with %a through a constant-mask
; select: the mask is false in lane 0 and true in lane 1, so lane 0 takes the
; sum and lane 1 takes %a. The checks expect one scalar addsd (vaddsd on AVX
; targets) immediately followed by the return.
949 define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
950 ; SSE-LABEL: insert_test3_add_sd:
952 ; SSE-NEXT: addsd %xmm1, %xmm0
953 ; SSE-NEXT: ret{{[l|q]}}
955 ; AVX-LABEL: insert_test3_add_sd:
957 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
958 ; AVX-NEXT: ret{{[l|q]}}
959 %1 = fadd <2 x double> %a, %b
960 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fsub (%a - %b) merged with %a through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the
; difference and lane 1 takes %a. The checks expect one scalar subsd (vsubsd
; on AVX targets) immediately followed by the return.
964 define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
965 ; SSE-LABEL: insert_test3_sub_sd:
967 ; SSE-NEXT: subsd %xmm1, %xmm0
968 ; SSE-NEXT: ret{{[l|q]}}
970 ; AVX-LABEL: insert_test3_sub_sd:
972 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
973 ; AVX-NEXT: ret{{[l|q]}}
974 %1 = fsub <2 x double> %a, %b
975 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fmul of %a and %b merged with %a through a constant-mask
; select: the mask is false in lane 0 and true in lane 1, so lane 0 takes the
; product and lane 1 takes %a. The checks expect one scalar mulsd (vmulsd on
; AVX targets) immediately followed by the return.
979 define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
980 ; SSE-LABEL: insert_test3_mul_sd:
982 ; SSE-NEXT: mulsd %xmm1, %xmm0
983 ; SSE-NEXT: ret{{[l|q]}}
985 ; AVX-LABEL: insert_test3_mul_sd:
987 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
988 ; AVX-NEXT: ret{{[l|q]}}
989 %1 = fmul <2 x double> %a, %b
990 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fdiv (%a / %b) merged with %a through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the quotient
; and lane 1 takes %a. The checks expect one scalar divsd (vdivsd on AVX
; targets) immediately followed by the return.
994 define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
995 ; SSE-LABEL: insert_test3_div_sd:
997 ; SSE-NEXT: divsd %xmm1, %xmm0
998 ; SSE-NEXT: ret{{[l|q]}}
1000 ; AVX-LABEL: insert_test3_div_sd:
1002 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
1003 ; AVX-NEXT: ret{{[l|q]}}
1004 %1 = fdiv <2 x double> %a, %b
1005 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed fadd (%b + %a) merged with %b through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the sum and lanes 1-3 take %b.
; The SSE checks expect addss targeting %xmm1 plus a movaps copy into %xmm0;
; the AVX checks expect a single three-operand vaddss.
1009 define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
1010 ; SSE-LABEL: insert_test4_add_ss:
1012 ; SSE-NEXT: addss %xmm0, %xmm1
1013 ; SSE-NEXT: movaps %xmm1, %xmm0
1014 ; SSE-NEXT: ret{{[l|q]}}
1016 ; AVX-LABEL: insert_test4_add_ss:
1018 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
1019 ; AVX-NEXT: ret{{[l|q]}}
1020 %1 = fadd <4 x float> %b, %a
1021 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fsub (%b - %a) merged with %b through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the difference and lanes 1-3
; take %b. The SSE checks expect subss targeting %xmm1 plus a movaps copy into
; %xmm0; the AVX checks expect a single three-operand vsubss.
1025 define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
1026 ; SSE-LABEL: insert_test4_sub_ss:
1028 ; SSE-NEXT: subss %xmm0, %xmm1
1029 ; SSE-NEXT: movaps %xmm1, %xmm0
1030 ; SSE-NEXT: ret{{[l|q]}}
1032 ; AVX-LABEL: insert_test4_sub_ss:
1034 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
1035 ; AVX-NEXT: ret{{[l|q]}}
1036 %1 = fsub <4 x float> %b, %a
1037 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fmul (%b * %a) merged with %b through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the product and lanes 1-3 take
; %b. The SSE checks expect mulss targeting %xmm1 plus a movaps copy into
; %xmm0; the AVX checks expect a single three-operand vmulss.
1041 define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
1042 ; SSE-LABEL: insert_test4_mul_ss:
1044 ; SSE-NEXT: mulss %xmm0, %xmm1
1045 ; SSE-NEXT: movaps %xmm1, %xmm0
1046 ; SSE-NEXT: ret{{[l|q]}}
1048 ; AVX-LABEL: insert_test4_mul_ss:
1050 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
1051 ; AVX-NEXT: ret{{[l|q]}}
1052 %1 = fmul <4 x float> %b, %a
1053 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fdiv (%b / %a) merged with %b through a constant-mask select: the
; mask is false only in lane 0, so lane 0 takes the quotient and lanes 1-3
; take %b. The SSE checks expect divss targeting %xmm1 plus a movaps copy into
; %xmm0; the AVX checks expect a single three-operand vdivss.
1057 define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
1058 ; SSE-LABEL: insert_test4_div_ss:
1060 ; SSE-NEXT: divss %xmm0, %xmm1
1061 ; SSE-NEXT: movaps %xmm1, %xmm0
1062 ; SSE-NEXT: ret{{[l|q]}}
1064 ; AVX-LABEL: insert_test4_div_ss:
1066 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
1067 ; AVX-NEXT: ret{{[l|q]}}
1068 %1 = fdiv <4 x float> %b, %a
1069 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed double fadd (%b + %a) merged with %b through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the sum and
; lane 1 takes %b. The SSE checks expect addsd targeting %xmm1 plus a movapd
; copy into %xmm0; the AVX checks expect a single three-operand vaddsd.
1073 define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
1074 ; SSE-LABEL: insert_test4_add_sd:
1076 ; SSE-NEXT: addsd %xmm0, %xmm1
1077 ; SSE-NEXT: movapd %xmm1, %xmm0
1078 ; SSE-NEXT: ret{{[l|q]}}
1080 ; AVX-LABEL: insert_test4_add_sd:
1082 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1083 ; AVX-NEXT: ret{{[l|q]}}
1084 %1 = fadd <2 x double> %b, %a
1085 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fsub (%b - %a) merged with %b through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the
; difference and lane 1 takes %b. The SSE checks expect subsd targeting %xmm1
; plus a movapd copy into %xmm0; the AVX checks expect a single three-operand
; vsubsd.
1089 define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
1090 ; SSE-LABEL: insert_test4_sub_sd:
1092 ; SSE-NEXT: subsd %xmm0, %xmm1
1093 ; SSE-NEXT: movapd %xmm1, %xmm0
1094 ; SSE-NEXT: ret{{[l|q]}}
1096 ; AVX-LABEL: insert_test4_sub_sd:
1098 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
1099 ; AVX-NEXT: ret{{[l|q]}}
1100 %1 = fsub <2 x double> %b, %a
1101 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fmul (%b * %a) merged with %b through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the product
; and lane 1 takes %b. The SSE checks expect mulsd targeting %xmm1 plus a
; movapd copy into %xmm0; the AVX checks expect a single three-operand vmulsd.
1105 define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
1106 ; SSE-LABEL: insert_test4_mul_sd:
1108 ; SSE-NEXT: mulsd %xmm0, %xmm1
1109 ; SSE-NEXT: movapd %xmm1, %xmm0
1110 ; SSE-NEXT: ret{{[l|q]}}
1112 ; AVX-LABEL: insert_test4_mul_sd:
1114 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
1115 ; AVX-NEXT: ret{{[l|q]}}
1116 %1 = fmul <2 x double> %b, %a
1117 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fdiv (%b / %a) merged with %b through a constant-mask select:
; the mask is false in lane 0 and true in lane 1, so lane 0 takes the quotient
; and lane 1 takes %b. The SSE checks expect divsd targeting %xmm1 plus a
; movapd copy into %xmm0; the AVX checks expect a single three-operand vdivsd.
1121 define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
1122 ; SSE-LABEL: insert_test4_div_sd:
1124 ; SSE-NEXT: divsd %xmm0, %xmm1
1125 ; SSE-NEXT: movapd %xmm1, %xmm0
1126 ; SSE-NEXT: ret{{[l|q]}}
1128 ; AVX-LABEL: insert_test4_div_sd:
1130 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
1131 ; AVX-NEXT: ret{{[l|q]}}
1132 %1 = fdiv <2 x double> %b, %a
1133 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed fadd written with commuted operands (%b + %a), followed by a shuffle
; that keeps lane 0 of the sum but takes lanes 1-3 from %a. The checks still
; expect one scalar addss (vaddss on AVX targets) with %xmm0 as destination.
1137 define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
1138 ; SSE-LABEL: insert_test5_add_ss:
1140 ; SSE-NEXT: addss %xmm1, %xmm0
1141 ; SSE-NEXT: ret{{[l|q]}}
1143 ; AVX-LABEL: insert_test5_add_ss:
1145 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
1146 ; AVX-NEXT: ret{{[l|q]}}
1147 %1 = fadd <4 x float> %b, %a
1148 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub computed as %b - %a, followed by a shuffle that keeps lane 0 of
; the difference but takes lanes 1-3 from %a. Here the checks do NOT expect a
; scalar subss: they expect a packed subps/vsubps followed by a lane-0 merge
; (movss with SSE2, blendps with SSE4.1, vblendps with AVX).
; NOTE(review): presumably this is because fsub cannot be commuted to make %a
; the destination operand -- confirm against the X86 lowering code.
1152 define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
1153 ; SSE2-LABEL: insert_test5_sub_ss:
1155 ; SSE2-NEXT: subps %xmm0, %xmm1
1156 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1157 ; SSE2-NEXT: ret{{[l|q]}}
1159 ; SSE41-LABEL: insert_test5_sub_ss:
1161 ; SSE41-NEXT: subps %xmm0, %xmm1
1162 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1163 ; SSE41-NEXT: ret{{[l|q]}}
1165 ; AVX-LABEL: insert_test5_sub_ss:
1167 ; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm1
1168 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1169 ; AVX-NEXT: ret{{[l|q]}}
1170 %1 = fsub <4 x float> %b, %a
1171 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul of %b * %a followed by a shufflevector (<0, 5, 6, 7>) that
; keeps lane 0 of the product and takes lanes 1-3 from %a. The assertions
; below expect a single scalar mulss/vmulss with no separate merge.
1175 define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
1176 ; SSE-LABEL: insert_test5_mul_ss:
1178 ; SSE-NEXT: mulss %xmm1, %xmm0
1179 ; SSE-NEXT: ret{{[l|q]}}
1181 ; AVX-LABEL: insert_test5_mul_ss:
1183 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
1184 ; AVX-NEXT: ret{{[l|q]}}
1185 %1 = fmul <4 x float> %b, %a
1186 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv of %b / %a followed by a shufflevector (<0, 5, 6, 7>) that
; keeps lane 0 of the quotient and takes lanes 1-3 from %a. The
; assertions below expect the packed divps/vdivps to remain, followed by
; a movss (SSE2 runs) or blendps/vblendps (SSE4.1 and AVX runs) merge.
; NOTE(review) - presumably not folded to a scalar op because fdiv is not
; commutative and the upper lanes come from the divisor; confirm against
; the backend.
1190 define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
1191 ; SSE2-LABEL: insert_test5_div_ss:
1193 ; SSE2-NEXT: divps %xmm0, %xmm1
1194 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1195 ; SSE2-NEXT: ret{{[l|q]}}
1197 ; SSE41-LABEL: insert_test5_div_ss:
1199 ; SSE41-NEXT: divps %xmm0, %xmm1
1200 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1201 ; SSE41-NEXT: ret{{[l|q]}}
1203 ; AVX-LABEL: insert_test5_div_ss:
1205 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm1
1206 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1207 ; AVX-NEXT: ret{{[l|q]}}
1208 %1 = fdiv <4 x float> %b, %a
1209 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fadd of %b + %a followed by a shufflevector (<0, 3>) that keeps
; lane 0 of the sum and takes lane 1 from %a. The assertions below expect
; a single scalar addsd/vaddsd with no separate merge.
1213 define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
1214 ; SSE-LABEL: insert_test5_add_sd:
1216 ; SSE-NEXT: addsd %xmm1, %xmm0
1217 ; SSE-NEXT: ret{{[l|q]}}
1219 ; AVX-LABEL: insert_test5_add_sd:
1221 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1222 ; AVX-NEXT: ret{{[l|q]}}
1223 %1 = fadd <2 x double> %b, %a
1224 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed fsub of %b - %a followed by a shufflevector (<0, 3>) that keeps
; lane 0 of the difference and takes lane 1 from %a. The assertions below
; expect the packed subpd/vsubpd to remain, followed by a movsd (SSE2
; runs) or blendpd/vblendpd (SSE4.1 and AVX runs) merge.
; NOTE(review) - presumably not folded to a scalar op because fsub is not
; commutative and the upper lane comes from the subtrahend; confirm
; against the backend.
1228 define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
1229 ; SSE2-LABEL: insert_test5_sub_sd:
1231 ; SSE2-NEXT: subpd %xmm0, %xmm1
1232 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1233 ; SSE2-NEXT: ret{{[l|q]}}
1235 ; SSE41-LABEL: insert_test5_sub_sd:
1237 ; SSE41-NEXT: subpd %xmm0, %xmm1
1238 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1239 ; SSE41-NEXT: ret{{[l|q]}}
1241 ; AVX-LABEL: insert_test5_sub_sd:
1243 ; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm1
1244 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1245 ; AVX-NEXT: ret{{[l|q]}}
1246 %1 = fsub <2 x double> %b, %a
1247 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed fmul of %b * %a followed by a shufflevector (<0, 3>) that keeps
; lane 0 of the product and takes lane 1 from %a. The assertions below
; expect a single scalar mulsd/vmulsd with no separate merge.
1251 define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
1252 ; SSE-LABEL: insert_test5_mul_sd:
1254 ; SSE-NEXT: mulsd %xmm1, %xmm0
1255 ; SSE-NEXT: ret{{[l|q]}}
1257 ; AVX-LABEL: insert_test5_mul_sd:
1259 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
1260 ; AVX-NEXT: ret{{[l|q]}}
1261 %1 = fmul <2 x double> %b, %a
1262 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed fdiv of %b / %a followed by a shufflevector (<0, 3>) that keeps
; lane 0 of the quotient and takes lane 1 from %a. The assertions below
; expect the packed divpd/vdivpd to remain, followed by a movsd (SSE2
; runs) or blendpd/vblendpd (SSE4.1 and AVX runs) merge.
; NOTE(review) - presumably not folded to a scalar op because fdiv is not
; commutative and the upper lane comes from the divisor; confirm against
; the backend.
1266 define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
1267 ; SSE2-LABEL: insert_test5_div_sd:
1269 ; SSE2-NEXT: divpd %xmm0, %xmm1
1270 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1271 ; SSE2-NEXT: ret{{[l|q]}}
1273 ; SSE41-LABEL: insert_test5_div_sd:
1275 ; SSE41-NEXT: divpd %xmm0, %xmm1
1276 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1277 ; SSE41-NEXT: ret{{[l|q]}}
1279 ; AVX-LABEL: insert_test5_div_sd:
1281 ; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
1282 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1283 ; AVX-NEXT: ret{{[l|q]}}
1284 %1 = fdiv <2 x double> %b, %a
1285 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Scalar masked float add. Lane 0 of %a and lane 0 of %b are added; element
; 0 of the i8 %mask (bitcast to <8 x i1>) selects between that sum and lane
; 0 of %c; the chosen value is inserted into lane 0 of %a. The assertions
; below expect the SSE2, SSE4.1 and AVX runs to test bit 0 of the mask and
; branch, and the AVX512 runs to use a write-masked vaddss under %k1 with
; no branch.
1289 define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1290 ; X86-SSE2-LABEL: add_ss_mask:
1291 ; X86-SSE2: # %bb.0:
1292 ; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
1293 ; X86-SSE2-NEXT: jne .LBB70_1
1294 ; X86-SSE2-NEXT: # %bb.2:
1295 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1296 ; X86-SSE2-NEXT: retl
1297 ; X86-SSE2-NEXT: .LBB70_1:
1298 ; X86-SSE2-NEXT: addss %xmm0, %xmm1
1299 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1300 ; X86-SSE2-NEXT: retl
1302 ; X86-SSE41-LABEL: add_ss_mask:
1303 ; X86-SSE41: # %bb.0:
1304 ; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp)
1305 ; X86-SSE41-NEXT: jne .LBB70_1
1306 ; X86-SSE41-NEXT: # %bb.2:
1307 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1308 ; X86-SSE41-NEXT: retl
1309 ; X86-SSE41-NEXT: .LBB70_1:
1310 ; X86-SSE41-NEXT: addss %xmm0, %xmm1
1311 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1312 ; X86-SSE41-NEXT: retl
1314 ; X86-AVX1-LABEL: add_ss_mask:
1315 ; X86-AVX1: # %bb.0:
1316 ; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp)
1317 ; X86-AVX1-NEXT: je .LBB70_2
1318 ; X86-AVX1-NEXT: # %bb.1:
1319 ; X86-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
1320 ; X86-AVX1-NEXT: .LBB70_2:
1321 ; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1322 ; X86-AVX1-NEXT: retl
1324 ; X86-AVX512-LABEL: add_ss_mask:
1325 ; X86-AVX512: # %bb.0:
1326 ; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al
1327 ; X86-AVX512-NEXT: kmovw %eax, %k1
1328 ; X86-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
1329 ; X86-AVX512-NEXT: vmovaps %xmm2, %xmm0
1330 ; X86-AVX512-NEXT: retl
1332 ; X64-SSE2-LABEL: add_ss_mask:
1333 ; X64-SSE2: # %bb.0:
1334 ; X64-SSE2-NEXT: testb $1, %dil
1335 ; X64-SSE2-NEXT: jne .LBB70_1
1336 ; X64-SSE2-NEXT: # %bb.2:
1337 ; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1338 ; X64-SSE2-NEXT: retq
1339 ; X64-SSE2-NEXT: .LBB70_1:
1340 ; X64-SSE2-NEXT: addss %xmm0, %xmm1
1341 ; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1342 ; X64-SSE2-NEXT: retq
1344 ; X64-SSE41-LABEL: add_ss_mask:
1345 ; X64-SSE41: # %bb.0:
1346 ; X64-SSE41-NEXT: testb $1, %dil
1347 ; X64-SSE41-NEXT: jne .LBB70_1
1348 ; X64-SSE41-NEXT: # %bb.2:
1349 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1350 ; X64-SSE41-NEXT: retq
1351 ; X64-SSE41-NEXT: .LBB70_1:
1352 ; X64-SSE41-NEXT: addss %xmm0, %xmm1
1353 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1354 ; X64-SSE41-NEXT: retq
1356 ; X64-AVX1-LABEL: add_ss_mask:
1357 ; X64-AVX1: # %bb.0:
1358 ; X64-AVX1-NEXT: testb $1, %dil
1359 ; X64-AVX1-NEXT: je .LBB70_2
1360 ; X64-AVX1-NEXT: # %bb.1:
1361 ; X64-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
1362 ; X64-AVX1-NEXT: .LBB70_2:
1363 ; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1364 ; X64-AVX1-NEXT: retq
1366 ; X64-AVX512-LABEL: add_ss_mask:
1367 ; X64-AVX512: # %bb.0:
1368 ; X64-AVX512-NEXT: kmovw %edi, %k1
1369 ; X64-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
1370 ; X64-AVX512-NEXT: vmovaps %xmm2, %xmm0
1371 ; X64-AVX512-NEXT: retq
1372 %1 = extractelement <4 x float> %a, i64 0
1373 %2 = extractelement <4 x float> %b, i64 0
1374 %3 = fadd float %1, %2
1375 %4 = extractelement <4 x float> %c, i32 0
1376 %5 = bitcast i8 %mask to <8 x i1>
1377 %6 = extractelement <8 x i1> %5, i64 0
1378 %7 = select i1 %6, float %3, float %4
1379 %8 = insertelement <4 x float> %a, float %7, i64 0
; Scalar masked double add. Lane 0 of %a and lane 0 of %b are added;
; element 0 of the i8 %mask (bitcast to <8 x i1>) selects between that sum
; and lane 0 of %c; the chosen value is inserted into lane 0 of %a. The
; assertions below expect the SSE2, SSE4.1 and AVX runs to test bit 0 of
; the mask and branch, and the AVX512 runs to use a write-masked vaddsd
; under %k1 with no branch.
1383 define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1384 ; X86-SSE2-LABEL: add_sd_mask:
1385 ; X86-SSE2: # %bb.0:
1386 ; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
1387 ; X86-SSE2-NEXT: jne .LBB71_1
1388 ; X86-SSE2-NEXT: # %bb.2:
1389 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1390 ; X86-SSE2-NEXT: retl
1391 ; X86-SSE2-NEXT: .LBB71_1:
1392 ; X86-SSE2-NEXT: addsd %xmm0, %xmm1
1393 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1394 ; X86-SSE2-NEXT: retl
1396 ; X86-SSE41-LABEL: add_sd_mask:
1397 ; X86-SSE41: # %bb.0:
1398 ; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp)
1399 ; X86-SSE41-NEXT: jne .LBB71_1
1400 ; X86-SSE41-NEXT: # %bb.2:
1401 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1402 ; X86-SSE41-NEXT: retl
1403 ; X86-SSE41-NEXT: .LBB71_1:
1404 ; X86-SSE41-NEXT: addsd %xmm0, %xmm1
1405 ; X86-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1406 ; X86-SSE41-NEXT: retl
1408 ; X86-AVX1-LABEL: add_sd_mask:
1409 ; X86-AVX1: # %bb.0:
1410 ; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp)
1411 ; X86-AVX1-NEXT: je .LBB71_2
1412 ; X86-AVX1-NEXT: # %bb.1:
1413 ; X86-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
1414 ; X86-AVX1-NEXT: .LBB71_2:
1415 ; X86-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1416 ; X86-AVX1-NEXT: retl
1418 ; X86-AVX512-LABEL: add_sd_mask:
1419 ; X86-AVX512: # %bb.0:
1420 ; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al
1421 ; X86-AVX512-NEXT: kmovw %eax, %k1
1422 ; X86-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1423 ; X86-AVX512-NEXT: vmovapd %xmm2, %xmm0
1424 ; X86-AVX512-NEXT: retl
1426 ; X64-SSE2-LABEL: add_sd_mask:
1427 ; X64-SSE2: # %bb.0:
1428 ; X64-SSE2-NEXT: testb $1, %dil
1429 ; X64-SSE2-NEXT: jne .LBB71_1
1430 ; X64-SSE2-NEXT: # %bb.2:
1431 ; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1432 ; X64-SSE2-NEXT: retq
1433 ; X64-SSE2-NEXT: .LBB71_1:
1434 ; X64-SSE2-NEXT: addsd %xmm0, %xmm1
1435 ; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1436 ; X64-SSE2-NEXT: retq
1438 ; X64-SSE41-LABEL: add_sd_mask:
1439 ; X64-SSE41: # %bb.0:
1440 ; X64-SSE41-NEXT: testb $1, %dil
1441 ; X64-SSE41-NEXT: jne .LBB71_1
1442 ; X64-SSE41-NEXT: # %bb.2:
1443 ; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1444 ; X64-SSE41-NEXT: retq
1445 ; X64-SSE41-NEXT: .LBB71_1:
1446 ; X64-SSE41-NEXT: addsd %xmm0, %xmm1
1447 ; X64-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1448 ; X64-SSE41-NEXT: retq
1450 ; X64-AVX1-LABEL: add_sd_mask:
1451 ; X64-AVX1: # %bb.0:
1452 ; X64-AVX1-NEXT: testb $1, %dil
1453 ; X64-AVX1-NEXT: je .LBB71_2
1454 ; X64-AVX1-NEXT: # %bb.1:
1455 ; X64-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
1456 ; X64-AVX1-NEXT: .LBB71_2:
1457 ; X64-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1458 ; X64-AVX1-NEXT: retq
1460 ; X64-AVX512-LABEL: add_sd_mask:
1461 ; X64-AVX512: # %bb.0:
1462 ; X64-AVX512-NEXT: kmovw %edi, %k1
1463 ; X64-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
1464 ; X64-AVX512-NEXT: vmovapd %xmm2, %xmm0
1465 ; X64-AVX512-NEXT: retq
1466 %1 = extractelement <2 x double> %a, i64 0
1467 %2 = extractelement <2 x double> %b, i64 0
1468 %3 = fadd double %1, %2
1469 %4 = extractelement <2 x double> %c, i32 0
1470 %5 = bitcast i8 %mask to <8 x i1>
1471 %6 = extractelement <8 x i1> %5, i64 0
1472 %7 = select i1 %6, double %3, double %4
1473 %8 = insertelement <2 x double> %a, double %7, i64 0