1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X64
3 ; RUN: llc < %s -mtriple=i686-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X86
; fneg of <4 x float>, extract lane 0 — expect a narrow 128-bit xor with the sign mask.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fneg_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fneg_v4f32:
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-LABEL: fneg_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
  %v = fneg <4 x float> %x
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fneg of <4 x double>, extract lane 0 — only xmm ops plus vzeroupper should remain.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fneg_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fneg_v4f64:
; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X64-NEXT: # xmm1 = mem[0,0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fneg_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X86-NEXT: # xmm1 = mem[0,0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: vzeroupper
  %v = fneg <4 x double> %x
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fadd of <4 x float>, extract lane 0 — should scalarize to a single vaddss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fadd_v4f32:
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-LABEL: fadd_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fadd of <4 x double>, extract lane 0 — should scalarize to a single vaddsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fadd_v4f64:
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fadd_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: vzeroupper
  %v = fadd <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fsub of <4 x float>, extract lane 0 — should scalarize to a single vsubss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fsub_v4f32:
; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-LABEL: fsub_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = fsub <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fsub of <4 x double>, extract lane 0 — should scalarize to a single vsubsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fsub_v4f64:
; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fsub_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = fsub <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fmul of <4 x float>, extract lane 0 — should scalarize to a single vmulss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmul_v4f32:
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-LABEL: fmul_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = fmul <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fmul of <4 x double>, extract lane 0 — should scalarize to a single vmulsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmul_v4f64:
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fmul_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = fmul <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fdiv of <4 x float>, extract lane 0 — should scalarize to a single vdivss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fdiv_v4f32:
; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X86-LABEL: fdiv_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = fdiv <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fdiv of <4 x double>, extract lane 0 — should scalarize to a single vdivsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fdiv_v4f64:
; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fdiv_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = fdiv <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; frem of <4 x float>, extract lane 0 — should become a single scalar fmodf libcall.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @frem_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: frem_v4f32:
; X64-NEXT: jmp fmodf@PLT # TAILCALL
; X86-LABEL: frem_v4f32:
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: addl $8, %esp
  %v = frem <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; frem of <4 x double>, extract lane 0 — should become a single scalar fmod libcall.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: frem_v4f64:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: # kill: def $xmm1 killed $xmm1 killed $ymm1
; X64-NEXT: vzeroupper
; X64-NEXT: jmp fmod@PLT # TAILCALL
; X86-LABEL: frem_v4f64:
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: vmovups %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll fmod
; X86-NEXT: addl $16, %esp
  %v = frem <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; Vector fcmp ogt + extract lane 0 — should narrow to a scalar vucomiss/seta.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fcmp_v4f32:
; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ogt <4 x float> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}
; Vector fcmp ugt + extract lane 0 — should narrow to a scalar vucomisd/setb (operands commuted).
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fcmp_v4f64:
; CHECK-NEXT: vucomisd %xmm0, %xmm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ugt <4 x double> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}
294 ; If we do the fcmp transform late, make sure we have the right types.
295 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13700
; Late fcmp narrowing must keep types consistent (oss-fuzz 13700); stores the i1 result.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define void @extsetcc(<4 x float> %x) {
; X64-LABEL: extsetcc:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vucomiss %xmm1, %xmm0
; X64-NEXT: setb (%rax)
; X86-LABEL: extsetcc:
; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vucomiss %xmm1, %xmm0
; X86-NEXT: setb (%eax)
  %cmp = fcmp ult <4 x float> %x, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %e = extractelement <4 x i1> %cmp, i1 0
  store i1 %e, ptr undef
  ret void
}
318 ; This used to crash by creating a setcc with an i64 condition on a 32-bit target.
; Regression test: used to crash creating an i64-condition setcc on a 32-bit target.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
; X64-LABEL: extvselectsetcc_crash:
; X64-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
; X64-NEXT: vandpd %xmm2, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; X86-LABEL: extvselectsetcc_crash:
; X86-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
; X86-NEXT: vandpd %xmm2, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
  %cmp = fcmp oeq <2 x double> %x, <double 5.0, double 5.0>
  %s = select <2 x i1> %cmp, <2 x double> <double 1.0, double undef>, <2 x double> <double 0.0, double undef>
  %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> <i32 0, i32 2, i32 3>
  ret <3 x double> %r
}
; select of vector fcmp one + extract lane 0 — should use scalar cmpss + blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) nounwind {
; X64-LABEL: select_fcmp_v4f32:
; X64-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X86-LABEL: select_fcmp_v4f32:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovaps 8(%ebp), %xmm3
; X86-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
  %c = fcmp one <4 x float> %x, %y
  %s = select <4 x i1> %c, <4 x float> %z, <4 x float> %w
  %r = extractelement <4 x float> %s, i32 0
  ret float %r
}
; select of vector fcmp ule + extract lane 0 — should use scalar cmpsd + blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @select_fcmp_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, <4 x double> %w) nounwind {
; X64-LABEL: select_fcmp_v4f64:
; X64-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: select_fcmp_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-32, %esp
; X86-NEXT: subl $32, %esp
; X86-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %c = fcmp ule <4 x double> %x, %y
  %s = select <4 x i1> %c, <4 x double> %z, <4 x double> %w
  %r = extractelement <4 x double> %s, i32 0
  ret double %r
}
; llvm.sqrt on <4 x float>, extract lane 0 — should scalarize to vsqrtss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsqrt_v4f32:
; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; X86-LABEL: fsqrt_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.sqrt on <4 x double>, extract lane 0 — should scalarize to vsqrtsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fsqrt_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsqrt_v4f64:
; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fsqrt_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.sin on <4 x float>, extract lane 0 — should become a single scalar sinf libcall.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fsin_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsin_v4f32:
; X64-NEXT: jmp sinf@PLT # TAILCALL
; X86-LABEL: fsin_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll sinf
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.sin on <4 x double>, extract lane 0 — should become a single scalar sin libcall.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fsin_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsin_v4f64:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: jmp sin@PLT # TAILCALL
; X86-LABEL: fsin_v4f64:
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll sin
; X86-NEXT: addl $8, %esp
  %v = call <4 x double> @llvm.sin.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.fma on <4 x float>, extract lane 0 — should scalarize to vfmadd213ss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {
; X64-LABEL: fma_v4f32:
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-LABEL: fma_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.fma on <4 x double>, extract lane 0 — should scalarize to vfmadd213sd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {
; X64-LABEL: fma_v4f64:
; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X86-LABEL: fma_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovsd %xmm1, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.fabs on <4 x float>, extract lane 0 — expect a 128-bit and with the no-sign mask.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fabs_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fabs_v4f32:
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X64-NEXT: vandps %xmm1, %xmm0, %xmm0
; X86-LABEL: fabs_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X86-NEXT: vandps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.fabs on <4 x double>, extract lane 0 — expect a 128-bit and from the constant pool.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fabs_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fabs_v4f64:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fabs_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.maxnum on <4 x float>, extract lane 0 — scalar maxss plus NaN fixup via blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaxnum_v4f32:
; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-LABEL: fmaxnum_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.maxnum on <4 x double>, extract lane 0 — scalar maxsd plus NaN fixup via blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaxnum_v4f64:
; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fmaxnum_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.minnum on <4 x float>, extract lane 0 — scalar minss plus NaN fixup via blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminnum_v4f32:
; X64-NEXT: vminss %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-LABEL: fminnum_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.minnum on <4 x double>, extract lane 0 — scalar minsd plus NaN fixup via blendv.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminnum_v4f64:
; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fminnum_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.maximum on <4 x float>, extract lane 0 — scalar max with signed-zero/NaN handling.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaximum_v4f32:
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: js .LBB30_1
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: jmp .LBB30_3
; X64-NEXT: .LBB30_1:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: vmovdqa %xmm0, %xmm1
; X64-NEXT: .LBB30_3:
; X64-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; X64-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; X64-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-LABEL: fmaximum_v4f32:
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB30_1
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: jmp .LBB30_3
; X86-NEXT: .LBB30_1:
; X86-NEXT: vmovdqa %xmm1, %xmm2
; X86-NEXT: vmovdqa %xmm0, %xmm1
; X86-NEXT: .LBB30_3:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.maximum on <4 x double>, extract lane 0 — scalar max with signed-zero/NaN handling.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaximum_v4f64:
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: testq %rax, %rax
; X64-NEXT: js .LBB31_1
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: jmp .LBB31_3
; X64-NEXT: .LBB31_1:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: vmovdqa %xmm0, %xmm1
; X64-NEXT: .LBB31_3:
; X64-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
; X64-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fmaximum_v4f64:
; X86-NEXT: vextractps $1, %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB31_1
; X86-NEXT: vmovapd %xmm0, %xmm2
; X86-NEXT: jmp .LBB31_3
; X86-NEXT: .LBB31_1:
; X86-NEXT: vmovapd %xmm1, %xmm2
; X86-NEXT: vmovapd %xmm0, %xmm1
; X86-NEXT: .LBB31_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
; X86-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.minimum on <4 x float>, extract lane 0 — scalar min with signed-zero/NaN handling.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminimum_v4f32:
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: js .LBB32_1
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: jmp .LBB32_3
; X64-NEXT: .LBB32_1:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: .LBB32_3:
; X64-NEXT: vminss %xmm2, %xmm0, %xmm1
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-LABEL: fminimum_v4f32:
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB32_1
; X86-NEXT: vmovdqa %xmm1, %xmm2
; X86-NEXT: jmp .LBB32_3
; X86-NEXT: .LBB32_1:
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: .LBB32_3:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.minimum on <4 x double>, extract lane 0 — scalar min with signed-zero/NaN handling.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminimum_v4f64:
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: testq %rax, %rax
; X64-NEXT: js .LBB33_1
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: jmp .LBB33_3
; X64-NEXT: .LBB33_1:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: .LBB33_3:
; X64-NEXT: vminsd %xmm2, %xmm0, %xmm1
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: fminimum_v4f64:
; X86-NEXT: vextractps $1, %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB33_1
; X86-NEXT: vmovapd %xmm1, %xmm2
; X86-NEXT: jmp .LBB33_3
; X86-NEXT: .LBB33_1:
; X86-NEXT: vmovapd %xmm0, %xmm2
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: .LBB33_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fcmp ogt + select (maxps pattern), extract lane 0 — should scalarize to vmaxss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: maxps_v4f32:
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-LABEL: maxps_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %cmp = fcmp ogt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fcmp ogt + select (maxpd pattern), extract lane 0 — should scalarize to vmaxsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @maxpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: maxpd_v4f64:
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: maxpd_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %cmp = fcmp ogt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; fcmp olt + select (minps pattern), extract lane 0 — should scalarize to vminss.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @minps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: minps_v4f32:
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-LABEL: minps_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %cmp = fcmp olt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; fcmp olt + select (minpd pattern), extract lane 0 — should scalarize to vminsd.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @minpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: minpd_v4f64:
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: minpd_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %cmp = fcmp olt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.copysign on <4 x float>, extract lane 0 — sign-bit and/or with 128-bit masks.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: copysign_v4f32:
; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
; X64-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-LABEL: copysign_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
; X86-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.copysign on <4 x double>, extract lane 0 — sign-bit and/or with constant-pool masks.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: copysign_v4f64:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: copysign_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.floor on <4 x float>, extract lane 0 — should scalarize to vroundss $9.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @floor_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: floor_v4f32:
; X64-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; X86-LABEL: floor_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.floor on <4 x double>, extract lane 0 — should scalarize to vroundsd $9.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @floor_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: floor_v4f64:
; X64-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: floor_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.ceil on <4 x float>, extract lane 0 — should scalarize to vroundss $10.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @ceil_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: ceil_v4f32:
; X64-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; X86-LABEL: ceil_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.ceil on <4 x double>, extract lane 0 — should scalarize to vroundsd $10.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @ceil_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: ceil_v4f64:
; X64-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: ceil_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.trunc on <4 x float>, extract lane 0 — should scalarize to vroundss $11.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @trunc_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: trunc_v4f32:
; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-LABEL: trunc_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.trunc on <4 x double>, extract lane 0 — should scalarize to vroundsd $11.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @trunc_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: trunc_v4f64:
; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: trunc_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.rint on <4 x float>, extract lane 0 — should scalarize to vroundss $4.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @rint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rint_v4f32:
; X64-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; X86-LABEL: rint_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; llvm.rint on <4 x double>, extract lane 0 — should scalarize to vroundsd $4.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define double @rint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: rint_v4f64:
; X64-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: rint_v4f64:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
  %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
; llvm.nearbyint on <4 x float>, extract lane 0 — should scalarize to vroundss $12.
; NOTE(review): restored missing 'ret'/'}'; CHECK lines look truncated — regenerate with utils/update_llc_test_checks.py.
define float @nearbyint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: nearbyint_v4f32:
; X64-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; X86-LABEL: nearbyint_v4f32:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
  %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
; nearbyint for <4 x double>: scalar vroundsd $12 (MXCSR mode, exceptions
; suppressed), vzeroupper for the YMM argument, x87 return on i686.
1202 define double @nearbyint_v4f64(<4 x double> %x) nounwind {
1203 ; X64-LABEL: nearbyint_v4f64:
1205 ; X64-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
1206 ; X64-NEXT: vzeroupper
1209 ; X86-LABEL: nearbyint_v4f64:
1211 ; X86-NEXT: pushl %ebp
1212 ; X86-NEXT: movl %esp, %ebp
1213 ; X86-NEXT: andl $-8, %esp
1214 ; X86-NEXT: subl $8, %esp
1215 ; X86-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
1216 ; X86-NEXT: vmovsd %xmm0, (%esp)
1217 ; X86-NEXT: fldl (%esp)
1218 ; X86-NEXT: movl %ebp, %esp
1219 ; X86-NEXT: popl %ebp
1220 ; X86-NEXT: vzeroupper
1222 %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)
1223 %r = extractelement <4 x double> %v, i32 0
; llvm.round (round-half-away-from-zero) has no direct ROUNDSS encoding, so
; it expands to the copysign trick: isolate x's sign bit (and with -0.0),
; OR in 0.49999997 (0.5 - 1ulp) to get a same-signed bias, add it to x,
; then truncate with vroundss $11. Note the and/or legs stay vector ops on
; xmm while only the add and round are scalar.
1227 define float @round_v4f32(<4 x float> %x) nounwind {
1228 ; X64-LABEL: round_v4f32:
1230 ; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1231 ; X64-NEXT: vandps %xmm1, %xmm0, %xmm1
1232 ; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
1233 ; X64-NEXT: vorps %xmm2, %xmm1, %xmm1
1234 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
1235 ; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
1238 ; X86-LABEL: round_v4f32:
1240 ; X86-NEXT: pushl %eax
1241 ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1242 ; X86-NEXT: vandps %xmm1, %xmm0, %xmm1
1243 ; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
1244 ; X86-NEXT: vorps %xmm2, %xmm1, %xmm1
1245 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
1246 ; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
1247 ; X86-NEXT: vmovss %xmm0, (%esp)
1248 ; X86-NEXT: flds (%esp)
1249 ; X86-NEXT: popl %eax
1251 %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
1252 %r = extractelement <4 x float> %v, i32 0
; Double-precision version of the round expansion: sign mask comes from a
; constant-pool load (vandpd {{\.?LCPI..}}) rather than a broadcast, the
; bias is 0.49999999999999994 (0.5 - 1ulp for f64) splatted via vmovddup,
; then vaddsd + vroundsd $11 (truncate). YMM arg forces vzeroupper; i686
; returns through the x87 stack from an 8-byte-aligned slot.
1256 define double @round_v4f64(<4 x double> %x) nounwind {
1257 ; X64-LABEL: round_v4f64:
1259 ; X64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1260 ; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
1261 ; X64-NEXT: # xmm2 = mem[0,0]
1262 ; X64-NEXT: vorpd %xmm2, %xmm1, %xmm1
1263 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1264 ; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
1265 ; X64-NEXT: vzeroupper
1268 ; X86-LABEL: round_v4f64:
1270 ; X86-NEXT: pushl %ebp
1271 ; X86-NEXT: movl %esp, %ebp
1272 ; X86-NEXT: andl $-8, %esp
1273 ; X86-NEXT: subl $8, %esp
1274 ; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
1275 ; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
1276 ; X86-NEXT: # xmm2 = mem[0,0]
1277 ; X86-NEXT: vorpd %xmm2, %xmm1, %xmm1
1278 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1279 ; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
1280 ; X86-NEXT: vmovsd %xmm0, (%esp)
1281 ; X86-NEXT: fldl (%esp)
1282 ; X86-NEXT: movl %ebp, %esp
1283 ; X86-NEXT: popl %ebp
1284 ; X86-NEXT: vzeroupper
1286 %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
1287 %r = extractelement <4 x double> %v, i32 0
; The 128-bit x86 rcp intrinsic with only lane 0 demanded narrows from
; vrcpps to the scalar vrcpss form.
1291 define float @rcp_v4f32(<4 x float> %x) nounwind {
1292 ; X64-LABEL: rcp_v4f32:
1294 ; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0
1297 ; X86-LABEL: rcp_v4f32:
1299 ; X86-NEXT: pushl %eax
1300 ; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0
1301 ; X86-NEXT: vmovss %xmm0, (%esp)
1302 ; X86-NEXT: flds (%esp)
1303 ; X86-NEXT: popl %eax
1305 %v = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
1306 %r = extractelement <4 x float> %v, i32 0
; The 256-bit rcp intrinsic is NOT narrowed: the checks pin the full
; vrcpps %ymm0 with lane 0 then taken from the low xmm half (the X64
; "kill" comment is the implicit ymm->xmm extract).
1310 define float @rcp_v8f32(<8 x float> %x) nounwind {
1311 ; X64-LABEL: rcp_v8f32:
1313 ; X64-NEXT: vrcpps %ymm0, %ymm0
1314 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1315 ; X64-NEXT: vzeroupper
1318 ; X86-LABEL: rcp_v8f32:
1320 ; X86-NEXT: pushl %eax
1321 ; X86-NEXT: vrcpps %ymm0, %ymm0
1322 ; X86-NEXT: vmovss %xmm0, (%esp)
1323 ; X86-NEXT: flds (%esp)
1324 ; X86-NEXT: popl %eax
1325 ; X86-NEXT: vzeroupper
1327 %v = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %x)
1328 %r = extractelement <8 x float> %v, i32 0
; Same demanded-elements narrowing as rcp_v4f32, for the reciprocal
; square-root estimate: vrsqrtps -> scalar vrsqrtss.
1332 define float @rsqrt_v4f32(<4 x float> %x) nounwind {
1333 ; X64-LABEL: rsqrt_v4f32:
1335 ; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
1338 ; X86-LABEL: rsqrt_v4f32:
1340 ; X86-NEXT: pushl %eax
1341 ; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
1342 ; X86-NEXT: vmovss %xmm0, (%esp)
1343 ; X86-NEXT: flds (%esp)
1344 ; X86-NEXT: popl %eax
1346 %v = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %x)
1347 %r = extractelement <4 x float> %v, i32 0
; 256-bit rsqrt intrinsic: like rcp_v8f32, the full vrsqrtps %ymm0 is kept
; and lane 0 is read from the low xmm half.
1351 define float @rsqrt_v8f32(<8 x float> %x) nounwind {
1352 ; X64-LABEL: rsqrt_v8f32:
1354 ; X64-NEXT: vrsqrtps %ymm0, %ymm0
1355 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1356 ; X64-NEXT: vzeroupper
1359 ; X86-LABEL: rsqrt_v8f32:
1361 ; X86-NEXT: pushl %eax
1362 ; X86-NEXT: vrsqrtps %ymm0, %ymm0
1363 ; X86-NEXT: vmovss %xmm0, (%esp)
1364 ; X86-NEXT: flds (%esp)
1365 ; X86-NEXT: popl %eax
1366 ; X86-NEXT: vzeroupper
1368 %v = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %x)
1369 %r = extractelement <8 x float> %v, i32 0
; Declarations for every intrinsic exercised above: generic FP intrinsics
; first, then the target-specific SSE/AVX estimate intrinsics.
1373 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
1374 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
1375 declare <4 x float> @llvm.sin.v4f32(<4 x float>)
1376 declare <4 x double> @llvm.sin.v4f64(<4 x double>)
1377 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
1378 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
1379 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
1380 declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
1381 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
1382 declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
1383 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
1384 declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
1385 declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
1386 declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
1387 declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
1388 declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
1389 declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
1390 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
1391 declare <4 x float> @llvm.floor.v4f32(<4 x float>)
1392 declare <4 x double> @llvm.floor.v4f64(<4 x double>)
1393 declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
1394 declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
1395 declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
1396 declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
1397 declare <4 x float> @llvm.rint.v4f32(<4 x float>)
1398 declare <4 x double> @llvm.rint.v4f64(<4 x double>)
1399 declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
1400 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
1401 declare <4 x float> @llvm.round.v4f32(<4 x float>)
1402 declare <4 x double> @llvm.round.v4f64(<4 x double>)
1404 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
1405 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>)
1406 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
1407 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>)