; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; llvm.floor lowers to round* with imm $9 (round toward -inf, suppress
; precision exceptions). 256-bit ops split into two 128-bit rounds pre-AVX;
; 512-bit ops use vrndscale on AVX512.

define <2 x double> @floor_v2f64(<2 x double> %p) {
; SSE41-LABEL: floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)

define <4 x float> @floor_v4f32(<4 x float> %p) {
; SSE41-LABEL: floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)

define <4 x double> @floor_v4f64(<4 x double> %p){
; SSE41-LABEL: floor_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)

define <8 x float> @floor_v8f32(<8 x float> %p) {
; SSE41-LABEL: floor_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)

define <8 x double> @floor_v8f64(<8 x double> %p){
; SSE41-LABEL: floor_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.floor.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)

define <16 x float> @floor_v16f32(<16 x float> %p) {
; SSE41-LABEL: floor_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    roundps $9, %xmm2, %xmm2
; SSE41-NEXT:    roundps $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.floor.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
; llvm.ceil lowers to round* with imm $10 (round toward +inf, suppress
; precision exceptions). The _load variants check that AVX folds the
; (unaligned) load into the rounding instruction's memory operand.

define <2 x double> @ceil_v2f64(<2 x double> %p) {
; SSE41-LABEL: ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)

define <2 x double> @ceil_v2f64_load(<2 x double>* %ptr) {
; SSE41-LABEL: ceil_v2f64_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movupd (%rdi), %xmm0
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <2 x double>, <2 x double>* %ptr, align 1
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}

define <4 x float> @ceil_v4f32(<4 x float> %p) {
; SSE41-LABEL: ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)

define <4 x float> @ceil_v4f32_load(<4 x float>* %ptr) {
; SSE41-LABEL: ceil_v4f32_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <4 x float>, <4 x float>* %ptr, align 1
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}

define <4 x double> @ceil_v4f64(<4 x double> %p) {
; SSE41-LABEL: ceil_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)

define <8 x float> @ceil_v8f32(<8 x float> %p) {
; SSE41-LABEL: ceil_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)

define <8 x double> @ceil_v8f64(<8 x double> %p){
; SSE41-LABEL: ceil_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)

define <16 x float> @ceil_v16f32(<16 x float> %p) {
; SSE41-LABEL: ceil_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
; SSE41-NEXT:    roundps $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
; llvm.trunc lowers to round* with imm $11 (round toward zero, suppress
; precision exceptions).

define <2 x double> @trunc_v2f64(<2 x double> %p) {
; SSE41-LABEL: trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)

define <4 x float> @trunc_v4f32(<4 x float> %p) {
; SSE41-LABEL: trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)

define <4 x double> @trunc_v4f64(<4 x double> %p) {
; SSE41-LABEL: trunc_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)

define <8 x float> @trunc_v8f32(<8 x float> %p) {
; SSE41-LABEL: trunc_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)

define <8 x double> @trunc_v8f64(<8 x double> %p){
; SSE41-LABEL: trunc_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)

define <16 x float> @trunc_v16f32(<16 x float> %p) {
; SSE41-LABEL: trunc_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    roundps $11, %xmm2, %xmm2
; SSE41-NEXT:    roundps $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
; llvm.rint lowers to round* with imm $4 (use MXCSR rounding mode, report
; precision exceptions).

define <2 x double> @rint_v2f64(<2 x double> %p) {
; SSE41-LABEL: rint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)

define <4 x float> @rint_v4f32(<4 x float> %p) {
; SSE41-LABEL: rint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)

define <4 x double> @rint_v4f64(<4 x double> %p) {
; SSE41-LABEL: rint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)

define <8 x float> @rint_v8f32(<8 x float> %p) {
; SSE41-LABEL: rint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)

define <8 x double> @rint_v8f64(<8 x double> %p){
; SSE41-LABEL: rint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $4, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)

define <16 x float> @rint_v16f32(<16 x float> %p) {
; SSE41-LABEL: rint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    roundps $4, %xmm2, %xmm2
; SSE41-NEXT:    roundps $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    vroundps $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)
; llvm.nearbyint lowers to round* with imm $12 (use MXCSR rounding mode,
; suppress precision exceptions).

define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
; SSE41-LABEL: nearbyint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)

define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
; SSE41-LABEL: nearbyint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)

define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
; SSE41-LABEL: nearbyint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)

define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
; SSE41-LABEL: nearbyint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)

define <8 x double> @nearbyint_v8f64(<8 x double> %p){
; SSE41-LABEL: nearbyint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $12, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)

define <16 x float> @nearbyint_v16f32(<16 x float> %p) {
; SSE41-LABEL: nearbyint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    roundps $12, %xmm2, %xmm2
; SSE41-NEXT:    roundps $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    vroundps $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
; Constant folding: rounding of constant vectors must fold to a load of the
; pre-rounded constant (no round/vround instruction should be emitted).

define <2 x double> @const_floor_v2f64() {
; SSE41-LABEL: const_floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_floor_v4f32() {
; SSE41-LABEL: const_floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_ceil_v2f64() {
; SSE41-LABEL: const_ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_ceil_v4f32() {
; SSE41-LABEL: const_ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_trunc_v2f64() {
; SSE41-LABEL: const_trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_trunc_v4f32() {
; SSE41-LABEL: const_trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}
; Scalar and masked instructions
; Scalar floor merged into the low lane of a second vector: lowers to
; roundss/roundsd with imm $1 (round toward -inf, legacy encoding).

define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $1, %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $1, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $1, %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.floor.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $1, %xmm0, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $1, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $1, %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.floor.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.floor.f64(double %s)
; Masked 128-bit floor (select of rounded vs. passthrough / zero). AVX512VL
; folds the select into a masked vrndscale; plain AVX512F must widen to zmm
; for the mask compare and blend.

define <4 x float> @floor_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm0, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @floor_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}
; Masked 256-bit floor. AVX512VL folds into masked vrndscale on ymm; plain
; AVX512F widens to zmm for the compare/blend (no vzeroupper needed: the
; function returns in ymm, keeping upper state live).

define <8 x float> @floor_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @floor_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm1, %xmm3
; SSE41-NEXT:    cmpeqps %xmm0, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @floor_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @floor_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm3
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; Merge-masked 512-bit floor: lanes where x == y take floor(x), others keep y.
; AVX512 folds this to a single merge-masked vrndscaleps (imm 9 = round toward
; -inf + suppress precision exceptions); SSE41/AVX emulate via cmp + blend.
1183 define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
1184 ; SSE41-LABEL: floor_mask_512_ps:
1186 ; SSE41-NEXT: roundps $9, %xmm3, %xmm8
1187 ; SSE41-NEXT: cmpeqps %xmm7, %xmm3
1188 ; SSE41-NEXT: roundps $9, %xmm2, %xmm9
1189 ; SSE41-NEXT: cmpeqps %xmm6, %xmm2
1190 ; SSE41-NEXT: roundps $9, %xmm1, %xmm10
1191 ; SSE41-NEXT: cmpeqps %xmm5, %xmm1
1192 ; SSE41-NEXT: roundps $9, %xmm0, %xmm11
1193 ; SSE41-NEXT: cmpeqps %xmm4, %xmm0
1194 ; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4
1195 ; SSE41-NEXT: movaps %xmm1, %xmm0
1196 ; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
1197 ; SSE41-NEXT: movaps %xmm2, %xmm0
1198 ; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6
1199 ; SSE41-NEXT: movaps %xmm3, %xmm0
1200 ; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
1201 ; SSE41-NEXT: movaps %xmm4, %xmm0
1202 ; SSE41-NEXT: movaps %xmm5, %xmm1
1203 ; SSE41-NEXT: movaps %xmm6, %xmm2
1204 ; SSE41-NEXT: movaps %xmm7, %xmm3
1207 ; AVX-LABEL: floor_mask_512_ps:
1209 ; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4
1210 ; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5
1211 ; AVX-NEXT: vroundps $9, %ymm1, %ymm1
1212 ; AVX-NEXT: vroundps $9, %ymm0, %ymm0
1213 ; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0
1214 ; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1
1217 ; AVX512-LABEL: floor_mask_512_ps:
1219 ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1220 ; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm1 {%k1}
1221 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
1223 %k = fcmp oeq <16 x float> %x, %y
1224 %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
1225 %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
1226 ret <16 x float> %res
; Zero-masked 512-bit floor: lanes where x == y take floor(x), others are zeroed.
; AVX512 uses vrndscaleps with {z}; SSE41/AVX emulate the zeroing select with an AND.
1229 define <16 x float> @floor_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
1230 ; SSE41-LABEL: floor_maskz_512_ps:
1232 ; SSE41-NEXT: cmpeqps %xmm3, %xmm7
1233 ; SSE41-NEXT: cmpeqps %xmm2, %xmm6
1234 ; SSE41-NEXT: cmpeqps %xmm1, %xmm5
1235 ; SSE41-NEXT: cmpeqps %xmm0, %xmm4
1236 ; SSE41-NEXT: roundps $9, %xmm3, %xmm3
1237 ; SSE41-NEXT: andps %xmm7, %xmm3
1238 ; SSE41-NEXT: roundps $9, %xmm2, %xmm2
1239 ; SSE41-NEXT: andps %xmm6, %xmm2
1240 ; SSE41-NEXT: roundps $9, %xmm1, %xmm1
1241 ; SSE41-NEXT: andps %xmm5, %xmm1
1242 ; SSE41-NEXT: roundps $9, %xmm0, %xmm0
1243 ; SSE41-NEXT: andps %xmm4, %xmm0
1246 ; AVX-LABEL: floor_maskz_512_ps:
1248 ; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm3
1249 ; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm2
1250 ; AVX-NEXT: vroundps $9, %ymm1, %ymm1
1251 ; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
1252 ; AVX-NEXT: vroundps $9, %ymm0, %ymm0
1253 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm0
1256 ; AVX512-LABEL: floor_maskz_512_ps:
1258 ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1259 ; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
1261 %k = fcmp oeq <16 x float> %x, %y
1262 %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
1263 %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
1264 ret <16 x float> %res
; Merge-masked 512-bit double floor; same pattern as the v16f32 case but with
; the pd instruction family (roundpd/vroundpd/vrndscalepd, imm 9).
1267 define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
1268 ; SSE41-LABEL: floor_mask_512_pd:
1270 ; SSE41-NEXT: roundpd $9, %xmm3, %xmm8
1271 ; SSE41-NEXT: cmpeqpd %xmm7, %xmm3
1272 ; SSE41-NEXT: roundpd $9, %xmm2, %xmm9
1273 ; SSE41-NEXT: cmpeqpd %xmm6, %xmm2
1274 ; SSE41-NEXT: roundpd $9, %xmm1, %xmm10
1275 ; SSE41-NEXT: cmpeqpd %xmm5, %xmm1
1276 ; SSE41-NEXT: roundpd $9, %xmm0, %xmm11
1277 ; SSE41-NEXT: cmpeqpd %xmm4, %xmm0
1278 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
1279 ; SSE41-NEXT: movapd %xmm1, %xmm0
1280 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
1281 ; SSE41-NEXT: movapd %xmm2, %xmm0
1282 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
1283 ; SSE41-NEXT: movapd %xmm3, %xmm0
1284 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
1285 ; SSE41-NEXT: movapd %xmm4, %xmm0
1286 ; SSE41-NEXT: movapd %xmm5, %xmm1
1287 ; SSE41-NEXT: movapd %xmm6, %xmm2
1288 ; SSE41-NEXT: movapd %xmm7, %xmm3
1291 ; AVX-LABEL: floor_mask_512_pd:
1293 ; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4
1294 ; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm5
1295 ; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
1296 ; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
1297 ; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
1298 ; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
1301 ; AVX512-LABEL: floor_mask_512_pd:
1303 ; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
1304 ; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm1 {%k1}
1305 ; AVX512-NEXT: vmovapd %zmm1, %zmm0
1307 %k = fcmp oeq <8 x double> %x, %y
1308 %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
1309 %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
1310 ret <8 x double> %res
; Zero-masked 512-bit double floor; zeroing select emulated with AND on
; pre-AVX512 targets, single {z}-masked vrndscalepd on AVX512.
1313 define <8 x double> @floor_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
1314 ; SSE41-LABEL: floor_maskz_512_pd:
1316 ; SSE41-NEXT: cmpeqpd %xmm3, %xmm7
1317 ; SSE41-NEXT: cmpeqpd %xmm2, %xmm6
1318 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm5
1319 ; SSE41-NEXT: cmpeqpd %xmm0, %xmm4
1320 ; SSE41-NEXT: roundpd $9, %xmm3, %xmm3
1321 ; SSE41-NEXT: andpd %xmm7, %xmm3
1322 ; SSE41-NEXT: roundpd $9, %xmm2, %xmm2
1323 ; SSE41-NEXT: andpd %xmm6, %xmm2
1324 ; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
1325 ; SSE41-NEXT: andpd %xmm5, %xmm1
1326 ; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
1327 ; SSE41-NEXT: andpd %xmm4, %xmm0
1330 ; AVX-LABEL: floor_maskz_512_pd:
1332 ; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm3
1333 ; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm2
1334 ; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
1335 ; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
1336 ; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
1337 ; AVX-NEXT: vandpd %ymm0, %ymm2, %ymm0
1340 ; AVX512-LABEL: floor_maskz_512_pd:
1342 ; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
1343 ; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
1345 %k = fcmp oeq <8 x double> %x, %y
1346 %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
1347 %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
1348 ret <8 x double> %res
; Scalar merge-masked floor (ss): low lane = k&1 ? floor(x[0]) : w[0], upper
; lanes from y. AVX512 matches masked vrndscaless; others branch on the mask bit.
1351 define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
1352 ; SSE41-LABEL: floor_mask_ss:
1354 ; SSE41-NEXT: testb $1, %dil
1355 ; SSE41-NEXT: je LBB52_2
1356 ; SSE41-NEXT: ## %bb.1:
1357 ; SSE41-NEXT: xorps %xmm2, %xmm2
1358 ; SSE41-NEXT: roundss $9, %xmm0, %xmm2
1359 ; SSE41-NEXT: LBB52_2:
1360 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1361 ; SSE41-NEXT: movaps %xmm1, %xmm0
1364 ; AVX-LABEL: floor_mask_ss:
1366 ; AVX-NEXT: testb $1, %dil
1367 ; AVX-NEXT: je LBB52_2
1368 ; AVX-NEXT: ## %bb.1:
1369 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
1370 ; AVX-NEXT: LBB52_2:
1371 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1374 ; AVX512-LABEL: floor_mask_ss:
1376 ; AVX512-NEXT: kmovw %edi, %k1
1377 ; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
1378 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
1380 %mask = and i8 %k, 1
1381 %nmask = icmp eq i8 %mask, 0
1382 %s = extractelement <4 x float> %x, i64 0
1383 %call = tail call float @llvm.floor.f32(float %s)
1384 %dst = extractelement <4 x float> %w, i64 0
1385 %low = select i1 %nmask, float %dst, float %call
1386 %res = insertelement <4 x float> %y, float %low, i64 0
1387 ret <4 x float> %res
; Scalar zero-masked floor (ss): low lane = k&1 ? floor(x[0]) : 0.0, upper
; lanes from y; AVX512 uses the {z} form of vrndscaless.
1390 define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
1391 ; SSE41-LABEL: floor_maskz_ss:
1393 ; SSE41-NEXT: testb $1, %dil
1394 ; SSE41-NEXT: xorps %xmm2, %xmm2
1395 ; SSE41-NEXT: je LBB53_2
1396 ; SSE41-NEXT: ## %bb.1:
1397 ; SSE41-NEXT: xorps %xmm2, %xmm2
1398 ; SSE41-NEXT: roundss $9, %xmm0, %xmm2
1399 ; SSE41-NEXT: LBB53_2:
1400 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1401 ; SSE41-NEXT: movaps %xmm1, %xmm0
1404 ; AVX-LABEL: floor_maskz_ss:
1406 ; AVX-NEXT: testb $1, %dil
1407 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
1408 ; AVX-NEXT: je LBB53_2
1409 ; AVX-NEXT: ## %bb.1:
1410 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
1411 ; AVX-NEXT: LBB53_2:
1412 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1415 ; AVX512-LABEL: floor_maskz_ss:
1417 ; AVX512-NEXT: kmovw %edi, %k1
1418 ; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1420 %mask = and i8 %k, 1
1421 %nmask = icmp eq i8 %mask, 0
1422 %s = extractelement <4 x float> %x, i64 0
1423 %call = tail call float @llvm.floor.f32(float %s)
1424 %low = select i1 %nmask, float zeroinitializer, float %call
1425 %res = insertelement <4 x float> %y, float %low, i64 0
1426 ret <4 x float> %res
; Scalar merge-masked floor (sd): double-precision analogue of floor_mask_ss.
1429 define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
1430 ; SSE41-LABEL: floor_mask_sd:
1432 ; SSE41-NEXT: testb $1, %dil
1433 ; SSE41-NEXT: je LBB54_2
1434 ; SSE41-NEXT: ## %bb.1:
1435 ; SSE41-NEXT: xorps %xmm2, %xmm2
1436 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
1437 ; SSE41-NEXT: LBB54_2:
1438 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1439 ; SSE41-NEXT: movapd %xmm1, %xmm0
1442 ; AVX-LABEL: floor_mask_sd:
1444 ; AVX-NEXT: testb $1, %dil
1445 ; AVX-NEXT: je LBB54_2
1446 ; AVX-NEXT: ## %bb.1:
1447 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
1448 ; AVX-NEXT: LBB54_2:
1449 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1452 ; AVX512-LABEL: floor_mask_sd:
1454 ; AVX512-NEXT: kmovw %edi, %k1
1455 ; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
1456 ; AVX512-NEXT: vmovapd %xmm2, %xmm0
1458 %mask = and i8 %k, 1
1459 %nmask = icmp eq i8 %mask, 0
1460 %s = extractelement <2 x double> %x, i64 0
1461 %call = tail call double @llvm.floor.f64(double %s)
1462 %dst = extractelement <2 x double> %w, i64 0
1463 %low = select i1 %nmask, double %dst, double %call
1464 %res = insertelement <2 x double> %y, double %low, i64 0
1465 ret <2 x double> %res
; Scalar zero-masked floor (sd): double-precision analogue of floor_maskz_ss.
1468 define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
1469 ; SSE41-LABEL: floor_maskz_sd:
1471 ; SSE41-NEXT: testb $1, %dil
1472 ; SSE41-NEXT: xorpd %xmm2, %xmm2
1473 ; SSE41-NEXT: je LBB55_2
1474 ; SSE41-NEXT: ## %bb.1:
1475 ; SSE41-NEXT: xorps %xmm2, %xmm2
1476 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
1477 ; SSE41-NEXT: LBB55_2:
1478 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1479 ; SSE41-NEXT: movapd %xmm1, %xmm0
1482 ; AVX-LABEL: floor_maskz_sd:
1484 ; AVX-NEXT: testb $1, %dil
1485 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1486 ; AVX-NEXT: je LBB55_2
1487 ; AVX-NEXT: ## %bb.1:
1488 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
1489 ; AVX-NEXT: LBB55_2:
1490 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1493 ; AVX512-LABEL: floor_maskz_sd:
1495 ; AVX512-NEXT: kmovw %edi, %k1
1496 ; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1498 %mask = and i8 %k, 1
1499 %nmask = icmp eq i8 %mask, 0
1500 %s = extractelement <2 x double> %x, i64 0
1501 %call = tail call double @llvm.floor.f64(double %s)
1502 %low = select i1 %nmask, double zeroinitializer, double %call
1503 %res = insertelement <2 x double> %y, double %low, i64 0
1504 ret <2 x double> %res
; Like floor_mask_ss but the mask arrives as i16 and is truncated to i1,
; checking the trunc-to-mask lowering path.
1507 define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
1508 ; SSE41-LABEL: floor_mask_ss_trunc:
1510 ; SSE41-NEXT: testb $1, %dil
1511 ; SSE41-NEXT: je LBB56_2
1512 ; SSE41-NEXT: ## %bb.1:
1513 ; SSE41-NEXT: xorps %xmm2, %xmm2
1514 ; SSE41-NEXT: roundss $9, %xmm0, %xmm2
1515 ; SSE41-NEXT: LBB56_2:
1516 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1517 ; SSE41-NEXT: movaps %xmm1, %xmm0
1520 ; AVX-LABEL: floor_mask_ss_trunc:
1522 ; AVX-NEXT: testb $1, %dil
1523 ; AVX-NEXT: je LBB56_2
1524 ; AVX-NEXT: ## %bb.1:
1525 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
1526 ; AVX-NEXT: LBB56_2:
1527 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1530 ; AVX512-LABEL: floor_mask_ss_trunc:
1532 ; AVX512-NEXT: kmovw %edi, %k1
1533 ; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
1534 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
1536 %mask = trunc i16 %k to i1
1537 %s = extractelement <4 x float> %x, i64 0
1538 %call = tail call float @llvm.floor.f32(float %s)
1539 %dst = extractelement <4 x float> %w, i64 0
1540 %low = select i1 %mask, float %call, float %dst
1541 %res = insertelement <4 x float> %y, float %low, i64 0
1542 ret <4 x float> %res
; Zero-masked variant of floor_mask_ss_trunc (i16 mask truncated to i1).
1545 define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
1546 ; SSE41-LABEL: floor_maskz_ss_trunc:
1548 ; SSE41-NEXT: testb $1, %dil
1549 ; SSE41-NEXT: jne LBB57_1
1550 ; SSE41-NEXT: ## %bb.2:
1551 ; SSE41-NEXT: xorps %xmm0, %xmm0
1552 ; SSE41-NEXT: jmp LBB57_3
1553 ; SSE41-NEXT: LBB57_1:
1554 ; SSE41-NEXT: roundss $9, %xmm0, %xmm0
1555 ; SSE41-NEXT: LBB57_3:
1556 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1557 ; SSE41-NEXT: movaps %xmm1, %xmm0
1560 ; AVX-LABEL: floor_maskz_ss_trunc:
1562 ; AVX-NEXT: testb $1, %dil
1563 ; AVX-NEXT: jne LBB57_1
1564 ; AVX-NEXT: ## %bb.2:
1565 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1566 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1568 ; AVX-NEXT: LBB57_1:
1569 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
1570 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1573 ; AVX512-LABEL: floor_maskz_ss_trunc:
1575 ; AVX512-NEXT: kmovw %edi, %k1
1576 ; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1578 %mask = trunc i16 %k to i1
1579 %s = extractelement <4 x float> %x, i64 0
1580 %call = tail call float @llvm.floor.f32(float %s)
1581 %low = select i1 %mask, float %call, float zeroinitializer
1582 %res = insertelement <4 x float> %y, float %low, i64 0
1583 ret <4 x float> %res
; Double-precision analogue of floor_mask_ss_trunc (i16 mask truncated to i1).
1586 define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
1587 ; SSE41-LABEL: floor_mask_sd_trunc:
1589 ; SSE41-NEXT: testb $1, %dil
1590 ; SSE41-NEXT: je LBB58_2
1591 ; SSE41-NEXT: ## %bb.1:
1592 ; SSE41-NEXT: xorps %xmm2, %xmm2
1593 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
1594 ; SSE41-NEXT: LBB58_2:
1595 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1596 ; SSE41-NEXT: movapd %xmm1, %xmm0
1599 ; AVX-LABEL: floor_mask_sd_trunc:
1601 ; AVX-NEXT: testb $1, %dil
1602 ; AVX-NEXT: je LBB58_2
1603 ; AVX-NEXT: ## %bb.1:
1604 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
1605 ; AVX-NEXT: LBB58_2:
1606 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1609 ; AVX512-LABEL: floor_mask_sd_trunc:
1611 ; AVX512-NEXT: kmovw %edi, %k1
1612 ; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
1613 ; AVX512-NEXT: vmovapd %xmm2, %xmm0
1615 %mask = trunc i16 %k to i1
1616 %s = extractelement <2 x double> %x, i64 0
1617 %call = tail call double @llvm.floor.f64(double %s)
1618 %dst = extractelement <2 x double> %w, i64 0
1619 %low = select i1 %mask, double %call, double %dst
1620 %res = insertelement <2 x double> %y, double %low, i64 0
1621 ret <2 x double> %res
; Zero-masked double-precision analogue of floor_maskz_ss_trunc.
1624 define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
1625 ; SSE41-LABEL: floor_maskz_sd_trunc:
1627 ; SSE41-NEXT: testb $1, %dil
1628 ; SSE41-NEXT: jne LBB59_1
1629 ; SSE41-NEXT: ## %bb.2:
1630 ; SSE41-NEXT: xorpd %xmm0, %xmm0
1631 ; SSE41-NEXT: jmp LBB59_3
1632 ; SSE41-NEXT: LBB59_1:
1633 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
1634 ; SSE41-NEXT: LBB59_3:
1635 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1636 ; SSE41-NEXT: movapd %xmm1, %xmm0
1639 ; AVX-LABEL: floor_maskz_sd_trunc:
1641 ; AVX-NEXT: testb $1, %dil
1642 ; AVX-NEXT: jne LBB59_1
1643 ; AVX-NEXT: ## %bb.2:
1644 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1645 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1647 ; AVX-NEXT: LBB59_1:
1648 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
1649 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1652 ; AVX512-LABEL: floor_maskz_sd_trunc:
1654 ; AVX512-NEXT: kmovw %edi, %k1
1655 ; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1657 %mask = trunc i16 %k to i1
1658 %s = extractelement <2 x double> %x, i64 0
1659 %call = tail call double @llvm.floor.f64(double %s)
1660 %low = select i1 %mask, double %call, double zeroinitializer
1661 %res = insertelement <2 x double> %y, double %low, i64 0
1662 ret <2 x double> %res
; Scalar merge-masked floor where the mask comes from an fcmp lane 0 rather
; than a GPR; split AVX512F/AVX512VL checks since only VL has xmm compares to k.
1665 define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
1666 ; SSE41-LABEL: floor_mask_ss_mask8:
1668 ; SSE41-NEXT: movaps %xmm0, %xmm3
1669 ; SSE41-NEXT: cmpeqps %xmm1, %xmm3
1670 ; SSE41-NEXT: pextrb $0, %xmm3, %eax
1671 ; SSE41-NEXT: testb $1, %al
1672 ; SSE41-NEXT: je LBB60_2
1673 ; SSE41-NEXT: ## %bb.1:
1674 ; SSE41-NEXT: xorps %xmm2, %xmm2
1675 ; SSE41-NEXT: roundss $9, %xmm0, %xmm2
1676 ; SSE41-NEXT: LBB60_2:
1677 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1678 ; SSE41-NEXT: movaps %xmm1, %xmm0
1681 ; AVX-LABEL: floor_mask_ss_mask8:
1683 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3
1684 ; AVX-NEXT: vpextrb $0, %xmm3, %eax
1685 ; AVX-NEXT: testb $1, %al
1686 ; AVX-NEXT: je LBB60_2
1687 ; AVX-NEXT: ## %bb.1:
1688 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
1689 ; AVX-NEXT: LBB60_2:
1690 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1693 ; AVX512F-LABEL: floor_mask_ss_mask8:
1694 ; AVX512F: ## %bb.0:
1695 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1696 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1697 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1698 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
1699 ; AVX512F-NEXT: vmovaps %xmm2, %xmm0
1700 ; AVX512F-NEXT: vzeroupper
1701 ; AVX512F-NEXT: retq
1703 ; AVX512VL-LABEL: floor_mask_ss_mask8:
1704 ; AVX512VL: ## %bb.0:
1705 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
1706 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
1707 ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
1708 ; AVX512VL-NEXT: retq
1709 %mask1 = fcmp oeq <4 x float> %x, %y
1710 %mask = extractelement <4 x i1> %mask1, i64 0
1711 %s = extractelement <4 x float> %x, i64 0
1712 %call = tail call float @llvm.floor.f32(float %s)
1713 %dst = extractelement <4 x float> %w, i64 0
1714 %low = select i1 %mask, float %call, float %dst
1715 %res = insertelement <4 x float> %y, float %low, i64 0
1716 ret <4 x float> %res
; Zero-masked variant of floor_mask_ss_mask8 (fcmp-derived mask, {z} form).
1719 define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
1720 ; SSE41-LABEL: floor_maskz_ss_mask8:
1722 ; SSE41-NEXT: movaps %xmm0, %xmm2
1723 ; SSE41-NEXT: cmpeqps %xmm1, %xmm2
1724 ; SSE41-NEXT: pextrb $0, %xmm2, %eax
1725 ; SSE41-NEXT: testb $1, %al
1726 ; SSE41-NEXT: jne LBB61_1
1727 ; SSE41-NEXT: ## %bb.2:
1728 ; SSE41-NEXT: xorps %xmm0, %xmm0
1729 ; SSE41-NEXT: jmp LBB61_3
1730 ; SSE41-NEXT: LBB61_1:
1731 ; SSE41-NEXT: roundss $9, %xmm0, %xmm0
1732 ; SSE41-NEXT: LBB61_3:
1733 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1734 ; SSE41-NEXT: movaps %xmm1, %xmm0
1737 ; AVX-LABEL: floor_maskz_ss_mask8:
1739 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
1740 ; AVX-NEXT: vpextrb $0, %xmm2, %eax
1741 ; AVX-NEXT: testb $1, %al
1742 ; AVX-NEXT: jne LBB61_1
1743 ; AVX-NEXT: ## %bb.2:
1744 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1745 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1747 ; AVX-NEXT: LBB61_1:
1748 ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
1749 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1752 ; AVX512F-LABEL: floor_maskz_ss_mask8:
1753 ; AVX512F: ## %bb.0:
1754 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1755 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1756 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1757 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1758 ; AVX512F-NEXT: vzeroupper
1759 ; AVX512F-NEXT: retq
1761 ; AVX512VL-LABEL: floor_maskz_ss_mask8:
1762 ; AVX512VL: ## %bb.0:
1763 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
1764 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1765 ; AVX512VL-NEXT: retq
1766 %mask1 = fcmp oeq <4 x float> %x, %y
1767 %mask = extractelement <4 x i1> %mask1, i64 0
1768 %s = extractelement <4 x float> %x, i64 0
1769 %call = tail call float @llvm.floor.f32(float %s)
1770 %low = select i1 %mask, float %call, float zeroinitializer
1771 %res = insertelement <4 x float> %y, float %low, i64 0
1772 ret <4 x float> %res
; Double-precision analogue of floor_mask_ss_mask8 (fcmp lane-0 mask, merge form).
1775 define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
1776 ; SSE41-LABEL: floor_mask_sd_mask8:
1778 ; SSE41-NEXT: movapd %xmm0, %xmm3
1779 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
1780 ; SSE41-NEXT: pextrb $0, %xmm3, %eax
1781 ; SSE41-NEXT: testb $1, %al
1782 ; SSE41-NEXT: je LBB62_2
1783 ; SSE41-NEXT: ## %bb.1:
1784 ; SSE41-NEXT: xorps %xmm2, %xmm2
1785 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
1786 ; SSE41-NEXT: LBB62_2:
1787 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1788 ; SSE41-NEXT: movapd %xmm1, %xmm0
1791 ; AVX-LABEL: floor_mask_sd_mask8:
1793 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm3
1794 ; AVX-NEXT: vpextrb $0, %xmm3, %eax
1795 ; AVX-NEXT: testb $1, %al
1796 ; AVX-NEXT: je LBB62_2
1797 ; AVX-NEXT: ## %bb.1:
1798 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
1799 ; AVX-NEXT: LBB62_2:
1800 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1803 ; AVX512F-LABEL: floor_mask_sd_mask8:
1804 ; AVX512F: ## %bb.0:
1805 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1806 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1807 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
1808 ; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
1809 ; AVX512F-NEXT: vmovapd %xmm2, %xmm0
1810 ; AVX512F-NEXT: vzeroupper
1811 ; AVX512F-NEXT: retq
1813 ; AVX512VL-LABEL: floor_mask_sd_mask8:
1814 ; AVX512VL: ## %bb.0:
1815 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
1816 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
1817 ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0
1818 ; AVX512VL-NEXT: retq
1819 %mask1 = fcmp oeq <2 x double> %x, %y
1820 %mask = extractelement <2 x i1> %mask1, i64 0
1821 %s = extractelement <2 x double> %x, i64 0
1822 %call = tail call double @llvm.floor.f64(double %s)
1823 %dst = extractelement <2 x double> %w, i64 0
1824 %low = select i1 %mask, double %call, double %dst
1825 %res = insertelement <2 x double> %y, double %low, i64 0
1826 ret <2 x double> %res
; Zero-masked double-precision analogue of floor_maskz_ss_mask8.
1829 define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
1830 ; SSE41-LABEL: floor_maskz_sd_mask8:
1832 ; SSE41-NEXT: movapd %xmm0, %xmm2
1833 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm2
1834 ; SSE41-NEXT: pextrb $0, %xmm2, %eax
1835 ; SSE41-NEXT: testb $1, %al
1836 ; SSE41-NEXT: jne LBB63_1
1837 ; SSE41-NEXT: ## %bb.2:
1838 ; SSE41-NEXT: xorpd %xmm0, %xmm0
1839 ; SSE41-NEXT: jmp LBB63_3
1840 ; SSE41-NEXT: LBB63_1:
1841 ; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
1842 ; SSE41-NEXT: LBB63_3:
1843 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1844 ; SSE41-NEXT: movapd %xmm1, %xmm0
1847 ; AVX-LABEL: floor_maskz_sd_mask8:
1849 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
1850 ; AVX-NEXT: vpextrb $0, %xmm2, %eax
1851 ; AVX-NEXT: testb $1, %al
1852 ; AVX-NEXT: jne LBB63_1
1853 ; AVX-NEXT: ## %bb.2:
1854 ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
1855 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1857 ; AVX-NEXT: LBB63_1:
1858 ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
1859 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1862 ; AVX512F-LABEL: floor_maskz_sd_mask8:
1863 ; AVX512F: ## %bb.0:
1864 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1865 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1866 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
1867 ; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1868 ; AVX512F-NEXT: vzeroupper
1869 ; AVX512F-NEXT: retq
1871 ; AVX512VL-LABEL: floor_maskz_sd_mask8:
1872 ; AVX512VL: ## %bb.0:
1873 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
1874 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
1875 ; AVX512VL-NEXT: retq
1876 %mask1 = fcmp oeq <2 x double> %x, %y
1877 %mask = extractelement <2 x i1> %mask1, i64 0
1878 %s = extractelement <2 x double> %x, i64 0
1879 %call = tail call double @llvm.floor.f64(double %s)
1880 %low = select i1 %mask, double %call, double zeroinitializer
1881 %res = insertelement <2 x double> %y, double %low, i64 0
1882 ret <2 x double> %res
; Unmasked scalar ceil: expects roundss/vroundss with imm 2 (round toward +inf).
1885 define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
1886 ; SSE41-LABEL: ceil_ss:
1888 ; SSE41-NEXT: roundss $2, %xmm0, %xmm1
1889 ; SSE41-NEXT: movaps %xmm1, %xmm0
1892 ; AVX-LABEL: ceil_ss:
1894 ; AVX-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0
1897 ; AVX512-LABEL: ceil_ss:
1899 ; AVX512-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0
1901 %s = extractelement <4 x float> %x, i32 0
1902 %call = call float @llvm.ceil.f32(float %s)
1903 %res = insertelement <4 x float> %y, float %call, i32 0
1904 ret <4 x float> %res
1906 declare float @llvm.ceil.f32(float %s)
; Unmasked scalar ceil, double precision (roundsd/vroundsd imm 2).
1908 define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
1909 ; SSE41-LABEL: ceil_sd:
1911 ; SSE41-NEXT: roundsd $2, %xmm0, %xmm1
1912 ; SSE41-NEXT: movapd %xmm1, %xmm0
1915 ; AVX-LABEL: ceil_sd:
1917 ; AVX-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0
1920 ; AVX512-LABEL: ceil_sd:
1922 ; AVX512-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0
1924 %s = extractelement <2 x double> %x, i32 0
1925 %call = call double @llvm.ceil.f64(double %s)
1926 %res = insertelement <2 x double> %y, double %call, i32 0
1927 ret <2 x double> %res
1929 declare double @llvm.ceil.f64(double %s)
; Merge-masked 128-bit ceil (imm 10 = round toward +inf + suppress precision
; exceptions). Only AVX512VL has xmm masking; AVX512F widens to zmm.
1931 define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
1932 ; SSE41-LABEL: ceil_mask_128_ps:
1934 ; SSE41-NEXT: roundps $10, %xmm0, %xmm2
1935 ; SSE41-NEXT: cmpeqps %xmm1, %xmm0
1936 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
1937 ; SSE41-NEXT: movaps %xmm1, %xmm0
1940 ; AVX-LABEL: ceil_mask_128_ps:
1942 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
1943 ; AVX-NEXT: vroundps $10, %xmm0, %xmm0
1944 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1947 ; AVX512F-LABEL: ceil_mask_128_ps:
1948 ; AVX512F: ## %bb.0:
1949 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1950 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1951 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1952 ; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
1953 ; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
1954 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1955 ; AVX512F-NEXT: vzeroupper
1956 ; AVX512F-NEXT: retq
1958 ; AVX512VL-LABEL: ceil_mask_128_ps:
1959 ; AVX512VL: ## %bb.0:
1960 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
1961 ; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm1 {%k1}
1962 ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
1963 ; AVX512VL-NEXT: retq
1964 %k = fcmp oeq <4 x float> %x, %y
1965 %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
1966 %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
1967 ret <4 x float> %res
; Zero-masked 128-bit ceil; AND-emulation pre-AVX512, {z}-masked vrndscaleps on VL.
1970 define <4 x float> @ceil_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
1971 ; SSE41-LABEL: ceil_maskz_128_ps:
1973 ; SSE41-NEXT: cmpeqps %xmm0, %xmm1
1974 ; SSE41-NEXT: roundps $10, %xmm0, %xmm0
1975 ; SSE41-NEXT: andps %xmm1, %xmm0
1978 ; AVX-LABEL: ceil_maskz_128_ps:
1980 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1
1981 ; AVX-NEXT: vroundps $10, %xmm0, %xmm0
1982 ; AVX-NEXT: vandps %xmm0, %xmm1, %xmm0
1985 ; AVX512F-LABEL: ceil_maskz_128_ps:
1986 ; AVX512F: ## %bb.0:
1987 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1988 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1989 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
1990 ; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
1991 ; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
1992 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1993 ; AVX512F-NEXT: vzeroupper
1994 ; AVX512F-NEXT: retq
1996 ; AVX512VL-LABEL: ceil_maskz_128_ps:
1997 ; AVX512VL: ## %bb.0:
1998 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
1999 ; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
2000 ; AVX512VL-NEXT: retq
2001 %k = fcmp oeq <4 x float> %x, %y
2002 %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
2003 %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
2004 ret <4 x float> %res
; Merge-masked 128-bit double ceil; pd analogue of ceil_mask_128_ps.
2007 define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
2008 ; SSE41-LABEL: ceil_mask_128_pd:
2010 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm2
2011 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm0
2012 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
2013 ; SSE41-NEXT: movapd %xmm1, %xmm0
2016 ; AVX-LABEL: ceil_mask_128_pd:
2018 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
2019 ; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
2020 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
2023 ; AVX512F-LABEL: ceil_mask_128_pd:
2024 ; AVX512F: ## %bb.0:
2025 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2026 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2027 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2028 ; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
2029 ; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
2030 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2031 ; AVX512F-NEXT: vzeroupper
2032 ; AVX512F-NEXT: retq
2034 ; AVX512VL-LABEL: ceil_mask_128_pd:
2035 ; AVX512VL: ## %bb.0:
2036 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
2037 ; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm1 {%k1}
2038 ; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
2039 ; AVX512VL-NEXT: retq
2040 %k = fcmp oeq <2 x double> %x, %y
2041 %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
2042 %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
2043 ret <2 x double> %res
; Zero-masked 128-bit double ceil; pd analogue of ceil_maskz_128_ps.
2046 define <2 x double> @ceil_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
2047 ; SSE41-LABEL: ceil_maskz_128_pd:
2049 ; SSE41-NEXT: cmpeqpd %xmm0, %xmm1
2050 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
2051 ; SSE41-NEXT: andpd %xmm1, %xmm0
2054 ; AVX-LABEL: ceil_maskz_128_pd:
2056 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1
2057 ; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
2058 ; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0
2061 ; AVX512F-LABEL: ceil_maskz_128_pd:
2062 ; AVX512F: ## %bb.0:
2063 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2064 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2065 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2066 ; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
2067 ; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
2068 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2069 ; AVX512F-NEXT: vzeroupper
2070 ; AVX512F-NEXT: retq
2072 ; AVX512VL-LABEL: ceil_maskz_128_pd:
2073 ; AVX512VL: ## %bb.0:
2074 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
2075 ; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
2076 ; AVX512VL-NEXT: retq
2077 %k = fcmp oeq <2 x double> %x, %y
2078 %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
2079 %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
2080 ret <2 x double> %res
; Merge-masked 256-bit ceil: SSE41 splits into two xmm halves, AVX blends in
; ymm, AVX512VL uses merge-masked vrndscaleps on ymm directly.
2083 define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
2084 ; SSE41-LABEL: ceil_mask_256_ps:
2086 ; SSE41-NEXT: roundps $10, %xmm1, %xmm4
2087 ; SSE41-NEXT: cmpeqps %xmm3, %xmm1
2088 ; SSE41-NEXT: roundps $10, %xmm0, %xmm5
2089 ; SSE41-NEXT: cmpeqps %xmm2, %xmm0
2090 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
2091 ; SSE41-NEXT: movaps %xmm1, %xmm0
2092 ; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
2093 ; SSE41-NEXT: movaps %xmm2, %xmm0
2094 ; SSE41-NEXT: movaps %xmm3, %xmm1
2097 ; AVX-LABEL: ceil_mask_256_ps:
2099 ; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2
2100 ; AVX-NEXT: vroundps $10, %ymm0, %ymm0
2101 ; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
2104 ; AVX512F-LABEL: ceil_mask_256_ps:
2105 ; AVX512F: ## %bb.0:
2106 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2107 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
2108 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2109 ; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
2110 ; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
2111 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2112 ; AVX512F-NEXT: retq
2114 ; AVX512VL-LABEL: ceil_mask_256_ps:
2115 ; AVX512VL: ## %bb.0:
2116 ; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
2117 ; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm1 {%k1}
2118 ; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
2119 ; AVX512VL-NEXT: retq
2120 %k = fcmp oeq <8 x float> %x, %y
2121 %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
2122 %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
2123 ret <8 x float> %res
; Zero-masked 256-bit ceil; AND-emulation pre-AVX512, {z}-masked vrndscaleps on VL.
2126 define <8 x float> @ceil_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
2127 ; SSE41-LABEL: ceil_maskz_256_ps:
2129 ; SSE41-NEXT: cmpeqps %xmm1, %xmm3
2130 ; SSE41-NEXT: cmpeqps %xmm0, %xmm2
2131 ; SSE41-NEXT: roundps $10, %xmm1, %xmm1
2132 ; SSE41-NEXT: andps %xmm3, %xmm1
2133 ; SSE41-NEXT: roundps $10, %xmm0, %xmm0
2134 ; SSE41-NEXT: andps %xmm2, %xmm0
2137 ; AVX-LABEL: ceil_maskz_256_ps:
2139 ; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
2140 ; AVX-NEXT: vroundps $10, %ymm0, %ymm0
2141 ; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0
2144 ; AVX512F-LABEL: ceil_maskz_256_ps:
2145 ; AVX512F: ## %bb.0:
2146 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2147 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
2148 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2149 ; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
2150 ; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
2151 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2152 ; AVX512F-NEXT: retq
2154 ; AVX512VL-LABEL: ceil_maskz_256_ps:
2155 ; AVX512VL: ## %bb.0:
2156 ; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
2157 ; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
2158 ; AVX512VL-NEXT: retq
2159 %k = fcmp oeq <8 x float> %x, %y
2160 %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
2161 %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
2162 ret <8 x float> %res
; Merge-masked 256-bit double ceil; pd analogue of ceil_mask_256_ps.
2165 define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
2166 ; SSE41-LABEL: ceil_mask_256_pd:
2168 ; SSE41-NEXT: roundpd $10, %xmm1, %xmm4
2169 ; SSE41-NEXT: cmpeqpd %xmm3, %xmm1
2170 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm5
2171 ; SSE41-NEXT: cmpeqpd %xmm2, %xmm0
2172 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
2173 ; SSE41-NEXT: movapd %xmm1, %xmm0
2174 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
2175 ; SSE41-NEXT: movapd %xmm2, %xmm0
2176 ; SSE41-NEXT: movapd %xmm3, %xmm1
2179 ; AVX-LABEL: ceil_mask_256_pd:
2181 ; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
2182 ; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
2183 ; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
2186 ; AVX512F-LABEL: ceil_mask_256_pd:
2187 ; AVX512F: ## %bb.0:
2188 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2189 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
2190 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2191 ; AVX512F-NEXT: vroundpd $10, %ymm0, %ymm0
2192 ; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
2193 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2194 ; AVX512F-NEXT: retq
2196 ; AVX512VL-LABEL: ceil_mask_256_pd:
2197 ; AVX512VL: ## %bb.0:
2198 ; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
2199 ; AVX512VL-NEXT: vrndscalepd $10, %ymm0, %ymm1 {%k1}
2200 ; AVX512VL-NEXT: vmovapd %ymm1, %ymm0
2201 ; AVX512VL-NEXT: retq
2202 %k = fcmp oeq <4 x double> %x, %y
2203 %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
2204 %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
2205 ret <4 x double> %res
; ceil_maskz_256_pd: zero-masked ceil on <4 x double> (select against
; zeroinitializer). AVX512VL collapses to one vrndscalepd $10 {z};
; SSE41/AVX expand to cmp + round + and.
2208 define <4 x double> @ceil_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
2209 ; SSE41-LABEL: ceil_maskz_256_pd:
2211 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
2212 ; SSE41-NEXT: cmpeqpd %xmm0, %xmm2
2213 ; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
2214 ; SSE41-NEXT: andpd %xmm3, %xmm1
2215 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
2216 ; SSE41-NEXT: andpd %xmm2, %xmm0
2219 ; AVX-LABEL: ceil_maskz_256_pd:
2221 ; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1
2222 ; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
2223 ; AVX-NEXT: vandpd %ymm0, %ymm1, %ymm0
2226 ; AVX512F-LABEL: ceil_maskz_256_pd:
2227 ; AVX512F: ## %bb.0:
2228 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2229 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
2230 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2231 ; AVX512F-NEXT: vroundpd $10, %ymm0, %ymm0
2232 ; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
2233 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2234 ; AVX512F-NEXT: retq
2236 ; AVX512VL-LABEL: ceil_maskz_256_pd:
2237 ; AVX512VL: ## %bb.0:
2238 ; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
2239 ; AVX512VL-NEXT: vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
2240 ; AVX512VL-NEXT: retq
2241 %k = fcmp oeq <4 x double> %x, %y
2242 %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
2243 %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
2244 ret <4 x double> %res
; ceil_mask_512_ps: merge-masked ceil on <16 x float>. Both AVX512F and
; AVX512VL natively handle 512-bit zmm, so a shared AVX512 prefix checks one
; masked vrndscaleps; SSE41 needs four 128-bit round/blendv groups and AVX two
; 256-bit ones.
2247 define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
2248 ; SSE41-LABEL: ceil_mask_512_ps:
2250 ; SSE41-NEXT: roundps $10, %xmm3, %xmm8
2251 ; SSE41-NEXT: cmpeqps %xmm7, %xmm3
2252 ; SSE41-NEXT: roundps $10, %xmm2, %xmm9
2253 ; SSE41-NEXT: cmpeqps %xmm6, %xmm2
2254 ; SSE41-NEXT: roundps $10, %xmm1, %xmm10
2255 ; SSE41-NEXT: cmpeqps %xmm5, %xmm1
2256 ; SSE41-NEXT: roundps $10, %xmm0, %xmm11
2257 ; SSE41-NEXT: cmpeqps %xmm4, %xmm0
2258 ; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4
2259 ; SSE41-NEXT: movaps %xmm1, %xmm0
2260 ; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
2261 ; SSE41-NEXT: movaps %xmm2, %xmm0
2262 ; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6
2263 ; SSE41-NEXT: movaps %xmm3, %xmm0
2264 ; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
2265 ; SSE41-NEXT: movaps %xmm4, %xmm0
2266 ; SSE41-NEXT: movaps %xmm5, %xmm1
2267 ; SSE41-NEXT: movaps %xmm6, %xmm2
2268 ; SSE41-NEXT: movaps %xmm7, %xmm3
2271 ; AVX-LABEL: ceil_mask_512_ps:
2273 ; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4
2274 ; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5
2275 ; AVX-NEXT: vroundps $10, %ymm1, %ymm1
2276 ; AVX-NEXT: vroundps $10, %ymm0, %ymm0
2277 ; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0
2278 ; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1
2281 ; AVX512-LABEL: ceil_mask_512_ps:
2283 ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2284 ; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm1 {%k1}
2285 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
2287 %k = fcmp oeq <16 x float> %x, %y
2288 %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
2289 %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
2290 ret <16 x float> %res
; ceil_maskz_512_ps: zero-masked ceil on <16 x float>. AVX512 targets need
; only vcmpeqps to %k1 plus one vrndscaleps $10 {z}; older targets expand
; the select into cmp + round + and per vector chunk.
2293 define <16 x float> @ceil_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
2294 ; SSE41-LABEL: ceil_maskz_512_ps:
2296 ; SSE41-NEXT: cmpeqps %xmm3, %xmm7
2297 ; SSE41-NEXT: cmpeqps %xmm2, %xmm6
2298 ; SSE41-NEXT: cmpeqps %xmm1, %xmm5
2299 ; SSE41-NEXT: cmpeqps %xmm0, %xmm4
2300 ; SSE41-NEXT: roundps $10, %xmm3, %xmm3
2301 ; SSE41-NEXT: andps %xmm7, %xmm3
2302 ; SSE41-NEXT: roundps $10, %xmm2, %xmm2
2303 ; SSE41-NEXT: andps %xmm6, %xmm2
2304 ; SSE41-NEXT: roundps $10, %xmm1, %xmm1
2305 ; SSE41-NEXT: andps %xmm5, %xmm1
2306 ; SSE41-NEXT: roundps $10, %xmm0, %xmm0
2307 ; SSE41-NEXT: andps %xmm4, %xmm0
2310 ; AVX-LABEL: ceil_maskz_512_ps:
2312 ; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm3
2313 ; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm2
2314 ; AVX-NEXT: vroundps $10, %ymm1, %ymm1
2315 ; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
2316 ; AVX-NEXT: vroundps $10, %ymm0, %ymm0
2317 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm0
2320 ; AVX512-LABEL: ceil_maskz_512_ps:
2322 ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2323 ; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
2325 %k = fcmp oeq <16 x float> %x, %y
2326 %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
2327 %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
2328 ret <16 x float> %res
; ceil_mask_512_pd: merge-masked ceil on <8 x double> — the f64 counterpart of
; ceil_mask_512_ps. AVX512 folds to a single masked vrndscalepd $10 merging
; into %y; SSE41/AVX fall back to round + cmp + blendv per chunk.
2331 define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
2332 ; SSE41-LABEL: ceil_mask_512_pd:
2334 ; SSE41-NEXT: roundpd $10, %xmm3, %xmm8
2335 ; SSE41-NEXT: cmpeqpd %xmm7, %xmm3
2336 ; SSE41-NEXT: roundpd $10, %xmm2, %xmm9
2337 ; SSE41-NEXT: cmpeqpd %xmm6, %xmm2
2338 ; SSE41-NEXT: roundpd $10, %xmm1, %xmm10
2339 ; SSE41-NEXT: cmpeqpd %xmm5, %xmm1
2340 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm11
2341 ; SSE41-NEXT: cmpeqpd %xmm4, %xmm0
2342 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
2343 ; SSE41-NEXT: movapd %xmm1, %xmm0
2344 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
2345 ; SSE41-NEXT: movapd %xmm2, %xmm0
2346 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
2347 ; SSE41-NEXT: movapd %xmm3, %xmm0
2348 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
2349 ; SSE41-NEXT: movapd %xmm4, %xmm0
2350 ; SSE41-NEXT: movapd %xmm5, %xmm1
2351 ; SSE41-NEXT: movapd %xmm6, %xmm2
2352 ; SSE41-NEXT: movapd %xmm7, %xmm3
2355 ; AVX-LABEL: ceil_mask_512_pd:
2357 ; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4
2358 ; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm5
2359 ; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
2360 ; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
2361 ; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
2362 ; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
2365 ; AVX512-LABEL: ceil_mask_512_pd:
2367 ; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2368 ; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm1 {%k1}
2369 ; AVX512-NEXT: vmovapd %zmm1, %zmm0
2371 %k = fcmp oeq <8 x double> %x, %y
2372 %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
2373 %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
2374 ret <8 x double> %res
; ceil_maskz_512_pd: zero-masked ceil on <8 x double>. AVX512 reduces the
; whole compare/ceil/select pattern to vcmpeqpd + vrndscalepd $10 {z};
; pre-AVX512 targets expand to cmp + round + and.
2377 define <8 x double> @ceil_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
2378 ; SSE41-LABEL: ceil_maskz_512_pd:
2380 ; SSE41-NEXT: cmpeqpd %xmm3, %xmm7
2381 ; SSE41-NEXT: cmpeqpd %xmm2, %xmm6
2382 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm5
2383 ; SSE41-NEXT: cmpeqpd %xmm0, %xmm4
2384 ; SSE41-NEXT: roundpd $10, %xmm3, %xmm3
2385 ; SSE41-NEXT: andpd %xmm7, %xmm3
2386 ; SSE41-NEXT: roundpd $10, %xmm2, %xmm2
2387 ; SSE41-NEXT: andpd %xmm6, %xmm2
2388 ; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
2389 ; SSE41-NEXT: andpd %xmm5, %xmm1
2390 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
2391 ; SSE41-NEXT: andpd %xmm4, %xmm0
2394 ; AVX-LABEL: ceil_maskz_512_pd:
2396 ; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm3
2397 ; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm2
2398 ; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
2399 ; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
2400 ; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
2401 ; AVX-NEXT: vandpd %ymm0, %ymm2, %ymm0
2404 ; AVX512-LABEL: ceil_maskz_512_pd:
2406 ; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2407 ; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
2409 %k = fcmp oeq <8 x double> %x, %y
2410 %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
2411 %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
2412 ret <8 x double> %res
; ceil_mask_ss: scalar ceil of lane 0 under bit 0 of an i8 mask, merging with
; passthrough %w's low lane and taking upper lanes from %y. AVX512 recognizes
; the insert/extract + select pattern as a masked vrndscaless (scalar imm $2 =
; round toward +inf); SSE41/AVX must test the mask bit and branch.
2415 define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
2416 ; SSE41-LABEL: ceil_mask_ss:
2418 ; SSE41-NEXT: testb $1, %dil
2419 ; SSE41-NEXT: je LBB78_2
2420 ; SSE41-NEXT: ## %bb.1:
2421 ; SSE41-NEXT: xorps %xmm2, %xmm2
2422 ; SSE41-NEXT: roundss $10, %xmm0, %xmm2
2423 ; SSE41-NEXT: LBB78_2:
2424 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2425 ; SSE41-NEXT: movaps %xmm1, %xmm0
2428 ; AVX-LABEL: ceil_mask_ss:
2430 ; AVX-NEXT: testb $1, %dil
2431 ; AVX-NEXT: je LBB78_2
2432 ; AVX-NEXT: ## %bb.1:
2433 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
2434 ; AVX-NEXT: LBB78_2:
2435 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
2438 ; AVX512-LABEL: ceil_mask_ss:
2440 ; AVX512-NEXT: kmovw %edi, %k1
2441 ; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
2442 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
2444 %mask = and i8 %k, 1
2445 %nmask = icmp eq i8 %mask, 0
2446 %s = extractelement <4 x float> %x, i64 0
2447 %call = tail call float @llvm.ceil.f32(float %s)
2448 %dst = extractelement <4 x float> %w, i64 0
2449 %low = select i1 %nmask, float %dst, float %call
2450 %res = insertelement <4 x float> %y, float %low, i64 0
2451 ret <4 x float> %res
; ceil_maskz_ss: zero-masked variant of ceil_mask_ss — when the mask bit is
; clear, lane 0 becomes 0.0 instead of a passthrough. AVX512 uses
; vrndscaless $2 with {z}; SSE41/AVX pre-zero a register then branch.
2454 define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
2455 ; SSE41-LABEL: ceil_maskz_ss:
2457 ; SSE41-NEXT: testb $1, %dil
2458 ; SSE41-NEXT: xorps %xmm2, %xmm2
2459 ; SSE41-NEXT: je LBB79_2
2460 ; SSE41-NEXT: ## %bb.1:
2461 ; SSE41-NEXT: xorps %xmm2, %xmm2
2462 ; SSE41-NEXT: roundss $10, %xmm0, %xmm2
2463 ; SSE41-NEXT: LBB79_2:
2464 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2465 ; SSE41-NEXT: movaps %xmm1, %xmm0
2468 ; AVX-LABEL: ceil_maskz_ss:
2470 ; AVX-NEXT: testb $1, %dil
2471 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
2472 ; AVX-NEXT: je LBB79_2
2473 ; AVX-NEXT: ## %bb.1:
2474 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
2475 ; AVX-NEXT: LBB79_2:
2476 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
2479 ; AVX512-LABEL: ceil_maskz_ss:
2481 ; AVX512-NEXT: kmovw %edi, %k1
2482 ; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2484 %mask = and i8 %k, 1
2485 %nmask = icmp eq i8 %mask, 0
2486 %s = extractelement <4 x float> %x, i64 0
2487 %call = tail call float @llvm.ceil.f32(float %s)
2488 %low = select i1 %nmask, float zeroinitializer, float %call
2489 %res = insertelement <4 x float> %y, float %low, i64 0
2490 ret <4 x float> %res
; ceil_mask_sd: f64 counterpart of ceil_mask_ss — merge-masked scalar ceil of
; lane 0 with passthrough %w, upper lane from %y. AVX512 folds to a masked
; vrndscalesd $2; SSE41/AVX test the mask bit and branch around roundsd.
2493 define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
2494 ; SSE41-LABEL: ceil_mask_sd:
2496 ; SSE41-NEXT: testb $1, %dil
2497 ; SSE41-NEXT: je LBB80_2
2498 ; SSE41-NEXT: ## %bb.1:
2499 ; SSE41-NEXT: xorps %xmm2, %xmm2
2500 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
2501 ; SSE41-NEXT: LBB80_2:
2502 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2503 ; SSE41-NEXT: movapd %xmm1, %xmm0
2506 ; AVX-LABEL: ceil_mask_sd:
2508 ; AVX-NEXT: testb $1, %dil
2509 ; AVX-NEXT: je LBB80_2
2510 ; AVX-NEXT: ## %bb.1:
2511 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
2512 ; AVX-NEXT: LBB80_2:
2513 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
2516 ; AVX512-LABEL: ceil_mask_sd:
2518 ; AVX512-NEXT: kmovw %edi, %k1
2519 ; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
2520 ; AVX512-NEXT: vmovapd %xmm2, %xmm0
2522 %mask = and i8 %k, 1
2523 %nmask = icmp eq i8 %mask, 0
2524 %s = extractelement <2 x double> %x, i64 0
2525 %call = tail call double @llvm.ceil.f64(double %s)
2526 %dst = extractelement <2 x double> %w, i64 0
2527 %low = select i1 %nmask, double %dst, double %call
2528 %res = insertelement <2 x double> %y, double %low, i64 0
2529 ret <2 x double> %res
; ceil_maskz_sd: zero-masked scalar f64 ceil — lane 0 is 0.0 when the mask
; bit is clear. AVX512 folds to vrndscalesd $2 {z}; SSE41/AVX pre-zero then
; branch.
2532 define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
2533 ; SSE41-LABEL: ceil_maskz_sd:
2535 ; SSE41-NEXT: testb $1, %dil
2536 ; SSE41-NEXT: xorpd %xmm2, %xmm2
2537 ; SSE41-NEXT: je LBB81_2
2538 ; SSE41-NEXT: ## %bb.1:
2539 ; SSE41-NEXT: xorps %xmm2, %xmm2
2540 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
2541 ; SSE41-NEXT: LBB81_2:
2542 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2543 ; SSE41-NEXT: movapd %xmm1, %xmm0
2546 ; AVX-LABEL: ceil_maskz_sd:
2548 ; AVX-NEXT: testb $1, %dil
2549 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
2550 ; AVX-NEXT: je LBB81_2
2551 ; AVX-NEXT: ## %bb.1:
2552 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
2553 ; AVX-NEXT: LBB81_2:
2554 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
2557 ; AVX512-LABEL: ceil_maskz_sd:
2559 ; AVX512-NEXT: kmovw %edi, %k1
2560 ; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2562 %mask = and i8 %k, 1
2563 %nmask = icmp eq i8 %mask, 0
2564 %s = extractelement <2 x double> %x, i64 0
2565 %call = tail call double @llvm.ceil.f64(double %s)
2566 %low = select i1 %nmask, double zeroinitializer, double %call
2567 %res = insertelement <2 x double> %y, double %low, i64 0
2568 ret <2 x double> %res
; ceil_mask_ss_trunc: like ceil_mask_ss but the predicate comes from
; `trunc i16 %k to i1` rather than `and i8 %k, 1` — verifies the masked
; vrndscaless pattern still forms from a truncated mask.
2571 define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
2572 ; SSE41-LABEL: ceil_mask_ss_trunc:
2574 ; SSE41-NEXT: testb $1, %dil
2575 ; SSE41-NEXT: je LBB82_2
2576 ; SSE41-NEXT: ## %bb.1:
2577 ; SSE41-NEXT: xorps %xmm2, %xmm2
2578 ; SSE41-NEXT: roundss $10, %xmm0, %xmm2
2579 ; SSE41-NEXT: LBB82_2:
2580 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2581 ; SSE41-NEXT: movaps %xmm1, %xmm0
2584 ; AVX-LABEL: ceil_mask_ss_trunc:
2586 ; AVX-NEXT: testb $1, %dil
2587 ; AVX-NEXT: je LBB82_2
2588 ; AVX-NEXT: ## %bb.1:
2589 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
2590 ; AVX-NEXT: LBB82_2:
2591 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
2594 ; AVX512-LABEL: ceil_mask_ss_trunc:
2596 ; AVX512-NEXT: kmovw %edi, %k1
2597 ; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
2598 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
2600 %mask = trunc i16 %k to i1
2601 %s = extractelement <4 x float> %x, i64 0
2602 %call = tail call float @llvm.ceil.f32(float %s)
2603 %dst = extractelement <4 x float> %w, i64 0
2604 %low = select i1 %mask, float %call, float %dst
2605 %res = insertelement <4 x float> %y, float %low, i64 0
2606 ret <4 x float> %res
; ceil_maskz_ss_trunc: zero-masked scalar ceil with the predicate from a
; trunc-to-i1 of an i16 mask. AVX512 folds to vrndscaless $2 {z}; SSE41/AVX
; branch to either zero or round the low lane before blending.
2609 define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
2610 ; SSE41-LABEL: ceil_maskz_ss_trunc:
2612 ; SSE41-NEXT: testb $1, %dil
2613 ; SSE41-NEXT: jne LBB83_1
2614 ; SSE41-NEXT: ## %bb.2:
2615 ; SSE41-NEXT: xorps %xmm0, %xmm0
2616 ; SSE41-NEXT: jmp LBB83_3
2617 ; SSE41-NEXT: LBB83_1:
2618 ; SSE41-NEXT: roundss $10, %xmm0, %xmm0
2619 ; SSE41-NEXT: LBB83_3:
2620 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2621 ; SSE41-NEXT: movaps %xmm1, %xmm0
2624 ; AVX-LABEL: ceil_maskz_ss_trunc:
2626 ; AVX-NEXT: testb $1, %dil
2627 ; AVX-NEXT: jne LBB83_1
2628 ; AVX-NEXT: ## %bb.2:
2629 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
2630 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2632 ; AVX-NEXT: LBB83_1:
2633 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
2634 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2637 ; AVX512-LABEL: ceil_maskz_ss_trunc:
2639 ; AVX512-NEXT: kmovw %edi, %k1
2640 ; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2642 %mask = trunc i16 %k to i1
2643 %s = extractelement <4 x float> %x, i64 0
2644 %call = tail call float @llvm.ceil.f32(float %s)
2645 %low = select i1 %mask, float %call, float zeroinitializer
2646 %res = insertelement <4 x float> %y, float %low, i64 0
2647 ret <4 x float> %res
; ceil_mask_sd_trunc: f64 version of ceil_mask_ss_trunc — merge-masked scalar
; ceil whose predicate is a trunc-to-i1 of an i16 mask.
2650 define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
2651 ; SSE41-LABEL: ceil_mask_sd_trunc:
2653 ; SSE41-NEXT: testb $1, %dil
2654 ; SSE41-NEXT: je LBB84_2
2655 ; SSE41-NEXT: ## %bb.1:
2656 ; SSE41-NEXT: xorps %xmm2, %xmm2
2657 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
2658 ; SSE41-NEXT: LBB84_2:
2659 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2660 ; SSE41-NEXT: movapd %xmm1, %xmm0
2663 ; AVX-LABEL: ceil_mask_sd_trunc:
2665 ; AVX-NEXT: testb $1, %dil
2666 ; AVX-NEXT: je LBB84_2
2667 ; AVX-NEXT: ## %bb.1:
2668 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
2669 ; AVX-NEXT: LBB84_2:
2670 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
2673 ; AVX512-LABEL: ceil_mask_sd_trunc:
2675 ; AVX512-NEXT: kmovw %edi, %k1
2676 ; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
2677 ; AVX512-NEXT: vmovapd %xmm2, %xmm0
2679 %mask = trunc i16 %k to i1
2680 %s = extractelement <2 x double> %x, i64 0
2681 %call = tail call double @llvm.ceil.f64(double %s)
2682 %dst = extractelement <2 x double> %w, i64 0
2683 %low = select i1 %mask, double %call, double %dst
2684 %res = insertelement <2 x double> %y, double %low, i64 0
2685 ret <2 x double> %res
; ceil_maskz_sd_trunc: zero-masked f64 version of ceil_maskz_ss_trunc.
; AVX512 folds to vrndscalesd $2 {z}; SSE41/AVX branch between zeroing and
; rounding the low lane.
2688 define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
2689 ; SSE41-LABEL: ceil_maskz_sd_trunc:
2691 ; SSE41-NEXT: testb $1, %dil
2692 ; SSE41-NEXT: jne LBB85_1
2693 ; SSE41-NEXT: ## %bb.2:
2694 ; SSE41-NEXT: xorpd %xmm0, %xmm0
2695 ; SSE41-NEXT: jmp LBB85_3
2696 ; SSE41-NEXT: LBB85_1:
2697 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
2698 ; SSE41-NEXT: LBB85_3:
2699 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2700 ; SSE41-NEXT: movapd %xmm1, %xmm0
2703 ; AVX-LABEL: ceil_maskz_sd_trunc:
2705 ; AVX-NEXT: testb $1, %dil
2706 ; AVX-NEXT: jne LBB85_1
2707 ; AVX-NEXT: ## %bb.2:
2708 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
2709 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2711 ; AVX-NEXT: LBB85_1:
2712 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
2713 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2716 ; AVX512-LABEL: ceil_maskz_sd_trunc:
2718 ; AVX512-NEXT: kmovw %edi, %k1
2719 ; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2721 %mask = trunc i16 %k to i1
2722 %s = extractelement <2 x double> %x, i64 0
2723 %call = tail call double @llvm.ceil.f64(double %s)
2724 %low = select i1 %mask, double %call, double zeroinitializer
2725 %res = insertelement <2 x double> %y, double %low, i64 0
2726 ret <2 x double> %res
; ceil_mask_ss_mask8: merge-masked scalar ceil where the predicate is lane 0
; of a vector fcmp (no GPR mask argument). AVX512VL compares straight into
; %k1; AVX512F widens xmm to zmm for the compare (hence the kill comments and
; vzeroupper); SSE41/AVX extract the compare bit with pextrb and branch.
2729 define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
2730 ; SSE41-LABEL: ceil_mask_ss_mask8:
2732 ; SSE41-NEXT: movaps %xmm0, %xmm3
2733 ; SSE41-NEXT: cmpeqps %xmm1, %xmm3
2734 ; SSE41-NEXT: pextrb $0, %xmm3, %eax
2735 ; SSE41-NEXT: testb $1, %al
2736 ; SSE41-NEXT: je LBB86_2
2737 ; SSE41-NEXT: ## %bb.1:
2738 ; SSE41-NEXT: xorps %xmm2, %xmm2
2739 ; SSE41-NEXT: roundss $10, %xmm0, %xmm2
2740 ; SSE41-NEXT: LBB86_2:
2741 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2742 ; SSE41-NEXT: movaps %xmm1, %xmm0
2745 ; AVX-LABEL: ceil_mask_ss_mask8:
2747 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3
2748 ; AVX-NEXT: vpextrb $0, %xmm3, %eax
2749 ; AVX-NEXT: testb $1, %al
2750 ; AVX-NEXT: je LBB86_2
2751 ; AVX-NEXT: ## %bb.1:
2752 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
2753 ; AVX-NEXT: LBB86_2:
2754 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
2757 ; AVX512F-LABEL: ceil_mask_ss_mask8:
2758 ; AVX512F: ## %bb.0:
2759 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2760 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2761 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2762 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
2763 ; AVX512F-NEXT: vmovaps %xmm2, %xmm0
2764 ; AVX512F-NEXT: vzeroupper
2765 ; AVX512F-NEXT: retq
2767 ; AVX512VL-LABEL: ceil_mask_ss_mask8:
2768 ; AVX512VL: ## %bb.0:
2769 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
2770 ; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1}
2771 ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
2772 ; AVX512VL-NEXT: retq
2773 %mask1 = fcmp oeq <4 x float> %x, %y
2774 %mask = extractelement <4 x i1> %mask1, i64 0
2775 %s = extractelement <4 x float> %x, i64 0
2776 %call = tail call float @llvm.ceil.f32(float %s)
2777 %dst = extractelement <4 x float> %w, i64 0
2778 %low = select i1 %mask, float %call, float %dst
2779 %res = insertelement <4 x float> %y, float %low, i64 0
2780 ret <4 x float> %res
; ceil_maskz_ss_mask8: zero-masked variant of ceil_mask_ss_mask8 — lane 0 is
; zeroed when lane 0 of the fcmp is false. AVX512 targets fold to
; vrndscaless $2 {z}; SSE41/AVX extract the compare bit and branch.
2783 define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
2784 ; SSE41-LABEL: ceil_maskz_ss_mask8:
2786 ; SSE41-NEXT: movaps %xmm0, %xmm2
2787 ; SSE41-NEXT: cmpeqps %xmm1, %xmm2
2788 ; SSE41-NEXT: pextrb $0, %xmm2, %eax
2789 ; SSE41-NEXT: testb $1, %al
2790 ; SSE41-NEXT: jne LBB87_1
2791 ; SSE41-NEXT: ## %bb.2:
2792 ; SSE41-NEXT: xorps %xmm0, %xmm0
2793 ; SSE41-NEXT: jmp LBB87_3
2794 ; SSE41-NEXT: LBB87_1:
2795 ; SSE41-NEXT: roundss $10, %xmm0, %xmm0
2796 ; SSE41-NEXT: LBB87_3:
2797 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2798 ; SSE41-NEXT: movaps %xmm1, %xmm0
2801 ; AVX-LABEL: ceil_maskz_ss_mask8:
2803 ; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
2804 ; AVX-NEXT: vpextrb $0, %xmm2, %eax
2805 ; AVX-NEXT: testb $1, %al
2806 ; AVX-NEXT: jne LBB87_1
2807 ; AVX-NEXT: ## %bb.2:
2808 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
2809 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2811 ; AVX-NEXT: LBB87_1:
2812 ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
2813 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2816 ; AVX512F-LABEL: ceil_maskz_ss_mask8:
2817 ; AVX512F: ## %bb.0:
2818 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2819 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2820 ; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
2821 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2822 ; AVX512F-NEXT: vzeroupper
2823 ; AVX512F-NEXT: retq
2825 ; AVX512VL-LABEL: ceil_maskz_ss_mask8:
2826 ; AVX512VL: ## %bb.0:
2827 ; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
2828 ; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2829 ; AVX512VL-NEXT: retq
2830 %mask1 = fcmp oeq <4 x float> %x, %y
2831 %mask = extractelement <4 x i1> %mask1, i64 0
2832 %s = extractelement <4 x float> %x, i64 0
2833 %call = tail call float @llvm.ceil.f32(float %s)
2834 %low = select i1 %mask, float %call, float zeroinitializer
2835 %res = insertelement <4 x float> %y, float %low, i64 0
2836 ret <4 x float> %res
; ceil_mask_sd_mask8: f64 counterpart of ceil_mask_ss_mask8 — merge-masked
; scalar ceil keyed on lane 0 of a vector fcmp, passthrough from %w.
2839 define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
2840 ; SSE41-LABEL: ceil_mask_sd_mask8:
2842 ; SSE41-NEXT: movapd %xmm0, %xmm3
2843 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
2844 ; SSE41-NEXT: pextrb $0, %xmm3, %eax
2845 ; SSE41-NEXT: testb $1, %al
2846 ; SSE41-NEXT: je LBB88_2
2847 ; SSE41-NEXT: ## %bb.1:
2848 ; SSE41-NEXT: xorps %xmm2, %xmm2
2849 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm2
2850 ; SSE41-NEXT: LBB88_2:
2851 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2852 ; SSE41-NEXT: movapd %xmm1, %xmm0
2855 ; AVX-LABEL: ceil_mask_sd_mask8:
2857 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm3
2858 ; AVX-NEXT: vpextrb $0, %xmm3, %eax
2859 ; AVX-NEXT: testb $1, %al
2860 ; AVX-NEXT: je LBB88_2
2861 ; AVX-NEXT: ## %bb.1:
2862 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
2863 ; AVX-NEXT: LBB88_2:
2864 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
2867 ; AVX512F-LABEL: ceil_mask_sd_mask8:
2868 ; AVX512F: ## %bb.0:
2869 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2870 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2871 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2872 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
2873 ; AVX512F-NEXT: vmovapd %xmm2, %xmm0
2874 ; AVX512F-NEXT: vzeroupper
2875 ; AVX512F-NEXT: retq
2877 ; AVX512VL-LABEL: ceil_mask_sd_mask8:
2878 ; AVX512VL: ## %bb.0:
2879 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
2880 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1}
2881 ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0
2882 ; AVX512VL-NEXT: retq
2883 %mask1 = fcmp oeq <2 x double> %x, %y
2884 %mask = extractelement <2 x i1> %mask1, i64 0
2885 %s = extractelement <2 x double> %x, i64 0
2886 %call = tail call double @llvm.ceil.f64(double %s)
2887 %dst = extractelement <2 x double> %w, i64 0
2888 %low = select i1 %mask, double %call, double %dst
2889 %res = insertelement <2 x double> %y, double %low, i64 0
2890 ret <2 x double> %res
; ceil_maskz_sd_mask8: zero-masked f64 variant — lane 0 becomes 0.0 when
; lane 0 of the fcmp is false. AVX512 folds to vrndscalesd $2 {z};
; SSE41/AVX extract the compare bit with pextrb and branch.
2893 define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
2894 ; SSE41-LABEL: ceil_maskz_sd_mask8:
2896 ; SSE41-NEXT: movapd %xmm0, %xmm2
2897 ; SSE41-NEXT: cmpeqpd %xmm1, %xmm2
2898 ; SSE41-NEXT: pextrb $0, %xmm2, %eax
2899 ; SSE41-NEXT: testb $1, %al
2900 ; SSE41-NEXT: jne LBB89_1
2901 ; SSE41-NEXT: ## %bb.2:
2902 ; SSE41-NEXT: xorpd %xmm0, %xmm0
2903 ; SSE41-NEXT: jmp LBB89_3
2904 ; SSE41-NEXT: LBB89_1:
2905 ; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
2906 ; SSE41-NEXT: LBB89_3:
2907 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2908 ; SSE41-NEXT: movapd %xmm1, %xmm0
2911 ; AVX-LABEL: ceil_maskz_sd_mask8:
2913 ; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
2914 ; AVX-NEXT: vpextrb $0, %xmm2, %eax
2915 ; AVX-NEXT: testb $1, %al
2916 ; AVX-NEXT: jne LBB89_1
2917 ; AVX-NEXT: ## %bb.2:
2918 ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
2919 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2921 ; AVX-NEXT: LBB89_1:
2922 ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
2923 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2926 ; AVX512F-LABEL: ceil_maskz_sd_mask8:
2927 ; AVX512F: ## %bb.0:
2928 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2929 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2930 ; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
2931 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2932 ; AVX512F-NEXT: vzeroupper
2933 ; AVX512F-NEXT: retq
2935 ; AVX512VL-LABEL: ceil_maskz_sd_mask8:
2936 ; AVX512VL: ## %bb.0:
2937 ; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
2938 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z}
2939 ; AVX512VL-NEXT: retq
2940 %mask1 = fcmp oeq <2 x double> %x, %y
2941 %mask = extractelement <2 x i1> %mask1, i64 0
2942 %s = extractelement <2 x double> %x, i64 0
2943 %call = tail call double @llvm.ceil.f64(double %s)
2944 %low = select i1 %mask, double %call, double zeroinitializer
2945 %res = insertelement <2 x double> %y, double %low, i64 0
2946 ret <2 x double> %res