; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
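;
; Editorial note on the rounding-control immediates used throughout (the
; SSE4.1 ROUNDP*/ROUNDS* imm8 and the low nibble of AVX-512 VRNDSCALE*
; agree): bits 1:0 select the rounding mode (01 = toward -inf, 10 = toward
; +inf, 11 = toward zero), bit 2 selects the MXCSR rounding mode instead,
; and bit 3 suppresses the precision (inexact) exception. Hence $9 = floor,
; $10 = ceil, $11 = trunc, $4 = rint (MXCSR mode, inexact reported), and
; $12 = nearbyint (MXCSR mode, inexact suppressed).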

define <2 x double> @floor_v2f64(<2 x double> %p) {
; SSE41-LABEL: floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)

define <4 x float> @floor_v4f32(<4 x float> %p) {
; SSE41-LABEL: floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)

define <4 x double> @floor_v4f64(<4 x double> %p){
; SSE41-LABEL: floor_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)

define <8 x float> @floor_v8f32(<8 x float> %p) {
; SSE41-LABEL: floor_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)

define <8 x double> @floor_v8f64(<8 x double> %p){
; SSE41-LABEL: floor_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.floor.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)

define <16 x float> @floor_v16f32(<16 x float> %p) {
; SSE41-LABEL: floor_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    roundps $9, %xmm2, %xmm2
; SSE41-NEXT:    roundps $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.floor.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)

define <2 x double> @ceil_v2f64(<2 x double> %p) {
; SSE41-LABEL: ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)

define <2 x double> @ceil_v2f64_load(ptr %ptr) {
; SSE41-LABEL: ceil_v2f64_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movupd (%rdi), %xmm0
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <2 x double>, ptr %ptr, align 1
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}

define <4 x float> @ceil_v4f32(<4 x float> %p) {
; SSE41-LABEL: ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)

define <4 x float> @ceil_v4f32_load(ptr %ptr) {
; SSE41-LABEL: ceil_v4f32_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <4 x float>, ptr %ptr, align 1
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}

define <4 x double> @ceil_v4f64(<4 x double> %p) {
; SSE41-LABEL: ceil_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)

define <8 x float> @ceil_v8f32(<8 x float> %p) {
; SSE41-LABEL: ceil_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)

define <8 x double> @ceil_v8f64(<8 x double> %p){
; SSE41-LABEL: ceil_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)

define <16 x float> @ceil_v16f32(<16 x float> %p) {
; SSE41-LABEL: ceil_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
; SSE41-NEXT:    roundps $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)

define <2 x double> @trunc_v2f64(<2 x double> %p) {
; SSE41-LABEL: trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)

define <4 x float> @trunc_v4f32(<4 x float> %p) {
; SSE41-LABEL: trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)

define <4 x double> @trunc_v4f64(<4 x double> %p) {
; SSE41-LABEL: trunc_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)

define <8 x float> @trunc_v8f32(<8 x float> %p) {
; SSE41-LABEL: trunc_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)

define <8 x double> @trunc_v8f64(<8 x double> %p){
; SSE41-LABEL: trunc_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)

define <16 x float> @trunc_v16f32(<16 x float> %p) {
; SSE41-LABEL: trunc_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    roundps $11, %xmm2, %xmm2
; SSE41-NEXT:    roundps $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)

define <2 x double> @rint_v2f64(<2 x double> %p) {
; SSE41-LABEL: rint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)

define <4 x float> @rint_v4f32(<4 x float> %p) {
; SSE41-LABEL: rint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)

define <4 x double> @rint_v4f64(<4 x double> %p) {
; SSE41-LABEL: rint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)

define <8 x float> @rint_v8f32(<8 x float> %p) {
; SSE41-LABEL: rint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)

define <8 x double> @rint_v8f64(<8 x double> %p){
; SSE41-LABEL: rint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $4, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)

define <16 x float> @rint_v16f32(<16 x float> %p) {
; SSE41-LABEL: rint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    roundps $4, %xmm2, %xmm2
; SSE41-NEXT:    roundps $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    vroundps $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)

define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
; SSE41-LABEL: nearbyint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)

define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
; SSE41-LABEL: nearbyint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)

define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
; SSE41-LABEL: nearbyint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)

define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
; SSE41-LABEL: nearbyint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)

define <8 x double> @nearbyint_v8f64(<8 x double> %p){
; SSE41-LABEL: nearbyint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $12, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)

define <16 x float> @nearbyint_v16f32(<16 x float> %p) {
; SSE41-LABEL: nearbyint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    roundps $12, %xmm2, %xmm2
; SSE41-NEXT:    roundps $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    vroundps $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
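
;
; Constant folding: the rounding intrinsics on constant operands are
; evaluated at compile time, so the const_* tests below expect only a load
; of the pre-rounded constant and no round instruction.
;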

define <2 x double> @const_floor_v2f64() {
; SSE41-LABEL: const_floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_floor_v4f32() {
; SSE41-LABEL: const_floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_ceil_v2f64() {
; SSE41-LABEL: const_ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_ceil_v4f32() {
; SSE41-LABEL: const_ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_trunc_v2f64() {
; SSE41-LABEL: const_trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_trunc_v4f32() {
; SSE41-LABEL: const_trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

;
; Scalar and masked instructions
;
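; A select between the rounded result and another operand folds into an
; AVX512VL masked vrndscale ({%k1}, or {%k1} {z} for zero-masking). Plain
; AVX512F has no 128/256-bit masked operations, so it widens to 512-bit
; zmm registers (hence the "## kill" annotations and vzeroupper), while
; SSE4.1/AVX lower the same pattern to blendv/and sequences.
;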

define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT:    retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.floor.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT:    retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.floor.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.floor.f64(double %s)

define <4 x float> @floor_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm0, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @floor_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}

define <8 x float> @floor_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @floor_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm1, %xmm3
; SSE41-NEXT:    cmpeqps %xmm0, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @floor_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @floor_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm3
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqps %xmm7, %xmm3
; SSE41-NEXT:    roundps $9, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqps %xmm6, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqps %xmm5, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqps %xmm4, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movaps %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm6, %xmm2
; SSE41-NEXT:    movaps %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @floor_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm3, %xmm7
; SSE41-NEXT:    cmpeqps %xmm2, %xmm6
; SSE41-NEXT:    cmpeqps %xmm1, %xmm5
; SSE41-NEXT:    cmpeqps %xmm0, %xmm4
; SSE41-NEXT:    roundps $9, %xmm3, %xmm3
; SSE41-NEXT:    andps %xmm7, %xmm3
; SSE41-NEXT:    roundps $9, %xmm2, %xmm2
; SSE41-NEXT:    andps %xmm6, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm5, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqpd %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movapd %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movapd %xmm4, %xmm0
; SSE41-NEXT:    movapd %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm6, %xmm2
; SSE41-NEXT:    movapd %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @floor_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm7
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm6
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm4
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm3
; SSE41-NEXT:    andpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm2
; SSE41-NEXT:    andpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
; SSE41-LABEL: floor_mask_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB52_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB52_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB52_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB52_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %nmask, float %dst, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
; SSE41-LABEL: floor_maskz_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    je LBB53_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB53_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB53_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB53_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %nmask, float zeroinitializer, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
; SSE41-LABEL: floor_mask_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB54_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB54_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB54_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB54_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %nmask, double %dst, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
; SSE41-LABEL: floor_maskz_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorpd %xmm2, %xmm2
; SSE41-NEXT:    je LBB55_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB55_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB55_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB55_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %nmask, double zeroinitializer, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB56_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB56_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB56_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB56_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB57_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB57_3
; SSE41-NEXT:  LBB57_1:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
; SSE41-NEXT:  LBB57_3:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB57_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB57_1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB58_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB58_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB58_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB58_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB59_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorpd %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB59_3
; SSE41-NEXT:  LBB59_1:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
; SSE41-NEXT:  LBB59_3:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB59_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB59_1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: floor_mask_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm0, %xmm3
; SSE41-NEXT:    andnps %xmm2, %xmm0
; SSE41-NEXT:    orps %xmm3, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: floor_mask_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm0, %xmm3
; SSE41-NEXT:    andnpd %xmm2, %xmm0
; SSE41-NEXT:    orpd %xmm3, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT:    retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.ceil.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.ceil.f32(float %s)

define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT:    retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.ceil.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.ceil.f64(double %s)

define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm0, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}
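
; 256-bit masked tests: SSE41 legalizes to two 128-bit round+blend pairs, AVX
; and AVX512F operate on the full ymm, and AVX512VL folds the select into a
; masked vrndscaleps/vrndscalepd.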

define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqps %xmm3, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @ceil_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm1, %xmm3
; SSE41-NEXT:    cmpeqps %xmm0, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm3, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @ceil_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm3
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}
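
; 512-bit masked tests: vroundps/vroundpd have no zmm form, so AVX512 uses
; vrndscaleps/vrndscalepd directly on zmm; SSE41 and AVX split the vector into
; four and two pieces respectively.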

define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqps %xmm7, %xmm3
; SSE41-NEXT:    roundps $10, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqps %xmm6, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqps %xmm5, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqps %xmm4, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movaps %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm6, %xmm2
; SSE41-NEXT:    movaps %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @ceil_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm3, %xmm7
; SSE41-NEXT:    cmpeqps %xmm2, %xmm6
; SSE41-NEXT:    cmpeqps %xmm1, %xmm5
; SSE41-NEXT:    cmpeqps %xmm0, %xmm4
; SSE41-NEXT:    roundps $10, %xmm3, %xmm3
; SSE41-NEXT:    andps %xmm7, %xmm3
; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
; SSE41-NEXT:    andps %xmm6, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm5, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqpd %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movapd %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movapd %xmm4, %xmm0
; SSE41-NEXT:    movapd %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm6, %xmm2
; SSE41-NEXT:    movapd %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @ceil_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm7
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm6
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm4
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm3
; SSE41-NEXT:    andpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm2
; SSE41-NEXT:    andpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}
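
; Scalar masked tests: the mask is bit 0 of an i8 GPR, so SSE41/AVX branch
; around roundss/roundsd with testb, while AVX512 moves the mask into a
; k-register and uses a masked vmovss/vmovsd to merge the low element.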

define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB78_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB78_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB78_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB78_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %nmask, float %dst, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    je LBB79_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB79_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB79_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB79_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %nmask, float zeroinitializer, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB80_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB80_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB80_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB80_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %nmask, double %dst, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorpd %xmm2, %xmm2
; SSE41-NEXT:    je LBB81_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB81_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB81_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB81_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %nmask, double zeroinitializer, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
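
; _trunc variants: the i1 mask comes from truncating an i16 argument, which
; still lowers to a testb of the low bit (or kmovw on AVX512).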

define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB82_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB82_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB82_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB82_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB83_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB83_3
; SSE41-NEXT:  LBB83_1:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
; SSE41-NEXT:  LBB83_3:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB83_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB83_1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB84_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB84_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB84_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB84_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB85_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorpd %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB85_3
; SSE41-NEXT:  LBB85_1:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
; SSE41-NEXT:  LBB85_3:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB85_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB85_1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
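
; _mask8 variants: the scalar mask is element 0 of a vector fcmp, so AVX512
; can compare straight into a k-register with vcmpeqss/vcmpeqsd instead of
; going through a GPR.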

define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: ceil_mask_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm0, %xmm3
; SSE41-NEXT:    andnps %xmm2, %xmm0
; SSE41-NEXT:    orps %xmm3, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: ceil_mask_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm0, %xmm3
; SSE41-NEXT:    andnpd %xmm2, %xmm0
; SSE41-NEXT:    orpd %xmm3, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}