1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86
; Splat of a half loaded from memory into <8 x half> (insertelement at lane 0
; followed by an all-zero shuffle mask). Both runs expect vpbroadcastw with a
; memory operand into xmm0.
5 define <8 x half> @broadcastph128(ptr %x) {
6 ; X64-LABEL: broadcastph128:
8 ; X64-NEXT: vpbroadcastw (%rdi), %xmm0
11 ; X86-LABEL: broadcastph128:
13 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
14 ; X86-NEXT: vpbroadcastw (%eax), %xmm0
16 %l1 = load half, ptr %x, align 2
17 %vec = insertelement <8 x half> undef, half %l1, i32 0
18 %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
; 256-bit variant: splat of a loaded half into <16 x half>. Both runs expect
; vpbroadcastw with a memory operand into ymm0.
22 define <16 x half> @broadcastph256(ptr %x) {
23 ; X64-LABEL: broadcastph256:
25 ; X64-NEXT: vpbroadcastw (%rdi), %ymm0
28 ; X86-LABEL: broadcastph256:
30 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
31 ; X86-NEXT: vpbroadcastw (%eax), %ymm0
33 %l1 = load half, ptr %x, align 2
34 %vec = insertelement <16 x half> undef, half %l1, i32 0
35 %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
; 512-bit variant: splat of a loaded half into <32 x half>. Both runs expect
; vpbroadcastw with a memory operand into zmm0.
39 define <32 x half> @broadcastph512(ptr %x) {
40 ; X64-LABEL: broadcastph512:
42 ; X64-NEXT: vpbroadcastw (%rdi), %zmm0
45 ; X86-LABEL: broadcastph512:
47 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
48 ; X86-NEXT: vpbroadcastw (%eax), %zmm0
50 %l1 = load half, ptr %x, align 2
51 %vec = insertelement <32 x half> undef, half %l1, i32 0
52 %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
; Splat of a scalar half argument into <8 x half>. The 64-bit run expects a
; register-to-register vpbroadcastw from xmm0; the 32-bit run expects the
; broadcast to read the argument directly from the stack.
56 define <8 x half> @broadcastph128_scalar(half %x) {
57 ; X64-LABEL: broadcastph128_scalar:
59 ; X64-NEXT: vpbroadcastw %xmm0, %xmm0
62 ; X86-LABEL: broadcastph128_scalar:
64 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
66 %vec = insertelement <8 x half> undef, half %x, i32 0
67 %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
; Splat of a scalar half argument into <16 x half>; same pattern as the
; 128-bit scalar case with ymm0 as the destination.
71 define <16 x half> @broadcastph256_scalar(half %x) {
72 ; X64-LABEL: broadcastph256_scalar:
74 ; X64-NEXT: vpbroadcastw %xmm0, %ymm0
77 ; X86-LABEL: broadcastph256_scalar:
79 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
81 %vec = insertelement <16 x half> undef, half %x, i32 0
82 %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
; Splat of a scalar half argument into <32 x half>; same pattern as the
; 128-bit scalar case with zmm0 as the destination.
86 define <32 x half> @broadcastph512_scalar(half %x) {
87 ; X64-LABEL: broadcastph512_scalar:
89 ; X64-NEXT: vpbroadcastw %xmm0, %zmm0
92 ; X86-LABEL: broadcastph512_scalar:
94 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
96 %vec = insertelement <32 x half> undef, half %x, i32 0
97 %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
; Splat of lane 0 of a <8 x half> vector argument (all-zero shuffle mask).
; The shared checks expect a single register vpbroadcastw followed by ret.
101 define <8 x half> @broadcastph128_reg(<8 x half> %x) {
102 ; CHECK-LABEL: broadcastph128_reg:
104 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
105 ; CHECK-NEXT: ret{{[l|q]}}
106 %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
; Splat of lane 0 of a <16 x half> vector argument. The shared checks expect
; vpbroadcastw reading xmm0 and writing ymm0.
110 define <16 x half> @broadcastph256_reg(<16 x half> %x) {
111 ; CHECK-LABEL: broadcastph256_reg:
113 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
114 ; CHECK-NEXT: ret{{[l|q]}}
115 %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
; Splat of lane 0 of a <32 x half> vector argument. The shared checks expect
; vpbroadcastw reading xmm0 and writing zmm0.
119 define <32 x half> @broadcastph512_reg(<32 x half> %x) {
120 ; CHECK-LABEL: broadcastph512_reg:
122 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
123 ; CHECK-NEXT: ret{{[l|q]}}
124 %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
; Bitcast of a half argument to i16. The 64-bit run expects vmovw from xmm0 to
; eax; the 32-bit run expects a zero-extending 16-bit load from the stack.
128 define i16 @test1(half %x) {
131 ; X64-NEXT: vmovw %xmm0, %eax
132 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
137 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
139 %res = bitcast half %x to i16
; Insert of an i16 argument into lane 0 of an undef <8 x i16>. The 64-bit run
; expects vmovw from edi; the 32-bit run expects vpbroadcastw from the stack.
143 define <8 x i16> @test2(i16 %x) {
146 ; X64-NEXT: vmovw %edi, %xmm0
151 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
153 %res = insertelement <8 x i16>undef, i16 %x, i32 0
; Insert of a loaded i16 into lane 0 of an undef <8 x i16>. Both runs expect
; vpbroadcastw with a memory operand.
157 define <8 x i16> @test4(ptr %x) {
160 ; X64-NEXT: vpbroadcastw (%rdi), %xmm0
165 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
166 ; X86-NEXT: vpbroadcastw (%eax), %xmm0
168 %y = load i16, ptr %x
169 %res = insertelement <8 x i16>undef, i16 %y, i32 0
; Store of a scalar half argument through a pointer. Both runs expect the
; store to be a vmovsh to memory; the 32-bit run first loads the half
; argument with vmovsh.
173 define void @test5(half %x, ptr %y) {
176 ; X64-NEXT: vmovsh %xmm0, (%rdi)
181 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
182 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
183 ; X86-NEXT: vmovsh %xmm0, (%eax)
185 store half %x, ptr %y, align 2
; Load of an i16 that is bitcast to half. Both runs expect the load to be
; selected as a vmovsh from memory rather than an integer load plus move.
189 define half @test7(ptr %x) {
192 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
197 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
198 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
200 %y = load i16, ptr %x
201 %res = bitcast i16 %y to half
; Insert of a loaded i16 into lane 0 of an all-zero <8 x i16>. Both runs
; expect a vmovw load into xmm0.
205 define <8 x i16> @test10(ptr %x) {
208 ; X64-NEXT: vmovw (%rdi), %xmm0
213 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
214 ; X86-NEXT: vmovw (%eax), %xmm0
216 %y = load i16, ptr %x, align 2
217 %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
; Same as test10 with a <16 x i16> result; the checks still expect the xmm
; form of the vmovw load.
221 define <16 x i16> @test10b(ptr %x) {
222 ; X64-LABEL: test10b:
224 ; X64-NEXT: vmovw (%rdi), %xmm0
227 ; X86-LABEL: test10b:
229 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
230 ; X86-NEXT: vmovw (%eax), %xmm0
232 %y = load i16, ptr %x, align 2
233 %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
; Same as test10 with a <32 x i16> result; the checks still expect the xmm
; form of the vmovw load.
237 define <32 x i16> @test10c(ptr %x) {
238 ; X64-LABEL: test10c:
240 ; X64-NEXT: vmovw (%rdi), %xmm0
243 ; X86-LABEL: test10c:
245 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
246 ; X86-NEXT: vmovw (%eax), %xmm0
248 %y = load i16, ptr %x, align 2
249 %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
; Insert of a loaded half into lane 0 of an all-zero <8 x half>. Both runs
; expect a vmovsh load whose remaining lanes are shown as zero.
253 define <8 x half> @test11(ptr %x) {
256 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
261 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
262 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
264 %y = load half, ptr %x, align 2
265 %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
; Same as test11 with a <16 x half> result; the checks still expect the xmm
; vmovsh load.
269 define <16 x half> @test11b(ptr %x) {
270 ; X64-LABEL: test11b:
272 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
275 ; X86-LABEL: test11b:
277 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
278 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
280 %y = load half, ptr %x, align 2
281 %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
; Same as test11 with a <32 x half> result; the checks still expect the xmm
; vmovsh load.
285 define <32 x half> @test11c(ptr %x) {
286 ; X64-LABEL: test11c:
288 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
291 ; X86-LABEL: test11c:
293 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
294 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
296 %y = load half, ptr %x, align 2
297 %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
; Insert of a half argument into lane 0 of an all-zero <8 x half>. The 64-bit
; run expects a zeroed register (vxorps) merged with vmovsh; the 32-bit run
; expects a vmovsh load from memory.
301 define <8 x half> @test14(half %x) {
304 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
305 ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
310 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
312 %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
; Same as test14 with a <16 x half> result; the expected instructions are
; unchanged and still operate on xmm registers.
316 define <16 x half> @test14b(half %x) {
317 ; X64-LABEL: test14b:
319 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
320 ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
323 ; X86-LABEL: test14b:
325 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
327 %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
; Same as test14 with a <32 x half> result; the expected instructions are
; unchanged and still operate on xmm registers.
331 define <32 x half> @test14c(half %x) {
332 ; X64-LABEL: test14c:
334 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
335 ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
338 ; X86-LABEL: test14c:
340 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
342 %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
; Insert of an i16 argument into lane 0 of an all-zero <8 x i16>. The checks
; expect vmovw from edi (64-bit) or from the stack (32-bit).
346 define <8 x i16> @test15(i16 %x) {
349 ; X64-NEXT: vmovw %edi, %xmm0
354 ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
356 %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
; Same as test15 with a <16 x i16> result; the checks still expect the xmm
; form of vmovw.
360 define <16 x i16> @test16(i16 %x) {
363 ; X64-NEXT: vmovw %edi, %xmm0
368 ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
370 %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
; Same as test15 with a <32 x i16> result; the checks still expect the xmm
; form of vmovw.
374 define <32 x i16> @test17(i16 %x) {
377 ; X64-NEXT: vmovw %edi, %xmm0
382 ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
384 %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
; Insert of an i16 argument into lane 0 of an undef <8 x i16>. The 64-bit run
; expects vmovw from edi; the 32-bit run expects vpbroadcastw from the stack.
388 define <8 x i16> @test18(i16 %x) {
391 ; X64-NEXT: vmovw %edi, %xmm0
396 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
398 %res = insertelement <8 x i16> undef, i16 %x, i32 0
; Same as test18 with a <16 x i16> result. The 64-bit run keeps the xmm vmovw;
; the 32-bit run expects vpbroadcastw from the stack into ymm0.
402 define <16 x i16> @test19(i16 %x) {
405 ; X64-NEXT: vmovw %edi, %xmm0
410 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
412 %res = insertelement <16 x i16> undef, i16 %x, i32 0
; Same as test18 with a <32 x i16> result. The 64-bit run keeps the xmm vmovw;
; the 32-bit run expects vpbroadcastw from the stack into zmm0.
416 define <32 x i16> @test20(i16 %x) {
419 ; X64-NEXT: vmovw %edi, %xmm0
424 ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
426 %res = insertelement <32 x i16> undef, i16 %x, i32 0
; External globals used as store targets by the store*/storeu* tests below.
; The "u" variants are declared with align 8, which is smaller than the vector
; size, so stores to them are expected to use unaligned moves.
430 @g8f16 = external global <8 x half>
431 @g8f16u = external global <8 x half>, align 8
432 @g16f16 = external global <16 x half>
433 @g16f16u = external global <16 x half>, align 8
434 @g32f16 = external global <32 x half>
435 @g32f16u = external global <32 x half>, align 8
; Load of <32 x half> with no explicit alignment. Both runs expect the
; aligned form vmovaps into zmm0.
437 define <32 x half> @load32f16(ptr %a) {
438 ; X64-LABEL: load32f16:
440 ; X64-NEXT: vmovaps (%rdi), %zmm0
443 ; X86-LABEL: load32f16:
445 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
446 ; X86-NEXT: vmovaps (%eax), %zmm0
448 %res = load <32 x half>, ptr %a
; Load of <32 x half> blended with %b under a mask bitcast from i32. The
; checks expect the select to fold into a merge-masked vmovdqu16 load.
452 define <32 x half> @load32f16mask(ptr %a, <32 x half> %b, i32 %c) {
453 ; X64-LABEL: load32f16mask:
455 ; X64-NEXT: kmovd %esi, %k1
456 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
459 ; X86-LABEL: load32f16mask:
461 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
462 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
463 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
465 %msk = bitcast i32 %c to <32 x i1>
466 %res0 = load <32 x half>, ptr %a
467 %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
; Load of <32 x half> selected against zeroinitializer under an i32 mask. The
; checks expect a zero-masked ({z}) vmovdqu16 load.
471 define <32 x half> @load32f16maskz(ptr %a, i32 %c) {
472 ; X64-LABEL: load32f16maskz:
474 ; X64-NEXT: kmovd %esi, %k1
475 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
478 ; X86-LABEL: load32f16maskz:
480 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
481 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
482 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
484 %msk = bitcast i32 %c to <32 x i1>
485 %res0 = load <32 x half>, ptr %a
486 %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
; Load of <32 x half> with align 8 (under-aligned for the vector). Both runs
; expect the unaligned form vmovups into zmm0.
490 define <32 x half> @loadu32f16(ptr %a) {
491 ; X64-LABEL: loadu32f16:
493 ; X64-NEXT: vmovups (%rdi), %zmm0
496 ; X86-LABEL: loadu32f16:
498 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
499 ; X86-NEXT: vmovups (%eax), %zmm0
501 %res = load <32 x half>, ptr %a, align 8
; align-8 load of <32 x half> blended with %b under an i32 mask. The checks
; expect a merge-masked vmovdqu16 load.
505 define <32 x half> @loadu32f16mask(ptr %a, <32 x half> %b, i32 %c) {
506 ; X64-LABEL: loadu32f16mask:
508 ; X64-NEXT: kmovd %esi, %k1
509 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
512 ; X86-LABEL: loadu32f16mask:
514 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
515 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
516 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
518 %msk = bitcast i32 %c to <32 x i1>
519 %res0 = load <32 x half>, ptr %a, align 8
520 %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
; align-8 load of <32 x half> selected against zero under an i32 mask. The
; checks expect a zero-masked ({z}) vmovdqu16 load.
524 define <32 x half> @loadu32f16maskz(ptr %a, i32 %c) {
525 ; X64-LABEL: loadu32f16maskz:
527 ; X64-NEXT: kmovd %esi, %k1
528 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
531 ; X86-LABEL: loadu32f16maskz:
533 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
534 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
535 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
537 %msk = bitcast i32 %c to <32 x i1>
538 %res0 = load <32 x half>, ptr %a, align 8
539 %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
; Store of <32 x half> to @g32f16 with no explicit alignment. The checks
; expect the aligned form vmovaps followed by vzeroupper.
543 define void @store32f16(<32 x half> %a) {
544 ; X64-LABEL: store32f16:
546 ; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax
547 ; X64-NEXT: vmovaps %zmm0, (%rax)
548 ; X64-NEXT: vzeroupper
551 ; X86-LABEL: store32f16:
553 ; X86-NEXT: vmovaps %zmm0, g32f16
554 ; X86-NEXT: vzeroupper
556 store <32 x half> %a, ptr @g32f16
; Store of <32 x half> to @g32f16u with align 8. The checks expect the
; unaligned form vmovups followed by vzeroupper.
560 define void @storeu32f16(<32 x half> %a) {
561 ; X64-LABEL: storeu32f16:
563 ; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax
564 ; X64-NEXT: vmovups %zmm0, (%rax)
565 ; X64-NEXT: vzeroupper
568 ; X86-LABEL: storeu32f16:
570 ; X86-NEXT: vmovups %zmm0, g32f16u
571 ; X86-NEXT: vzeroupper
573 store <32 x half> %a, ptr @g32f16u, align 8
; Masked store/load intrinsics for <32 x half>, called by the masked
; store/load tests that follow.
577 declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
578 declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>)
; Masked store of <32 x half> with a <32 x i1> vector mask argument. The checks
; expect the mask to be moved into k1 (vpsllw $7 + vpmovb2m) and then a masked
; vmovdqu16 store.
580 define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
581 ; X64-LABEL: storeu32f16mask:
583 ; X64-NEXT: vpsllw $7, %ymm0, %ymm0
584 ; X64-NEXT: vpmovb2m %ymm0, %k1
585 ; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
586 ; X64-NEXT: vzeroupper
589 ; X86-LABEL: storeu32f16mask:
591 ; X86-NEXT: vpsllw $7, %ymm0, %ymm0
592 ; X86-NEXT: vpmovb2m %ymm0, %k1
593 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
594 ; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1}
595 ; X86-NEXT: vzeroupper
597 call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
; Masked load of <32 x half> with %val as the pass-through operand. The checks
; expect a merge-masked vmovdqu16 load into zmm0.
601 define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
602 ; X64-LABEL: maskloadu32f16:
604 ; X64-NEXT: vpsllw $7, %ymm1, %ymm1
605 ; X64-NEXT: vpmovb2m %ymm1, %k1
606 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
609 ; X86-LABEL: maskloadu32f16:
611 ; X86-NEXT: vpsllw $7, %ymm1, %ymm1
612 ; X86-NEXT: vpmovb2m %ymm1, %k1
613 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
614 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
616 %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
; Masked load of <32 x half> with an undef pass-through operand. The checks
; expect the zero-masked ({z}) form of vmovdqu16.
620 define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
621 ; X64-LABEL: maskuloadu32f16:
623 ; X64-NEXT: vpsllw $7, %ymm0, %ymm0
624 ; X64-NEXT: vpmovb2m %ymm0, %k1
625 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
628 ; X86-LABEL: maskuloadu32f16:
630 ; X86-NEXT: vpsllw $7, %ymm0, %ymm0
631 ; X86-NEXT: vpmovb2m %ymm0, %k1
632 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
633 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
635 %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
; Masked load of <32 x half> with a zeroinitializer pass-through operand. The
; checks expect the zero-masked ({z}) form of vmovdqu16.
639 define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
640 ; X64-LABEL: maskzloadu32f16:
642 ; X64-NEXT: vpsllw $7, %ymm0, %ymm0
643 ; X64-NEXT: vpmovb2m %ymm0, %k1
644 ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
647 ; X86-LABEL: maskzloadu32f16:
649 ; X86-NEXT: vpsllw $7, %ymm0, %ymm0
650 ; X86-NEXT: vpmovb2m %ymm0, %k1
651 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
652 ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
654 %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
; Register-to-register move of <32 x half>: the shared checks expect a single
; vmovaps from zmm1 to zmm0 (presumably the function returns %b; the IR body
; is not visible here, so confirm against the full file).
658 define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
659 ; CHECK-LABEL: movrr32f16:
661 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
662 ; CHECK-NEXT: ret{{[l|q]}}
; Select between two <32 x half> registers under a mask bitcast from i32. The
; checks expect kmovd into k1 followed by a masked vpblendmw.
666 define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
667 ; X64-LABEL: movrrk32f16:
669 ; X64-NEXT: kmovd %edi, %k1
670 ; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
673 ; X86-LABEL: movrrk32f16:
675 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
676 ; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
678 %mask = bitcast i32 %msk to <32 x i1>
679 %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
; Select of a <32 x half> register against zero under an i32 mask. The checks
; expect a zero-masked ({z}) register vmovdqu16.
683 define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
684 ; X64-LABEL: movrrkz32f16:
686 ; X64-NEXT: kmovd %edi, %k1
687 ; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
690 ; X86-LABEL: movrrkz32f16:
692 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
693 ; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
695 %mask = bitcast i32 %msk to <32 x i1>
696 %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
; Load of <16 x half> with no explicit alignment. Both runs expect the
; aligned form vmovaps into ymm0.
700 define <16 x half> @load16f16(ptr %a) {
701 ; X64-LABEL: load16f16:
703 ; X64-NEXT: vmovaps (%rdi), %ymm0
706 ; X86-LABEL: load16f16:
708 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
709 ; X86-NEXT: vmovaps (%eax), %ymm0
711 %res = load <16 x half>, ptr %a
; Load of <16 x half> blended with %b under a mask bitcast from i16. The
; checks expect a merge-masked vmovdqu16 load; the 32-bit run loads the mask
; from the stack with kmovw.
715 define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
716 ; X64-LABEL: load16f16mask:
718 ; X64-NEXT: kmovd %esi, %k1
719 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
722 ; X86-LABEL: load16f16mask:
724 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
725 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
726 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
728 %msk = bitcast i16 %c to <16 x i1>
729 %res0 = load <16 x half>, ptr %a
730 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
; Load of <16 x half> selected against zero under an i16 mask. The checks
; expect a zero-masked ({z}) vmovdqu16 load.
734 define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
735 ; X64-LABEL: load16f16maskz:
737 ; X64-NEXT: kmovd %esi, %k1
738 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
741 ; X86-LABEL: load16f16maskz:
743 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
744 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
745 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
747 %msk = bitcast i16 %c to <16 x i1>
748 %res0 = load <16 x half>, ptr %a
749 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
; Load of <16 x half> with align 8 (under-aligned for the vector). Both runs
; expect the unaligned form vmovups into ymm0.
753 define <16 x half> @loadu16f16(ptr %a) {
754 ; X64-LABEL: loadu16f16:
756 ; X64-NEXT: vmovups (%rdi), %ymm0
759 ; X86-LABEL: loadu16f16:
761 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
762 ; X86-NEXT: vmovups (%eax), %ymm0
764 %res = load <16 x half>, ptr %a, align 8
; align-8 load of <16 x half> blended with %b under an i16 mask. The checks
; expect a merge-masked vmovdqu16 load.
768 define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
769 ; X64-LABEL: loadu16f16mask:
771 ; X64-NEXT: kmovd %esi, %k1
772 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
775 ; X86-LABEL: loadu16f16mask:
777 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
778 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
779 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
781 %msk = bitcast i16 %c to <16 x i1>
782 %res0 = load <16 x half>, ptr %a, align 8
783 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
; align-8 load of <16 x half> selected against zero under an i16 mask. The
; checks expect a zero-masked ({z}) vmovdqu16 load.
787 define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
788 ; X64-LABEL: loadu16f16maskz:
790 ; X64-NEXT: kmovd %esi, %k1
791 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
794 ; X86-LABEL: loadu16f16maskz:
796 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
797 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
798 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
800 %msk = bitcast i16 %c to <16 x i1>
801 %res0 = load <16 x half>, ptr %a, align 8
802 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
; Store of <16 x half> to @g16f16 with no explicit alignment. The checks
; expect the aligned form vmovaps followed by vzeroupper.
806 define void @store16f16(<16 x half> %a) {
807 ; X64-LABEL: store16f16:
809 ; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax
810 ; X64-NEXT: vmovaps %ymm0, (%rax)
811 ; X64-NEXT: vzeroupper
814 ; X86-LABEL: store16f16:
816 ; X86-NEXT: vmovaps %ymm0, g16f16
817 ; X86-NEXT: vzeroupper
819 store <16 x half> %a, ptr @g16f16
; Store of <16 x half> to @g16f16u with align 8. The checks expect the
; unaligned form vmovups followed by vzeroupper.
823 define void @storeu16f16(<16 x half> %a) {
824 ; X64-LABEL: storeu16f16:
826 ; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax
827 ; X64-NEXT: vmovups %ymm0, (%rax)
828 ; X64-NEXT: vzeroupper
831 ; X86-LABEL: storeu16f16:
833 ; X86-NEXT: vmovups %ymm0, g16f16u
834 ; X86-NEXT: vzeroupper
836 store <16 x half> %a, ptr @g16f16u, align 8
; Masked store/load intrinsics for <16 x half>, called by the masked
; store/load tests that follow.
840 declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
841 declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32, <16 x i1>, <16 x half>)
; Masked store of <16 x half> with a <16 x i1> vector mask argument. The checks
; expect the mask to be moved into k1 (vpsllw $7 + vpmovb2m) and then a masked
; vmovdqu16 store.
843 define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
844 ; X64-LABEL: storeu16f16mask:
846 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
847 ; X64-NEXT: vpmovb2m %xmm0, %k1
848 ; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
849 ; X64-NEXT: vzeroupper
852 ; X86-LABEL: storeu16f16mask:
854 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
855 ; X86-NEXT: vpmovb2m %xmm0, %k1
856 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
857 ; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
858 ; X86-NEXT: vzeroupper
860 call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
; Masked load of <16 x half> with %val as the pass-through operand. The checks
; expect a merge-masked vmovdqu16 load into ymm0.
864 define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
865 ; X64-LABEL: maskloadu16f16:
867 ; X64-NEXT: vpsllw $7, %xmm1, %xmm1
868 ; X64-NEXT: vpmovb2m %xmm1, %k1
869 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
872 ; X86-LABEL: maskloadu16f16:
874 ; X86-NEXT: vpsllw $7, %xmm1, %xmm1
875 ; X86-NEXT: vpmovb2m %xmm1, %k1
876 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
877 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
879 %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
; Masked load of <16 x half> with an undef pass-through operand. The checks
; expect the zero-masked ({z}) form of vmovdqu16.
883 define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
884 ; X64-LABEL: maskuloadu16f16:
886 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
887 ; X64-NEXT: vpmovb2m %xmm0, %k1
888 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
891 ; X86-LABEL: maskuloadu16f16:
893 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
894 ; X86-NEXT: vpmovb2m %xmm0, %k1
895 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
896 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
898 %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
; Masked load of <16 x half> with a zeroinitializer pass-through operand. The
; checks expect the zero-masked ({z}) form of vmovdqu16.
902 define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
903 ; X64-LABEL: maskzloadu16f16:
905 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
906 ; X64-NEXT: vpmovb2m %xmm0, %k1
907 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
910 ; X86-LABEL: maskzloadu16f16:
912 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
913 ; X86-NEXT: vpmovb2m %xmm0, %k1
914 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
915 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
917 %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
; Register-to-register move of <16 x half>: the shared checks expect a single
; vmovaps from ymm1 to ymm0 (presumably the function returns %b; the IR body
; is not visible here, so confirm against the full file).
921 define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
922 ; CHECK-LABEL: movrr16f16:
924 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
925 ; CHECK-NEXT: ret{{[l|q]}}
; Select between two <16 x half> registers under a mask bitcast from i16. The
; checks expect the mask in k1 followed by a masked vpblendmw on ymm registers.
929 define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
930 ; X64-LABEL: movrrk16f16:
932 ; X64-NEXT: kmovd %edi, %k1
933 ; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
936 ; X86-LABEL: movrrk16f16:
938 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
939 ; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
941 %mask = bitcast i16 %msk to <16 x i1>
942 %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
; Select of a <16 x half> register against zero under an i16 mask. The checks
; expect a zero-masked ({z}) register vmovdqu16.
946 define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
947 ; X64-LABEL: movrrkz16f16:
949 ; X64-NEXT: kmovd %edi, %k1
950 ; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
953 ; X86-LABEL: movrrkz16f16:
955 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
956 ; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
958 %mask = bitcast i16 %msk to <16 x i1>
959 %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
; Load of <8 x half> with no explicit alignment. Both runs expect the aligned
; form vmovaps into xmm0.
963 define <8 x half> @load8f16(ptr %a) {
964 ; X64-LABEL: load8f16:
966 ; X64-NEXT: vmovaps (%rdi), %xmm0
969 ; X86-LABEL: load8f16:
971 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
972 ; X86-NEXT: vmovaps (%eax), %xmm0
974 %res = load <8 x half>, ptr %a
; Load of <8 x half> blended with %b under a mask bitcast from i8. The checks
; expect a merge-masked vmovdqu16 load; the 32-bit run loads the mask from the
; stack with kmovb.
978 define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
979 ; X64-LABEL: load8f16mask:
981 ; X64-NEXT: kmovd %esi, %k1
982 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
985 ; X86-LABEL: load8f16mask:
987 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
988 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
989 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
991 %msk = bitcast i8 %c to <8 x i1>
992 %res0 = load <8 x half>, ptr %a
993 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
; Load of <8 x half> selected against zero under an i8 mask. The checks
; expect a zero-masked ({z}) vmovdqu16 load.
997 define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
998 ; X64-LABEL: load8f16maskz:
1000 ; X64-NEXT: kmovd %esi, %k1
1001 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1004 ; X86-LABEL: load8f16maskz:
1006 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1007 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1008 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1010 %msk = bitcast i8 %c to <8 x i1>
1011 %res0 = load <8 x half>, ptr %a
1012 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
; Load of <8 x half> with align 8 (under-aligned for the vector). Both runs
; expect the unaligned form vmovups into xmm0.
1016 define <8 x half> @loadu8f16(ptr %a) {
1017 ; X64-LABEL: loadu8f16:
1019 ; X64-NEXT: vmovups (%rdi), %xmm0
1022 ; X86-LABEL: loadu8f16:
1024 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1025 ; X86-NEXT: vmovups (%eax), %xmm0
1027 %res = load <8 x half>, ptr %a, align 8
; align-8 load of <8 x half> blended with %b under an i8 mask. The checks
; expect a merge-masked vmovdqu16 load.
1031 define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
1032 ; X64-LABEL: loadu8f16mask:
1034 ; X64-NEXT: kmovd %esi, %k1
1035 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
1038 ; X86-LABEL: loadu8f16mask:
1040 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1041 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1042 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
1044 %msk = bitcast i8 %c to <8 x i1>
1045 %res0 = load <8 x half>, ptr %a, align 8
1046 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
; align-8 load of <8 x half> selected against zero under an i8 mask. The
; checks expect a zero-masked ({z}) vmovdqu16 load.
1050 define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
1051 ; X64-LABEL: loadu8f16maskz:
1053 ; X64-NEXT: kmovd %esi, %k1
1054 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1057 ; X86-LABEL: loadu8f16maskz:
1059 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1060 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1061 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1063 %msk = bitcast i8 %c to <8 x i1>
1064 %res0 = load <8 x half>, ptr %a, align 8
1065 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
; Store of <8 x half> to @g8f16 with no explicit alignment. The checks expect
; the aligned form vmovaps.
1069 define void @store8f16(<8 x half> %a) {
1070 ; X64-LABEL: store8f16:
1072 ; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
1073 ; X64-NEXT: vmovaps %xmm0, (%rax)
1076 ; X86-LABEL: store8f16:
1078 ; X86-NEXT: vmovaps %xmm0, g8f16
1080 store <8 x half> %a, ptr @g8f16
; Store of <8 x half> to @g8f16u with align 8. The checks expect the
; unaligned form vmovups.
1084 define void @storeu8f16(<8 x half> %a) {
1085 ; X64-LABEL: storeu8f16:
1087 ; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
1088 ; X64-NEXT: vmovups %xmm0, (%rax)
1091 ; X86-LABEL: storeu8f16:
1093 ; X86-NEXT: vmovups %xmm0, g8f16u
1095 store <8 x half> %a, ptr @g8f16u, align 8
; Masked store/load intrinsics for <8 x half>, called by the masked
; store/load tests that follow.
1099 declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
1100 declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)
; Masked store of <8 x half> with a <8 x i1> vector mask argument. Unlike the
; wider variants, the checks expect the mask conversion to use word elements
; (vpsllw $15 + vpmovw2m) before the masked vmovdqu16 store.
1102 define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
1103 ; X64-LABEL: storeu8f16mask:
1105 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1106 ; X64-NEXT: vpmovw2m %xmm0, %k1
1107 ; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
1110 ; X86-LABEL: storeu8f16mask:
1112 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1113 ; X86-NEXT: vpmovw2m %xmm0, %k1
1114 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1115 ; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
1117 call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
; Masked load of <8 x half> with %val as the pass-through operand. The checks
; expect a merge-masked vmovdqu16 load into xmm0.
1121 define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
1122 ; X64-LABEL: maskloadu8f16:
1124 ; X64-NEXT: vpsllw $15, %xmm1, %xmm1
1125 ; X64-NEXT: vpmovw2m %xmm1, %k1
1126 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
1129 ; X86-LABEL: maskloadu8f16:
1131 ; X86-NEXT: vpsllw $15, %xmm1, %xmm1
1132 ; X86-NEXT: vpmovw2m %xmm1, %k1
1133 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1134 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
1136 %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
; Masked load of <8 x half> with an undef pass-through operand. The checks
; expect the zero-masked ({z}) form of vmovdqu16.
1140 define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
1141 ; X64-LABEL: maskuloadu8f16:
1143 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1144 ; X64-NEXT: vpmovw2m %xmm0, %k1
1145 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1148 ; X86-LABEL: maskuloadu8f16:
1150 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1151 ; X86-NEXT: vpmovw2m %xmm0, %k1
1152 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1153 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1155 %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
; Masked load of <8 x half> with a zeroinitializer pass-through operand. The
; checks expect the zero-masked ({z}) form of vmovdqu16.
1159 define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
1160 ; X64-LABEL: maskzloadu8f16:
1162 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1163 ; X64-NEXT: vpmovw2m %xmm0, %k1
1164 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1167 ; X86-LABEL: maskzloadu8f16:
1169 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1170 ; X86-NEXT: vpmovw2m %xmm0, %k1
1171 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1172 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1174 %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
; Register-to-register move of <8 x half>: the shared checks expect a single
; vmovaps from xmm1 to xmm0 (presumably the function returns %b; the IR body
; is not visible here, so confirm against the full file).
1178 define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
1179 ; CHECK-LABEL: movrr8f16:
1181 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
1182 ; CHECK-NEXT: ret{{[l|q]}}
; Select between two <8 x half> registers under a mask bitcast from i8. The
; checks expect the mask in k1 followed by a masked vpblendmw on xmm registers.
1186 define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
1187 ; X64-LABEL: movrrk8f16:
1189 ; X64-NEXT: kmovd %edi, %k1
1190 ; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1193 ; X86-LABEL: movrrk8f16:
1195 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1196 ; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1198 %mask = bitcast i8 %msk to <8 x i1>
1199 %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
; Select of a <8 x half> register against zero under an i8 mask. The checks
; expect a zero-masked ({z}) register vmovdqu16.
1203 define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
1204 ; X64-LABEL: movrrkz8f16:
1206 ; X64-NEXT: kmovd %edi, %k1
1207 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1210 ; X86-LABEL: movrrkz8f16:
1212 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1213 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1215 %mask = bitcast i8 %msk to <8 x i1>
1216 %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
; Two shuffles summed with fadd. %res1 permutes lanes of %a only and is
; expected to become vpshufb; %res2 takes lane 0 from %a and lanes 1-7 from %b
; and is expected to become a register vmovsh. The sum is expected as vaddph.
1220 define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
1221 ; CHECK-LABEL: movsh:
1223 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
1224 ; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
1225 ; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
1226 ; CHECK-NEXT: ret{{[l|q]}}
1227 %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
1228 %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1229 %res = fadd <8 x half> %res1, %res2
; Bitcast of a half argument to i16. The 64-bit run expects vmovw from xmm0 to
; eax; the 32-bit run expects a zero-extending 16-bit load from the stack.
1233 define i16 @test_movw(half %x) {
1234 ; X64-LABEL: test_movw:
1236 ; X64-NEXT: vmovw %xmm0, %eax
1237 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
1240 ; X86-LABEL: test_movw:
1242 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1244 %res = bitcast half %x to i16
; Bitcast of an i16 argument to half. The 64-bit run expects vmovw from edi to
; xmm0; the 32-bit run expects a vmovsh load from memory.
1248 define half @test_movw2(i16 %x) {
1249 ; X64-LABEL: test_movw2:
1251 ; X64-NEXT: vmovw %edi, %xmm0
1254 ; X86-LABEL: test_movw2:
1256 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1258 %res = bitcast i16 %x to half
1262 ; sext avoids having a truncate in front of the bitcast input due to calling
1263 ; convention or i16 op promotion.
; Sign-extends an i8 argument to i16 and bitcasts it to half. Both runs expect
; movsbl into eax followed by vmovw from eax to xmm0.
1264 define half @test_movw3(i8 %x) {
1265 ; X64-LABEL: test_movw3:
1267 ; X64-NEXT: movsbl %dil, %eax
1268 ; X64-NEXT: vmovw %eax, %xmm0
1271 ; X86-LABEL: test_movw3:
1273 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
1274 ; X86-NEXT: vmovw %eax, %xmm0
1276 %z = sext i8 %x to i16
1277 %a = bitcast i16 %z to half
; Extract of lane 0 from <8 x half>. The only visible shared check after the
; label is the return, i.e. no shuffle or shift instruction is expected.
1281 define half @extract_f16_0(<8 x half> %x) {
1282 ; CHECK-LABEL: extract_f16_0:
1284 ; CHECK-NEXT: ret{{[l|q]}}
1285 %res = extractelement <8 x half> %x, i32 0
; Extract of lane 1 from <8 x half>. The checks expect a 16-bit logical right
; shift of each dword (vpsrld $16) to bring the lane to position 0.
1289 define half @extract_f16_1(<8 x half> %x) {
1290 ; CHECK-LABEL: extract_f16_1:
1292 ; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0
1293 ; CHECK-NEXT: ret{{[l|q]}}
1294 %res = extractelement <8 x half> %x, i32 1
; Extract of lane 2 from <8 x half>. The checks expect vmovshdup, which moves
; dword 1 (half lanes 2-3) into dword 0.
1298 define half @extract_f16_2(<8 x half> %x) {
1299 ; CHECK-LABEL: extract_f16_2:
1301 ; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1302 ; CHECK-NEXT: ret{{[l|q]}}
1303 %res = extractelement <8 x half> %x, i32 2
; Extract of lane 3 from <8 x half>. The checks expect a 48-bit logical right
; shift of each qword (vpsrlq $48).
1307 define half @extract_f16_3(<8 x half> %x) {
1308 ; CHECK-LABEL: extract_f16_3:
1310 ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0
1311 ; CHECK-NEXT: ret{{[l|q]}}
1312 %res = extractelement <8 x half> %x, i32 3
; Extract of lane 4 from <8 x half>. The checks expect vshufpd swapping the
; two qwords so the upper half of the register lands in the low position.
1316 define half @extract_f16_4(<8 x half> %x) {
1317 ; CHECK-LABEL: extract_f16_4:
1319 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1320 ; CHECK-NEXT: ret{{[l|q]}}
1321 %res = extractelement <8 x half> %x, i32 4
; Extract of lane 5 from <8 x half>. The checks expect a byte shift right
; (vpsrldq) that leaves source bytes 10-15 at the bottom and zero-fills the rest.
1325 define half @extract_f16_5(<8 x half> %x) {
1326 ; CHECK-LABEL: extract_f16_5:
1328 ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1329 ; CHECK-NEXT: ret{{[l|q]}}
1330 %res = extractelement <8 x half> %x, i32 5
; Extract of lane 6 from <8 x half>. The checks expect vshufps broadcasting
; dword 3 (half lanes 6-7) to every dword.
1334 define half @extract_f16_6(<8 x half> %x) {
1335 ; CHECK-LABEL: extract_f16_6:
1337 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1338 ; CHECK-NEXT: ret{{[l|q]}}
1339 %res = extractelement <8 x half> %x, i32 6
; Extract of lane 7 from <8 x half>. The checks expect a byte shift right
; (vpsrldq) that leaves source bytes 14-15 at the bottom and zero-fills the rest.
1343 define half @extract_f16_7(<8 x half> %x) {
1344 ; CHECK-LABEL: extract_f16_7:
1346 ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1347 ; CHECK-NEXT: ret{{[l|q]}}
1348 %res = extractelement <8 x half> %x, i32 7
; Extract from a <32 x half> at a run-time index (%idx). Expected lowering:
; realign the stack to 64 bytes, spill the zmm register to the stack, mask the
; index with 31, and load the selected element with vmovsh.
1352 define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
1353 ; X64-LABEL: extract_f16_8:
1355 ; X64-NEXT: pushq %rbp
1356 ; X64-NEXT: movq %rsp, %rbp
1357 ; X64-NEXT: andq $-64, %rsp
1358 ; X64-NEXT: subq $128, %rsp
1359 ; X64-NEXT: andl $31, %edi
1360 ; X64-NEXT: vmovaps %zmm0, (%rsp)
1361 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1362 ; X64-NEXT: movq %rbp, %rsp
1363 ; X64-NEXT: popq %rbp
1364 ; X64-NEXT: vzeroupper
1367 ; X86-LABEL: extract_f16_8:
1369 ; X86-NEXT: pushl %ebp
1370 ; X86-NEXT: movl %esp, %ebp
1371 ; X86-NEXT: andl $-64, %esp
1372 ; X86-NEXT: subl $128, %esp
1373 ; X86-NEXT: movl 8(%ebp), %eax
1374 ; X86-NEXT: andl $31, %eax
1375 ; X86-NEXT: vmovaps %zmm0, (%esp)
1376 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1377 ; X86-NEXT: movl %ebp, %esp
1378 ; X86-NEXT: popl %ebp
1379 ; X86-NEXT: vzeroupper
1381 %res = extractelement <32 x half> %x, i64 %idx
; Extract from a <64 x half> (two zmm registers) at a run-time index (%idx).
; Expected lowering: realign the stack to 64 bytes, spill both zmm0 and zmm1,
; mask the index with 63, and load the selected element with vmovsh.
1385 define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
1386 ; X64-LABEL: extract_f16_9:
1388 ; X64-NEXT: pushq %rbp
1389 ; X64-NEXT: movq %rsp, %rbp
1390 ; X64-NEXT: andq $-64, %rsp
1391 ; X64-NEXT: subq $192, %rsp
1392 ; X64-NEXT: andl $63, %edi
1393 ; X64-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
1394 ; X64-NEXT: vmovaps %zmm0, (%rsp)
1395 ; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1396 ; X64-NEXT: movq %rbp, %rsp
1397 ; X64-NEXT: popq %rbp
1398 ; X64-NEXT: vzeroupper
1401 ; X86-LABEL: extract_f16_9:
1403 ; X86-NEXT: pushl %ebp
1404 ; X86-NEXT: movl %esp, %ebp
1405 ; X86-NEXT: andl $-64, %esp
1406 ; X86-NEXT: subl $192, %esp
1407 ; X86-NEXT: movl 8(%ebp), %eax
1408 ; X86-NEXT: andl $63, %eax
1409 ; X86-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp)
1410 ; X86-NEXT: vmovaps %zmm0, (%esp)
1411 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1412 ; X86-NEXT: movl %ebp, %esp
1413 ; X86-NEXT: popl %ebp
1414 ; X86-NEXT: vzeroupper
1416 %res = extractelement <64 x half> %x, i64 %idx
; Extract element 0 of an <8 x i16> into a GPR. Expected lowering: vmovw from
; xmm0 to eax (unlike the other lanes, which are checked for vpextrw).
1420 define i16 @extract_i16_0(<8 x i16> %x) {
1421 ; CHECK-LABEL: extract_i16_0:
1423 ; CHECK-NEXT: vmovw %xmm0, %eax
1424 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1425 ; CHECK-NEXT: ret{{[l|q]}}
1426 %res = extractelement <8 x i16> %x, i32 0
; Extract element 1 of an <8 x i16> into a GPR. Expected lowering: vpextrw $1.
1430 define i16 @extract_i16_1(<8 x i16> %x) {
1431 ; CHECK-LABEL: extract_i16_1:
1433 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
1434 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1435 ; CHECK-NEXT: ret{{[l|q]}}
1436 %res = extractelement <8 x i16> %x, i32 1
; Extract element 2 of an <8 x i16> into a GPR. Expected lowering: vpextrw $2.
1440 define i16 @extract_i16_2(<8 x i16> %x) {
1441 ; CHECK-LABEL: extract_i16_2:
1443 ; CHECK-NEXT: vpextrw $2, %xmm0, %eax
1444 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1445 ; CHECK-NEXT: ret{{[l|q]}}
1446 %res = extractelement <8 x i16> %x, i32 2
; Extract element 3 of an <8 x i16> into a GPR. Expected lowering: vpextrw $3.
1450 define i16 @extract_i16_3(<8 x i16> %x) {
1451 ; CHECK-LABEL: extract_i16_3:
1453 ; CHECK-NEXT: vpextrw $3, %xmm0, %eax
1454 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1455 ; CHECK-NEXT: ret{{[l|q]}}
1456 %res = extractelement <8 x i16> %x, i32 3
; Extract element 4 of an <8 x i16> into a GPR. Expected lowering: vpextrw $4.
1460 define i16 @extract_i16_4(<8 x i16> %x) {
1461 ; CHECK-LABEL: extract_i16_4:
1463 ; CHECK-NEXT: vpextrw $4, %xmm0, %eax
1464 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1465 ; CHECK-NEXT: ret{{[l|q]}}
1466 %res = extractelement <8 x i16> %x, i32 4
; Extract element 5 of an <8 x i16> into a GPR. Expected lowering: vpextrw $5.
1470 define i16 @extract_i16_5(<8 x i16> %x) {
1471 ; CHECK-LABEL: extract_i16_5:
1473 ; CHECK-NEXT: vpextrw $5, %xmm0, %eax
1474 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1475 ; CHECK-NEXT: ret{{[l|q]}}
1476 %res = extractelement <8 x i16> %x, i32 5
; Extract element 6 of an <8 x i16> into a GPR. Expected lowering: vpextrw $6.
1480 define i16 @extract_i16_6(<8 x i16> %x) {
1481 ; CHECK-LABEL: extract_i16_6:
1483 ; CHECK-NEXT: vpextrw $6, %xmm0, %eax
1484 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1485 ; CHECK-NEXT: ret{{[l|q]}}
1486 %res = extractelement <8 x i16> %x, i32 6
; Extract element 7 of an <8 x i16> into a GPR. Expected lowering: vpextrw $7.
1490 define i16 @extract_i16_7(<8 x i16> %x) {
1491 ; CHECK-LABEL: extract_i16_7:
1493 ; CHECK-NEXT: vpextrw $7, %xmm0, %eax
1494 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1495 ; CHECK-NEXT: ret{{[l|q]}}
1496 %res = extractelement <8 x i16> %x, i32 7
; Extract element 0 of an <8 x half> and store it to %y. Expected lowering: a
; single vmovsh store straight from xmm0 (the X86 run first loads the pointer
; from the stack).
1500 define void @extract_store_f16_0(<8 x half> %x, ptr %y) {
1501 ; X64-LABEL: extract_store_f16_0:
1503 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1506 ; X86-LABEL: extract_store_f16_0:
1508 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1509 ; X86-NEXT: vmovsh %xmm0, (%eax)
1511 %res = extractelement <8 x half> %x, i32 0
1512 store half %res, ptr %y
; Extract element 1 of an <8 x half> and store it to %y. Expected lowering:
; vpsrld $16 to bring the element to lane 0, then a vmovsh store.
1516 define void @extract_store_f16_1(<8 x half> %x, ptr %y) {
1517 ; X64-LABEL: extract_store_f16_1:
1519 ; X64-NEXT: vpsrld $16, %xmm0, %xmm0
1520 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1523 ; X86-LABEL: extract_store_f16_1:
1525 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1526 ; X86-NEXT: vpsrld $16, %xmm0, %xmm0
1527 ; X86-NEXT: vmovsh %xmm0, (%eax)
1529 %res = extractelement <8 x half> %x, i32 1
1530 store half %res, ptr %y
; Extract element 2 of an <8 x half> and store it to %y. Expected lowering:
; vmovshdup (xmm0[1,1,3,3]) followed by a vmovsh store.
1534 define void @extract_store_f16_2(<8 x half> %x, ptr %y) {
1535 ; X64-LABEL: extract_store_f16_2:
1537 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1538 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1541 ; X86-LABEL: extract_store_f16_2:
1543 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1544 ; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1545 ; X86-NEXT: vmovsh %xmm0, (%eax)
1547 %res = extractelement <8 x half> %x, i32 2
1548 store half %res, ptr %y
; Extract element 3 of an <8 x half> and store it to %y. Expected lowering:
; vpsrlq $48 followed by a vmovsh store.
1552 define void @extract_store_f16_3(<8 x half> %x, ptr %y) {
1553 ; X64-LABEL: extract_store_f16_3:
1555 ; X64-NEXT: vpsrlq $48, %xmm0, %xmm0
1556 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1559 ; X86-LABEL: extract_store_f16_3:
1561 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1562 ; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
1563 ; X86-NEXT: vmovsh %xmm0, (%eax)
1565 %res = extractelement <8 x half> %x, i32 3
1566 store half %res, ptr %y
; Extract element 4 of an <8 x half> and store it to %y. Expected lowering:
; vshufpd (xmm0[1,0]) followed by a vmovsh store.
1570 define void @extract_store_f16_4(<8 x half> %x, ptr %y) {
1571 ; X64-LABEL: extract_store_f16_4:
1573 ; X64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1574 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1577 ; X86-LABEL: extract_store_f16_4:
1579 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1580 ; X86-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1581 ; X86-NEXT: vmovsh %xmm0, (%eax)
1583 %res = extractelement <8 x half> %x, i32 4
1584 store half %res, ptr %y
; Extract element 5 of an <8 x half> and store it to %y. Expected lowering:
; vpsrldq moving bytes 10..15 to the bottom, followed by a vmovsh store.
1588 define void @extract_store_f16_5(<8 x half> %x, ptr %y) {
1589 ; X64-LABEL: extract_store_f16_5:
1591 ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1592 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1595 ; X86-LABEL: extract_store_f16_5:
1597 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1598 ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1599 ; X86-NEXT: vmovsh %xmm0, (%eax)
1601 %res = extractelement <8 x half> %x, i32 5
1602 store half %res, ptr %y
; Extract element 6 of an <8 x half> and store it to %y. Expected lowering:
; vshufps (xmm0[3,3,3,3]) followed by a vmovsh store.
1606 define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
1607 ; X64-LABEL: extract_store_f16_6:
1609 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1610 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1613 ; X86-LABEL: extract_store_f16_6:
1615 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1616 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1617 ; X86-NEXT: vmovsh %xmm0, (%eax)
1619 %res = extractelement <8 x half> %x, i32 6
1620 store half %res, ptr %y
; Extract element 7 of an <8 x half> and store it to %y. Expected lowering:
; vpsrldq moving bytes 14..15 to the bottom, followed by a vmovsh store.
1624 define void @extract_store_f16_7(<8 x half> %x, ptr %y) {
1625 ; X64-LABEL: extract_store_f16_7:
1627 ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1628 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1631 ; X86-LABEL: extract_store_f16_7:
1633 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1634 ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1635 ; X86-NEXT: vmovsh %xmm0, (%eax)
1637 %res = extractelement <8 x half> %x, i32 7
1638 store half %res, ptr %y
; Extract element 0 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $0 with a memory destination.
1642 define void @extract_store_i16_0(<8 x i16> %x, ptr %y) {
1643 ; X64-LABEL: extract_store_i16_0:
1645 ; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
1648 ; X86-LABEL: extract_store_i16_0:
1650 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1651 ; X86-NEXT: vpextrw $0, %xmm0, (%eax)
1653 %res = extractelement <8 x i16> %x, i32 0
1654 store i16 %res, ptr %y
; Extract element 1 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $1 with a memory destination.
1658 define void @extract_store_i16_1(<8 x i16> %x, ptr %y) {
1659 ; X64-LABEL: extract_store_i16_1:
1661 ; X64-NEXT: vpextrw $1, %xmm0, (%rdi)
1664 ; X86-LABEL: extract_store_i16_1:
1666 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1667 ; X86-NEXT: vpextrw $1, %xmm0, (%eax)
1669 %res = extractelement <8 x i16> %x, i32 1
1670 store i16 %res, ptr %y
; Extract element 2 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $2 with a memory destination.
1674 define void @extract_store_i16_2(<8 x i16> %x, ptr %y) {
1675 ; X64-LABEL: extract_store_i16_2:
1677 ; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
1680 ; X86-LABEL: extract_store_i16_2:
1682 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1683 ; X86-NEXT: vpextrw $2, %xmm0, (%eax)
1685 %res = extractelement <8 x i16> %x, i32 2
1686 store i16 %res, ptr %y
; Extract element 3 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $3 with a memory destination.
1690 define void @extract_store_i16_3(<8 x i16> %x, ptr %y) {
1691 ; X64-LABEL: extract_store_i16_3:
1693 ; X64-NEXT: vpextrw $3, %xmm0, (%rdi)
1696 ; X86-LABEL: extract_store_i16_3:
1698 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1699 ; X86-NEXT: vpextrw $3, %xmm0, (%eax)
1701 %res = extractelement <8 x i16> %x, i32 3
1702 store i16 %res, ptr %y
; Extract element 4 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $4 with a memory destination.
1706 define void @extract_store_i16_4(<8 x i16> %x, ptr %y) {
1707 ; X64-LABEL: extract_store_i16_4:
1709 ; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
1712 ; X86-LABEL: extract_store_i16_4:
1714 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1715 ; X86-NEXT: vpextrw $4, %xmm0, (%eax)
1717 %res = extractelement <8 x i16> %x, i32 4
1718 store i16 %res, ptr %y
; Extract element 5 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $5 with a memory destination.
1722 define void @extract_store_i16_5(<8 x i16> %x, ptr %y) {
1723 ; X64-LABEL: extract_store_i16_5:
1725 ; X64-NEXT: vpextrw $5, %xmm0, (%rdi)
1728 ; X86-LABEL: extract_store_i16_5:
1730 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1731 ; X86-NEXT: vpextrw $5, %xmm0, (%eax)
1733 %res = extractelement <8 x i16> %x, i32 5
1734 store i16 %res, ptr %y
; Extract element 6 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $6 with a memory destination.
1738 define void @extract_store_i16_6(<8 x i16> %x, ptr %y) {
1739 ; X64-LABEL: extract_store_i16_6:
1741 ; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
1744 ; X86-LABEL: extract_store_i16_6:
1746 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1747 ; X86-NEXT: vpextrw $6, %xmm0, (%eax)
1749 %res = extractelement <8 x i16> %x, i32 6
1750 store i16 %res, ptr %y
; Extract element 7 of an <8 x i16> and store it to %y. Expected lowering: a
; single vpextrw $7 with a memory destination.
1754 define void @extract_store_i16_7(<8 x i16> %x, ptr %y) {
1755 ; X64-LABEL: extract_store_i16_7:
1757 ; X64-NEXT: vpextrw $7, %xmm0, (%rdi)
1760 ; X86-LABEL: extract_store_i16_7:
1762 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1763 ; X86-NEXT: vpextrw $7, %xmm0, (%eax)
1765 %res = extractelement <8 x i16> %x, i32 7
1766 store i16 %res, ptr %y
; Extract element 0 of an <8 x i16> and zero-extend it to i32. Expected
; lowering: a single vpextrw $0 into eax, with no separate extension
; instruction checked before the return.
1770 define i32 @extract_zext_i16_0(<8 x i16> %x) {
1771 ; CHECK-LABEL: extract_zext_i16_0:
1773 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax
1774 ; CHECK-NEXT: ret{{[l|q]}}
1775 %res = extractelement <8 x i16> %x, i32 0
1776 %res2 = zext i16 %res to i32
; Extract element 1 of an <8 x i16> and zero-extend it to i32. Expected
; lowering: a single vpextrw $1 into eax, with no separate extension
; instruction checked before the return.
1780 define i32 @extract_zext_i16_1(<8 x i16> %x) {
1781 ; CHECK-LABEL: extract_zext_i16_1:
1783 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
1784 ; CHECK-NEXT: ret{{[l|q]}}
1785 %res = extractelement <8 x i16> %x, i32 1
1786 %res2 = zext i16 %res to i32
; Build an <8 x half> whose lanes 0-3 come from four scalar arguments and whose
; lanes 4-7 stay undef. Expected lowering: pair the scalars with vpunpcklwd and
; merge the two pairs with vinsertps. The X86 run first loads each scalar from
; memory with vmovsh.
1790 define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
1791 ; X64-LABEL: build_vector_xxxxuuuu:
1793 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1794 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1795 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
1798 ; X86-LABEL: build_vector_xxxxuuuu:
1800 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1801 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1802 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1803 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1804 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1805 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1806 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1808 %a = insertelement <8 x half> undef, half %a0, i32 0
1809 %b = insertelement <8 x half> %a, half %a1, i32 1
1810 %c = insertelement <8 x half> %b, half %a2, i32 2
1811 %d = insertelement <8 x half> %c, half %a3, i32 3
; Build an <8 x half> whose lanes 4-7 come from four scalar arguments and whose
; lanes 0-3 stay undef. Expected lowering: pair the scalars with vpunpcklwd,
; merge the pairs with vpunpckldq, then vpbroadcastq so the 64-bit group also
; occupies the upper half.
1815 define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
1816 ; X64-LABEL: build_vector_uuuuxxxx:
1818 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1819 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1820 ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1821 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0
1824 ; X86-LABEL: build_vector_uuuuxxxx:
1826 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1827 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1828 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1829 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1830 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1831 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1832 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1833 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0
1835 %a = insertelement <8 x half> undef, half %a0, i32 4
1836 %b = insertelement <8 x half> %a, half %a1, i32 5
1837 %c = insertelement <8 x half> %b, half %a2, i32 6
1838 %d = insertelement <8 x half> %c, half %a3, i32 7
; Build a fully defined <8 x half> from eight scalar arguments. Expected
; lowering: a tree of unpacks, vpunpcklwd on scalar pairs, vpunpckldq on the
; pairs, and a final vpunpcklqdq joining the two 64-bit halves.
1842 define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
1843 ; X64-LABEL: build_vector_xxxxxxxx:
1845 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1846 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1847 ; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
1848 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1849 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1850 ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1851 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
1854 ; X86-LABEL: build_vector_xxxxxxxx:
1856 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1857 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1858 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1859 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1860 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1861 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1862 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1863 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1864 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1865 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1866 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1867 ; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
1868 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1869 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1870 ; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1872 %a = insertelement <8 x half> undef, half %a0, i32 0
1873 %b = insertelement <8 x half> %a, half %a1, i32 1
1874 %c = insertelement <8 x half> %b, half %a2, i32 2
1875 %d = insertelement <8 x half> %c, half %a3, i32 3
1876 %e = insertelement <8 x half> %d, half %a4, i32 4
1877 %f = insertelement <8 x half> %e, half %a5, i32 5
1878 %g = insertelement <8 x half> %f, half %a6, i32 6
1879 %h = insertelement <8 x half> %g, half %a7, i32 7
; Build a <16 x half> with lanes 0-3 and 12-15 defined and lanes 4-11 undef.
; Expected lowering: build each 128-bit half separately (vinsertps for one
; group, vpunpckldq + vpbroadcastq for the other), then combine the halves
; with vinsertf128.
1883 define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
1884 ; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
1886 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1887 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1888 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
1889 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1890 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1891 ; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1892 ; X64-NEXT: vpbroadcastq %xmm1, %xmm1
1893 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1896 ; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
1898 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1899 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1900 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1901 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1902 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1903 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1904 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1905 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1906 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1907 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1908 ; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
1909 ; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
1910 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1911 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
1912 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0
1913 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1915 %a = insertelement <16 x half> undef, half %a0, i32 0
1916 %b = insertelement <16 x half> %a, half %a1, i32 1
1917 %c = insertelement <16 x half> %b, half %a2, i32 2
1918 %d = insertelement <16 x half> %c, half %a3, i32 3
1919 %e = insertelement <16 x half> %d, half %a4, i32 12
1920 %f = insertelement <16 x half> %e, half %a5, i32 13
1921 %g = insertelement <16 x half> %f, half %a6, i32 14
1922 %h = insertelement <16 x half> %g, half %a7, i32 15
; Regression test: a two-input <8 x half> shuffle whose mask only references
; lanes of %a (all indices < 8). Expected lowering: a single vpshufb on xmm0
; using the equivalent byte-level mask.
1926 define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
1927 ; CHECK-LABEL: regression1:
1929 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
1930 ; CHECK-NEXT: ret{{[l|q]}}
1931 %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
; Regression test: two adjacent i8 loads from %4 (the second through an
; addrspacecast to addrspace(4)) are converted to float, blended with a
; constant vector and scaled by a splat constant. Expected lowering: the two
; bytes are fetched with one 16-bit vmovw load, widened with vpmovzxbd,
; converted with vcvtdq2ps, blended from memory and multiplied with a
; broadcast ({1to4}) constant-pool operand.
1935 define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) {
1936 ; X64-LABEL: regression2:
1938 ; X64-NEXT: vmovw (%rsi), %xmm0
1939 ; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1940 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
1941 ; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
1942 ; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1945 ; X86-LABEL: regression2:
1947 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1948 ; X86-NEXT: vmovw (%eax), %xmm0
1949 ; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1950 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
1951 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
1952 ; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
1954 %6 = load i8, ptr %4, align 1
1955 %7 = getelementptr i8, ptr %4, i64 1
1956 %8 = addrspacecast ptr %7 to ptr addrspace(4)
1957 %9 = load i8, ptr addrspace(4) %8, align 1
1958 %10 = insertelement <2 x i8> poison, i8 %6, i32 0
1959 %11 = insertelement <2 x i8> %10, i8 %9, i32 1
1960 %12 = uitofp <2 x i8> %11 to <2 x float>
1961 %13 = shufflevector <2 x float> %12, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1962 %14 = shufflevector <4 x float> %13, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1963 %15 = fmul contract <4 x float> %14, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
1967 ; Make sure load/stores of v4f16 are handled well on 32-bit targets where
1968 ; default widening legalization can't use i64.
; Expected lowering on both targets: each <4 x half> operand is loaded as one
; 64-bit vmovsd, the add is done with a packed vaddph on xmm registers, and the
; result is stored with a single 64-bit vmovlps.
1969 define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
1970 ; X64-LABEL: load_store_v4f16:
1972 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1973 ; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1974 ; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0
1975 ; X64-NEXT: vmovlps %xmm0, (%rdx)
1978 ; X86-LABEL: load_store_v4f16:
1980 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1981 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1982 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1983 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1984 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1985 ; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0
1986 ; X86-NEXT: vmovlps %xmm0, (%eax)
1988 %a = load <4 x half>, ptr %x
1989 %b = load <4 x half>, ptr %y
1990 %c = fadd <4 x half> %a, %b
1991 store <4 x half> %c, ptr %z
; Insert three scalar halves into lanes 0-2 of a constant vector whose lanes
; 3-7 are zero (lanes 0-2 start as poison). Expected lowering: unpack the
; scalars together (vpunpcklwd / vpunpckldq) and fill the upper half from a
; zeroed, broadcast register via vshufps.
1995 define <8 x half> @test21(half %a, half %b, half %c) nounwind {
1996 ; X64-LABEL: test21:
1998 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
1999 ; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2
2000 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2001 ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2002 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
2003 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1
2004 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
2007 ; X86-LABEL: test21:
2009 ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
2010 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
2011 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2012 ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
2013 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2014 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
2015 ; X86-NEXT: vpbroadcastw %xmm1, %xmm1
2016 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
2018 %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0
2019 %2 = insertelement <8 x half> %1, half %b, i32 1
2020 %3 = insertelement <8 x half> %2, half %c, i32 2
; Two i16 loads (one from the null pointer, one from %mem) feed a scalar that
; is inserted into lane 0 of a <16 x i16> whose remaining lanes are zero.
; Expected lowering: movzwl from address 0, andw with the second load, then a
; single vmovw from the GPR into xmm0.
2024 define <16 x i16> @test22(ptr %mem) nounwind {
2025 ; X64-LABEL: test22:
2027 ; X64-NEXT: movzwl 0, %eax
2028 ; X64-NEXT: andw (%rdi), %ax
2029 ; X64-NEXT: vmovw %eax, %xmm0
2032 ; X86-LABEL: test22:
2034 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2035 ; X86-NEXT: movzwl 0, %ecx
2036 ; X86-NEXT: andw (%eax), %cx
2037 ; X86-NEXT: vmovw %ecx, %xmm0
2039 %1 = load i16, ptr null, align 2
2040 %2 = load i16, ptr %mem, align 2
2042 %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0
; Sign-extends %0 into lane 0 of a <2 x i16>, compares the vector against zero
; (sgt), uses the mask to select between %1 and a constant, and branches on
; whether lane 0 of the result is zero; the non-zero path stores 0 to %c.
; Expected lowering: movsbl + vmovw to materialise the lane, a mask-register
; compare (vpcmpgtw into %k1), a zero-masked vmovdqu16, then vmovw back to a
; GPR for the test/branch.
2046 define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
2047 ; X64-LABEL: pr52560:
2048 ; X64: # %bb.0: # %entry
2049 ; X64-NEXT: movsbl %dil, %eax
2050 ; X64-NEXT: vmovw %eax, %xmm1
2051 ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
2052 ; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
2053 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
2054 ; X64-NEXT: vmovw %xmm0, %eax
2055 ; X64-NEXT: testw %ax, %ax
2056 ; X64-NEXT: je .LBB123_2
2057 ; X64-NEXT: # %bb.1: # %for.body.preheader
2058 ; X64-NEXT: movb $0, (%rsi)
2059 ; X64-NEXT: .LBB123_2: # %for.end
2062 ; X86-LABEL: pr52560:
2063 ; X86: # %bb.0: # %entry
2064 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
2065 ; X86-NEXT: vmovw %eax, %xmm1
2066 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
2067 ; X86-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
2068 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
2069 ; X86-NEXT: vmovw %xmm0, %eax
2070 ; X86-NEXT: testw %ax, %ax
2071 ; X86-NEXT: je .LBB123_2
2072 ; X86-NEXT: # %bb.1: # %for.body.preheader
2073 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2074 ; X86-NEXT: movb $0, (%eax)
2075 ; X86-NEXT: .LBB123_2: # %for.end
2078 %conv = sext i8 %0 to i16
2079 %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
2080 %3 = icmp sgt <2 x i16> %2, zeroinitializer
2081 %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison>
2082 %5 = extractelement <2 x i16> %4, i32 0
2083 %tobool.not14 = icmp eq i16 %5, 0
2084 br i1 %tobool.not14, label %for.end, label %for.body.preheader
2086 for.body.preheader: ; preds = %entry
2087 store i8 0, ptr %c, align 1
2090 for.end: ; preds = %for.body.preheader, %entry
; <16 x i32> arithmetic compiled with "min-legal-vector-width"="256" and
; "prefer-vector-width"="256", so the checks expect the work to be done on
; pairs of ymm registers. The function adds a splat of 112, adds %b, and ANDs
; with a constant mask. Expected lowering: vpaddd on ymm halves, a vpand with a
; constant-pool operand for one half, and a vpxor + vmovsh sequence for the
; other.
2094 define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
2095 ; X64-LABEL: pr52561:
2097 ; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm1
2098 ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
2099 ; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
2100 ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
2101 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
2102 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2103 ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
2104 ; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0
2107 ; X86-LABEL: pr52561:
2109 ; X86-NEXT: pushl %ebp
2110 ; X86-NEXT: movl %esp, %ebp
2111 ; X86-NEXT: andl $-32, %esp
2112 ; X86-NEXT: subl $32, %esp
2113 ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
2114 ; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1
2115 ; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
2116 ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
2117 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
2118 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
2119 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
2120 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0
2121 ; X86-NEXT: movl %ebp, %esp
2122 ; X86-NEXT: popl %ebp
2124 %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
2125 %2 = add <16 x i32> %1, %b
2126 %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535>
2130 define <8 x i16> @pr59628_xmm(i16 %arg) {
2131 ; X64-LABEL: pr59628_xmm:
2133 ; X64-NEXT: vmovw %edi, %xmm0
2134 ; X64-NEXT: vpbroadcastw %edi, %xmm1
2135 ; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
2136 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
2139 ; X86-LABEL: pr59628_xmm:
2141 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2142 ; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
2143 ; X86-NEXT: vpbroadcastw %eax, %xmm1
2144 ; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0
2145 ; X86-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
2146 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
2148 %I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0
2149 %I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg