; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86
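
; This file tests basic FP16 (half) move and broadcast patterns with AVX512FP16:
; vector broadcasts, scalar moves, plain and masked loads/stores, element
; extracts and inserts, and build_vector lowering on 64-bit and 32-bit targets.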

define <8 x half> @broadcastph128(half* %x) {
; X64-LABEL: broadcastph128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %xmm0
; X86-NEXT:    retl
  %l1 = load half, half* %x, align 2
  %vec = insertelement <8 x half> undef, half %l1, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256(half* %x) {
; X64-LABEL: broadcastph256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %ymm0
; X86-NEXT:    retl
  %l1 = load half, half* %x, align 2
  %vec = insertelement <16 x half> undef, half %l1, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512(half* %x) {
; X64-LABEL: broadcastph512:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph512:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %zmm0
; X86-NEXT:    retl
  %l1 = load half, half* %x, align 2
  %vec = insertelement <32 x half> undef, half %l1, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_scalar(half %x) {
; X64-LABEL: broadcastph128_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph128_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %vec = insertelement <8 x half> undef, half %x, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_scalar(half %x) {
; X64-LABEL: broadcastph256_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph256_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
  %vec = insertelement <16 x half> undef, half %x, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_scalar(half %x) {
; X64-LABEL: broadcastph512_scalar:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: broadcastph512_scalar:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
  %vec = insertelement <32 x half> undef, half %x, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_reg(<8 x half> %x) {
; CHECK-LABEL: broadcastph128_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_reg(<16 x half> %x) {
; CHECK-LABEL: broadcastph256_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_reg(<32 x half> %x) {
; CHECK-LABEL: broadcastph512_reg:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define i16 @test1(half %x) {
; X64-LABEL: test1:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %xmm0, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X86-LABEL: test1:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    retl
  %res = bitcast half %x to i16
  ret i16 %res
}

define <8 x i16> @test2(i16 %x) {
; X64-LABEL: test2:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test2:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <8 x i16>undef, i16 %x, i32 0
  ret <8 x i16> %res
}

define <8 x i16> @test4(i16* %x) {
; X64-LABEL: test4:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %xmm0
; X86-NEXT:    retl
  %y = load i16, i16* %x
  %res = insertelement <8 x i16>undef, i16 %y, i32 0
  ret <8 x i16> %res
}

define void @test5(half %x, half* %y) {
; X64-LABEL: test5:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: test5:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
  store half %x, half* %y, align 2
  ret void
}

define half @test7(i16* %x) {
; X64-LABEL: test7:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test7:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh (%eax), %xmm0
; X86-NEXT:    retl
  %y = load i16, i16* %x
  %res = bitcast i16 %y to half
  ret half %res
}

define <8 x i16> @test10(i16* %x) {
; X64-LABEL: test10:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
  %y = load i16, i16* %x, align 2
  %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
  ret <8 x i16> %res
}

define <16 x i16> @test10b(i16* %x) {
; X64-LABEL: test10b:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10b:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
  %y = load i16, i16* %x, align 2
  %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
  ret <16 x i16> %res
}

define <32 x i16> @test10c(i16* %x) {
; X64-LABEL: test10c:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test10c:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    retl
  %y = load i16, i16* %x, align 2
  %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
  ret <32 x i16> %res
}

define <8 x half> @test11(half* %x) {
; X64-LABEL: test11:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test11:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh (%eax), %xmm0
; X86-NEXT:    retl
  %y = load half, half* %x, align 2
  %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
  ret <8 x half> %res
}

define <16 x half> @test11b(half* %x) {
; X64-LABEL: test11b:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test11b:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh (%eax), %xmm0
; X86-NEXT:    retl
  %y = load half, half* %x, align 2
  %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
  ret <16 x half> %res
}

define <32 x half> @test11c(half* %x) {
; X64-LABEL: test11c:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh (%rdi), %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test11c:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh (%eax), %xmm0
; X86-NEXT:    retl
  %y = load half, half* %x, align 2
  %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
  ret <32 x half> %res
}

define <8 x half> @test14(half %x) {
; X64-LABEL: test14:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
  ret <8 x half> %res
}

define <16 x half> @test14b(half %x) {
; X64-LABEL: test14b:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14b:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
  ret <16 x half> %res
}

define <32 x half> @test14c(half %x) {
; X64-LABEL: test14c:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test14c:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
  ret <32 x half> %res
}

define <8 x i16> @test15(i16 %x) {
; X64-LABEL: test15:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test15:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
  ret <8 x i16> %res
}

define <16 x i16> @test16(i16 %x) {
; X64-LABEL: test16:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test16:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
  ret <16 x i16> %res
}

define <32 x i16> @test17(i16 %x) {
; X64-LABEL: test17:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test17:
; X86:       # %bb.0:
; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
  ret <32 x i16> %res
}

define <8 x i16> @test18(i16 %x) {
; X64-LABEL: test18:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test18:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
  %res = insertelement <8 x i16> undef, i16 %x, i32 0
  ret <8 x i16> %res
}

define <16 x i16> @test19(i16 %x) {
; X64-LABEL: test19:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test19:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
  %res = insertelement <16 x i16> undef, i16 %x, i32 0
  ret <16 x i16> %res
}

define <32 x i16> @test20(i16 %x) {
; X64-LABEL: test20:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test20:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
  %res = insertelement <32 x i16> undef, i16 %x, i32 0
  ret <32 x i16> %res
}

@g8f16 = external global <8 x half>
@g8f16u = external global <8 x half>, align 8
@g16f16 = external global <16 x half>
@g16f16u = external global <16 x half>, align 8
@g32f16 = external global <32 x half>
@g32f16u = external global <32 x half>, align 8
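
; The globals above are referenced by the aligned and unaligned (align 8)
; store tests below.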

define <32 x half> @load32f16(<32 x half>* %a) {
; X64-LABEL: load32f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: load32f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %zmm0
; X86-NEXT:    retl
  %res = load <32 x half>, <32 x half>* %a
  ret <32 x half> %res
}

define <32 x half> @load32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) {
; X64-LABEL: load32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: load32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, <32 x half>* %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @load32f16maskz(<32 x half>* %a, i32 %c) {
; X64-LABEL: load32f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: load32f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, <32 x half>* %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <32 x half> @loadu32f16(<32 x half>* %a) {
; X64-LABEL: loadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %zmm0
; X86-NEXT:    retl
  %res = load <32 x half>, <32 x half>* %a, align 8
  ret <32 x half> %res
}

define <32 x half> @loadu32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) {
; X64-LABEL: loadu32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, <32 x half>* %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @loadu32f16maskz(<32 x half>* %a, i32 %c) {
; X64-LABEL: loadu32f16maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: loadu32f16maskz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, <32 x half>* %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define void @store32f16(<32 x half> %a) {
; X64-LABEL: store32f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g32f16@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovaps %zmm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: store32f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps %zmm0, g32f16
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <32 x half> %a, <32 x half>* @g32f16
  ret void
}

define void @storeu32f16(<32 x half> %a) {
; X64-LABEL: storeu32f16:
; X64:       # %bb.0:
; X64-NEXT:    movq g32f16u@GOTPCREL(%rip), %rax
; X64-NEXT:    vmovups %zmm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovups %zmm0, g32f16u
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  store <32 x half> %a, <32 x half>* @g32f16u, align 8
  ret void
}

declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)

define void @storeu32f16mask(<32 x i1> %mask, <32 x half>* %addr, <32 x half> %val) {
; X64-LABEL: storeu32f16mask:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: storeu32f16mask:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %addr, i32 4, <32 x i1>%mask)
  ret void
}

define <32 x half> @maskloadu32f16(<32 x half>* %addr, <32 x half> %val, <32 x i1> %mask) {
; X64-LABEL: maskloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm1, %ymm1
; X64-NEXT:    vpmovb2m %ymm1, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: maskloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm1, %ymm1
; X86-NEXT:    vpmovb2m %ymm1, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
  ret <32 x half> %res
}

define <32 x half> @maskuloadu32f16(<32 x half>* %addr, <32 x i1> %mask) {
; X64-LABEL: maskuloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskuloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
  ret <32 x half> %res
}

define <32 x half> @maskzloadu32f16(<32 x half>* %addr, <32 x i1> %mask) {
; X64-LABEL: maskzloadu32f16:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
; X64-NEXT:    vpmovb2m %ymm0, %k1
; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: maskzloadu32f16:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
; X86-NEXT:    vpmovb2m %ymm0, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
  ret <32 x half> %res
}

define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: movrr32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <32 x half> %b
}

define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
; X64-LABEL: movrrk32f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: movrrk32f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
; X64-LABEL: movrrkz32f16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: movrrkz32f16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %res
}
700 define <16 x half> @load16f16(<16 x half>* %a) {
701 ; X64-LABEL: load16f16:
703 ; X64-NEXT: vmovaps (%rdi), %ymm0
706 ; X86-LABEL: load16f16:
708 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
709 ; X86-NEXT: vmovaps (%eax), %ymm0
711 %res = load <16 x half>, <16 x half>* %a
715 define <16 x half> @load16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) {
716 ; X64-LABEL: load16f16mask:
718 ; X64-NEXT: kmovd %esi, %k1
719 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
722 ; X86-LABEL: load16f16mask:
724 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
725 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
726 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
728 %msk = bitcast i16 %c to <16 x i1>
729 %res0 = load <16 x half>, <16 x half>* %a
730 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
734 define <16 x half> @load16f16maskz(<16 x half>* %a, i16 %c) {
735 ; X64-LABEL: load16f16maskz:
737 ; X64-NEXT: kmovd %esi, %k1
738 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
741 ; X86-LABEL: load16f16maskz:
743 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
744 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
745 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
747 %msk = bitcast i16 %c to <16 x i1>
748 %res0 = load <16 x half>, <16 x half>* %a
749 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
753 define <16 x half> @loadu16f16(<16 x half>* %a) {
754 ; X64-LABEL: loadu16f16:
756 ; X64-NEXT: vmovups (%rdi), %ymm0
759 ; X86-LABEL: loadu16f16:
761 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
762 ; X86-NEXT: vmovups (%eax), %ymm0
764 %res = load <16 x half>, <16 x half>* %a, align 8
768 define <16 x half> @loadu16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) {
769 ; X64-LABEL: loadu16f16mask:
771 ; X64-NEXT: kmovd %esi, %k1
772 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
775 ; X86-LABEL: loadu16f16mask:
777 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
778 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
779 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
781 %msk = bitcast i16 %c to <16 x i1>
782 %res0 = load <16 x half>, <16 x half>* %a, align 8
783 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
787 define <16 x half> @loadu16f16maskz(<16 x half>* %a, i16 %c) {
788 ; X64-LABEL: loadu16f16maskz:
790 ; X64-NEXT: kmovd %esi, %k1
791 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
794 ; X86-LABEL: loadu16f16maskz:
796 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
797 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
798 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
800 %msk = bitcast i16 %c to <16 x i1>
801 %res0 = load <16 x half>, <16 x half>* %a, align 8
802 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
806 define void @store16f16(<16 x half> %a) {
807 ; X64-LABEL: store16f16:
809 ; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax
810 ; X64-NEXT: vmovaps %ymm0, (%rax)
811 ; X64-NEXT: vzeroupper
814 ; X86-LABEL: store16f16:
816 ; X86-NEXT: vmovaps %ymm0, g16f16
817 ; X86-NEXT: vzeroupper
819 store <16 x half> %a, <16 x half>* @g16f16
823 define void @storeu16f16(<16 x half> %a) {
824 ; X64-LABEL: storeu16f16:
826 ; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax
827 ; X64-NEXT: vmovups %ymm0, (%rax)
828 ; X64-NEXT: vzeroupper
831 ; X86-LABEL: storeu16f16:
833 ; X86-NEXT: vmovups %ymm0, g16f16u
834 ; X86-NEXT: vzeroupper
836 store <16 x half> %a, <16 x half>* @g16f16u, align 8
840 declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>)
841 declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)
843 define void @storeu16f16mask(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
844 ; X64-LABEL: storeu16f16mask:
846 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
847 ; X64-NEXT: vpmovb2m %xmm0, %k1
848 ; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
849 ; X64-NEXT: vzeroupper
852 ; X86-LABEL: storeu16f16mask:
854 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
855 ; X86-NEXT: vpmovb2m %xmm0, %k1
856 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
857 ; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
858 ; X86-NEXT: vzeroupper
860 call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
864 define <16 x half> @maskloadu16f16(<16 x half>* %addr, <16 x half> %val, <16 x i1> %mask) {
865 ; X64-LABEL: maskloadu16f16:
867 ; X64-NEXT: vpsllw $7, %xmm1, %xmm1
868 ; X64-NEXT: vpmovb2m %xmm1, %k1
869 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
872 ; X86-LABEL: maskloadu16f16:
874 ; X86-NEXT: vpsllw $7, %xmm1, %xmm1
875 ; X86-NEXT: vpmovb2m %xmm1, %k1
876 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
877 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
879 %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
883 define <16 x half> @maskuloadu16f16(<16 x half>* %addr, <16 x i1> %mask) {
884 ; X64-LABEL: maskuloadu16f16:
886 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
887 ; X64-NEXT: vpmovb2m %xmm0, %k1
888 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
891 ; X86-LABEL: maskuloadu16f16:
893 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
894 ; X86-NEXT: vpmovb2m %xmm0, %k1
895 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
896 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
898 %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
902 define <16 x half> @maskzloadu16f16(<16 x half>* %addr, <16 x i1> %mask) {
903 ; X64-LABEL: maskzloadu16f16:
905 ; X64-NEXT: vpsllw $7, %xmm0, %xmm0
906 ; X64-NEXT: vpmovb2m %xmm0, %k1
907 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
910 ; X86-LABEL: maskzloadu16f16:
912 ; X86-NEXT: vpsllw $7, %xmm0, %xmm0
913 ; X86-NEXT: vpmovb2m %xmm0, %k1
914 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
915 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
917 %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
921 define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
922 ; CHECK-LABEL: movrr16f16:
924 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
925 ; CHECK-NEXT: ret{{[l|q]}}
929 define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
930 ; X64-LABEL: movrrk16f16:
932 ; X64-NEXT: kmovd %edi, %k1
933 ; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
936 ; X86-LABEL: movrrk16f16:
938 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
939 ; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
941 %mask = bitcast i16 %msk to <16 x i1>
942 %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
946 define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
947 ; X64-LABEL: movrrkz16f16:
949 ; X64-NEXT: kmovd %edi, %k1
950 ; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
953 ; X86-LABEL: movrrkz16f16:
955 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
956 ; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
958 %mask = bitcast i16 %msk to <16 x i1>
959 %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
963 define <8 x half> @load8f16(<8 x half>* %a) {
964 ; X64-LABEL: load8f16:
966 ; X64-NEXT: vmovaps (%rdi), %xmm0
969 ; X86-LABEL: load8f16:
971 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
972 ; X86-NEXT: vmovaps (%eax), %xmm0
974 %res = load <8 x half>, <8 x half>* %a
978 define <8 x half> @load8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) {
979 ; X64-LABEL: load8f16mask:
981 ; X64-NEXT: kmovd %esi, %k1
982 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
985 ; X86-LABEL: load8f16mask:
987 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
988 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
989 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
991 %msk = bitcast i8 %c to <8 x i1>
992 %res0 = load <8 x half>, <8 x half>* %a
993 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
997 define <8 x half> @load8f16maskz(<8 x half>* %a, i8 %c) {
998 ; X64-LABEL: load8f16maskz:
1000 ; X64-NEXT: kmovd %esi, %k1
1001 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1004 ; X86-LABEL: load8f16maskz:
1006 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1007 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1008 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1010 %msk = bitcast i8 %c to <8 x i1>
1011 %res0 = load <8 x half>, <8 x half>* %a
1012 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
1016 define <8 x half> @loadu8f16(<8 x half>* %a) {
1017 ; X64-LABEL: loadu8f16:
1019 ; X64-NEXT: vmovups (%rdi), %xmm0
1022 ; X86-LABEL: loadu8f16:
1024 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1025 ; X86-NEXT: vmovups (%eax), %xmm0
1027 %res = load <8 x half>, <8 x half>* %a, align 8
1031 define <8 x half> @loadu8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) {
1032 ; X64-LABEL: loadu8f16mask:
1034 ; X64-NEXT: kmovd %esi, %k1
1035 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
1038 ; X86-LABEL: loadu8f16mask:
1040 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1041 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1042 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
1044 %msk = bitcast i8 %c to <8 x i1>
1045 %res0 = load <8 x half>, <8 x half>* %a, align 8
1046 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
1050 define <8 x half> @loadu8f16maskz(<8 x half>* %a, i8 %c) {
1051 ; X64-LABEL: loadu8f16maskz:
1053 ; X64-NEXT: kmovd %esi, %k1
1054 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1057 ; X86-LABEL: loadu8f16maskz:
1059 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1060 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1061 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1063 %msk = bitcast i8 %c to <8 x i1>
1064 %res0 = load <8 x half>, <8 x half>* %a, align 8
1065 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
1069 define void @store8f16(<8 x half> %a) {
1070 ; X64-LABEL: store8f16:
1072 ; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
1073 ; X64-NEXT: vmovaps %xmm0, (%rax)
1076 ; X86-LABEL: store8f16:
1078 ; X86-NEXT: vmovaps %xmm0, g8f16
1080 store <8 x half> %a, <8 x half>* @g8f16
1084 define void @storeu8f16(<8 x half> %a) {
1085 ; X64-LABEL: storeu8f16:
1087 ; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
1088 ; X64-NEXT: vmovups %xmm0, (%rax)
1091 ; X86-LABEL: storeu8f16:
1093 ; X86-NEXT: vmovups %xmm0, g8f16u
1095 store <8 x half> %a, <8 x half>* @g8f16u, align 8
1099 declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
1100 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
1102 define void @storeu8f16mask(<8 x i1> %mask, <8 x half>* %addr, <8 x half> %val) {
1103 ; X64-LABEL: storeu8f16mask:
1105 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1106 ; X64-NEXT: vpmovw2m %xmm0, %k1
1107 ; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
1110 ; X86-LABEL: storeu8f16mask:
1112 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1113 ; X86-NEXT: vpmovw2m %xmm0, %k1
1114 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1115 ; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
1117 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %addr, i32 4, <8 x i1>%mask)
1121 define <8 x half> @maskloadu8f16(<8 x half>* %addr, <8 x half> %val, <8 x i1> %mask) {
1122 ; X64-LABEL: maskloadu8f16:
1124 ; X64-NEXT: vpsllw $15, %xmm1, %xmm1
1125 ; X64-NEXT: vpmovw2m %xmm1, %k1
1126 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
1129 ; X86-LABEL: maskloadu8f16:
1131 ; X86-NEXT: vpsllw $15, %xmm1, %xmm1
1132 ; X86-NEXT: vpmovw2m %xmm1, %k1
1133 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1134 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
1136 %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
1140 define <8 x half> @maskuloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
1141 ; X64-LABEL: maskuloadu8f16:
1143 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1144 ; X64-NEXT: vpmovw2m %xmm0, %k1
1145 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1148 ; X86-LABEL: maskuloadu8f16:
1150 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1151 ; X86-NEXT: vpmovw2m %xmm0, %k1
1152 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1153 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1155 %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
1159 define <8 x half> @maskzloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
1160 ; X64-LABEL: maskzloadu8f16:
1162 ; X64-NEXT: vpsllw $15, %xmm0, %xmm0
1163 ; X64-NEXT: vpmovw2m %xmm0, %k1
1164 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1167 ; X86-LABEL: maskzloadu8f16:
1169 ; X86-NEXT: vpsllw $15, %xmm0, %xmm0
1170 ; X86-NEXT: vpmovw2m %xmm0, %k1
1171 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1172 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
1174 %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
1178 define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
1179 ; CHECK-LABEL: movrr8f16:
1181 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
1182 ; CHECK-NEXT: ret{{[l|q]}}
1186 define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
1187 ; X64-LABEL: movrrk8f16:
1189 ; X64-NEXT: kmovd %edi, %k1
1190 ; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1193 ; X86-LABEL: movrrk8f16:
1195 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1196 ; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1198 %mask = bitcast i8 %msk to <8 x i1>
1199 %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
1203 define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
1204 ; X64-LABEL: movrrkz8f16:
1206 ; X64-NEXT: kmovd %edi, %k1
1207 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1210 ; X86-LABEL: movrrkz8f16:
1212 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
1213 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1215 %mask = bitcast i8 %msk to <8 x i1>
1216 %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
1220 define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
1221 ; CHECK-LABEL: movsh:
1223 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
1224 ; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
1225 ; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
1226 ; CHECK-NEXT: ret{{[l|q]}}
1227 %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
1228 %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1229 %res = fadd <8 x half> %res1, %res2
1233 define i16 @test_movw(half %x) {
1234 ; X64-LABEL: test_movw:
1236 ; X64-NEXT: vmovw %xmm0, %eax
1237 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
1240 ; X86-LABEL: test_movw:
1242 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1244 %res = bitcast half %x to i16
1248 define half @test_movw2(i16 %x) {
1249 ; X64-LABEL: test_movw2:
1251 ; X64-NEXT: vmovw %edi, %xmm0
1254 ; X86-LABEL: test_movw2:
1256 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
1258 %res = bitcast i16 %x to half
1262 ; sext avoids having a truncate in front of the bitcast input due to calling
1263 ; convention or i16 op promotion.
1264 define half @test_movw3(i8 %x) {
1265 ; X64-LABEL: test_movw3:
1267 ; X64-NEXT: movsbl %dil, %eax
1268 ; X64-NEXT: vmovw %eax, %xmm0
1271 ; X86-LABEL: test_movw3:
1273 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
1274 ; X86-NEXT: vmovw %eax, %xmm0
1276 %z = sext i8 %x to i16
1277 %a = bitcast i16 %z to half
1281 define half @extract_f16_0(<8 x half> %x) {
1282 ; CHECK-LABEL: extract_f16_0:
1284 ; CHECK-NEXT: ret{{[l|q]}}
1285 %res = extractelement <8 x half> %x, i32 0
1289 define half @extract_f16_1(<8 x half> %x) {
1290 ; CHECK-LABEL: extract_f16_1:
1292 ; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0
1293 ; CHECK-NEXT: ret{{[l|q]}}
1294 %res = extractelement <8 x half> %x, i32 1
1298 define half @extract_f16_2(<8 x half> %x) {
1299 ; CHECK-LABEL: extract_f16_2:
1301 ; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1302 ; CHECK-NEXT: ret{{[l|q]}}
1303 %res = extractelement <8 x half> %x, i32 2
1307 define half @extract_f16_3(<8 x half> %x) {
1308 ; CHECK-LABEL: extract_f16_3:
1310 ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0
1311 ; CHECK-NEXT: ret{{[l|q]}}
1312 %res = extractelement <8 x half> %x, i32 3
1316 define half @extract_f16_4(<8 x half> %x) {
1317 ; CHECK-LABEL: extract_f16_4:
1319 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1320 ; CHECK-NEXT: ret{{[l|q]}}
1321 %res = extractelement <8 x half> %x, i32 4
1325 define half @extract_f16_5(<8 x half> %x) {
1326 ; CHECK-LABEL: extract_f16_5:
1328 ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1329 ; CHECK-NEXT: ret{{[l|q]}}
1330 %res = extractelement <8 x half> %x, i32 5
1334 define half @extract_f16_6(<8 x half> %x) {
1335 ; CHECK-LABEL: extract_f16_6:
1337 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1338 ; CHECK-NEXT: ret{{[l|q]}}
1339 %res = extractelement <8 x half> %x, i32 6
1343 define half @extract_f16_7(<8 x half> %x) {
1344 ; CHECK-LABEL: extract_f16_7:
1346 ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1347 ; CHECK-NEXT: ret{{[l|q]}}
1348 %res = extractelement <8 x half> %x, i32 7
1352 define i16 @extract_i16_0(<8 x i16> %x) {
1353 ; CHECK-LABEL: extract_i16_0:
1355 ; CHECK-NEXT: vmovw %xmm0, %eax
1356 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1357 ; CHECK-NEXT: ret{{[l|q]}}
1358 %res = extractelement <8 x i16> %x, i32 0
1362 define i16 @extract_i16_1(<8 x i16> %x) {
1363 ; CHECK-LABEL: extract_i16_1:
1365 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
1366 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1367 ; CHECK-NEXT: ret{{[l|q]}}
1368 %res = extractelement <8 x i16> %x, i32 1
1372 define i16 @extract_i16_2(<8 x i16> %x) {
1373 ; CHECK-LABEL: extract_i16_2:
1375 ; CHECK-NEXT: vpextrw $2, %xmm0, %eax
1376 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1377 ; CHECK-NEXT: ret{{[l|q]}}
1378 %res = extractelement <8 x i16> %x, i32 2
1382 define i16 @extract_i16_3(<8 x i16> %x) {
1383 ; CHECK-LABEL: extract_i16_3:
1385 ; CHECK-NEXT: vpextrw $3, %xmm0, %eax
1386 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1387 ; CHECK-NEXT: ret{{[l|q]}}
1388 %res = extractelement <8 x i16> %x, i32 3
1392 define i16 @extract_i16_4(<8 x i16> %x) {
1393 ; CHECK-LABEL: extract_i16_4:
1395 ; CHECK-NEXT: vpextrw $4, %xmm0, %eax
1396 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1397 ; CHECK-NEXT: ret{{[l|q]}}
1398 %res = extractelement <8 x i16> %x, i32 4
1402 define i16 @extract_i16_5(<8 x i16> %x) {
1403 ; CHECK-LABEL: extract_i16_5:
1405 ; CHECK-NEXT: vpextrw $5, %xmm0, %eax
1406 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1407 ; CHECK-NEXT: ret{{[l|q]}}
1408 %res = extractelement <8 x i16> %x, i32 5
1412 define i16 @extract_i16_6(<8 x i16> %x) {
1413 ; CHECK-LABEL: extract_i16_6:
1415 ; CHECK-NEXT: vpextrw $6, %xmm0, %eax
1416 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1417 ; CHECK-NEXT: ret{{[l|q]}}
1418 %res = extractelement <8 x i16> %x, i32 6
1422 define i16 @extract_i16_7(<8 x i16> %x) {
1423 ; CHECK-LABEL: extract_i16_7:
1425 ; CHECK-NEXT: vpextrw $7, %xmm0, %eax
1426 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1427 ; CHECK-NEXT: ret{{[l|q]}}
1428 %res = extractelement <8 x i16> %x, i32 7
1432 define void @extract_store_f16_0(<8 x half> %x, half* %y) {
1433 ; X64-LABEL: extract_store_f16_0:
1435 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1438 ; X86-LABEL: extract_store_f16_0:
1440 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1441 ; X86-NEXT: vmovsh %xmm0, (%eax)
1443 %res = extractelement <8 x half> %x, i32 0
1444 store half %res, half* %y
1448 define void @extract_store_f16_1(<8 x half> %x, half* %y) {
1449 ; X64-LABEL: extract_store_f16_1:
1451 ; X64-NEXT: vpsrld $16, %xmm0, %xmm0
1452 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1455 ; X86-LABEL: extract_store_f16_1:
1457 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1458 ; X86-NEXT: vpsrld $16, %xmm0, %xmm0
1459 ; X86-NEXT: vmovsh %xmm0, (%eax)
1461 %res = extractelement <8 x half> %x, i32 1
1462 store half %res, half* %y
1466 define void @extract_store_f16_2(<8 x half> %x, half* %y) {
1467 ; X64-LABEL: extract_store_f16_2:
1469 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1470 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1473 ; X86-LABEL: extract_store_f16_2:
1475 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1476 ; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1477 ; X86-NEXT: vmovsh %xmm0, (%eax)
1479 %res = extractelement <8 x half> %x, i32 2
1480 store half %res, half* %y
1484 define void @extract_store_f16_3(<8 x half> %x, half* %y) {
1485 ; X64-LABEL: extract_store_f16_3:
1487 ; X64-NEXT: vpsrlq $48, %xmm0, %xmm0
1488 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1491 ; X86-LABEL: extract_store_f16_3:
1493 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1494 ; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
1495 ; X86-NEXT: vmovsh %xmm0, (%eax)
1497 %res = extractelement <8 x half> %x, i32 3
1498 store half %res, half* %y
1502 define void @extract_store_f16_4(<8 x half> %x, half* %y) {
1503 ; X64-LABEL: extract_store_f16_4:
1505 ; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1506 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1509 ; X86-LABEL: extract_store_f16_4:
1511 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1512 ; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1513 ; X86-NEXT: vmovsh %xmm0, (%eax)
1515 %res = extractelement <8 x half> %x, i32 4
1516 store half %res, half* %y
1520 define void @extract_store_f16_5(<8 x half> %x, half* %y) {
1521 ; X64-LABEL: extract_store_f16_5:
1523 ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1524 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1527 ; X86-LABEL: extract_store_f16_5:
1529 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1530 ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1531 ; X86-NEXT: vmovsh %xmm0, (%eax)
1533 %res = extractelement <8 x half> %x, i32 5
1534 store half %res, half* %y
1538 define void @extract_store_f16_6(<8 x half> %x, half* %y) {
1539 ; X64-LABEL: extract_store_f16_6:
1541 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1542 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1545 ; X86-LABEL: extract_store_f16_6:
1547 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1548 ; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1549 ; X86-NEXT: vmovsh %xmm0, (%eax)
1551 %res = extractelement <8 x half> %x, i32 6
1552 store half %res, half* %y
1556 define void @extract_store_f16_7(<8 x half> %x, half* %y) {
1557 ; X64-LABEL: extract_store_f16_7:
1559 ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1560 ; X64-NEXT: vmovsh %xmm0, (%rdi)
1563 ; X86-LABEL: extract_store_f16_7:
1565 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1566 ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1567 ; X86-NEXT: vmovsh %xmm0, (%eax)
1569 %res = extractelement <8 x half> %x, i32 7
1570 store half %res, half* %y
1574 define void @extract_store_i16_0(<8 x i16> %x, i16* %y) {
1575 ; X64-LABEL: extract_store_i16_0:
1577 ; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
1580 ; X86-LABEL: extract_store_i16_0:
1582 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1583 ; X86-NEXT: vpextrw $0, %xmm0, (%eax)
1585 %res = extractelement <8 x i16> %x, i32 0
1586 store i16 %res, i16* %y
1590 define void @extract_store_i16_1(<8 x i16> %x, i16* %y) {
1591 ; X64-LABEL: extract_store_i16_1:
1593 ; X64-NEXT: vpextrw $1, %xmm0, (%rdi)
1596 ; X86-LABEL: extract_store_i16_1:
1598 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1599 ; X86-NEXT: vpextrw $1, %xmm0, (%eax)
1601 %res = extractelement <8 x i16> %x, i32 1
1602 store i16 %res, i16* %y
1606 define void @extract_store_i16_2(<8 x i16> %x, i16* %y) {
1607 ; X64-LABEL: extract_store_i16_2:
1609 ; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
1612 ; X86-LABEL: extract_store_i16_2:
1614 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1615 ; X86-NEXT: vpextrw $2, %xmm0, (%eax)
1617 %res = extractelement <8 x i16> %x, i32 2
1618 store i16 %res, i16* %y
1622 define void @extract_store_i16_3(<8 x i16> %x, i16* %y) {
1623 ; X64-LABEL: extract_store_i16_3:
1625 ; X64-NEXT: vpextrw $3, %xmm0, (%rdi)
1628 ; X86-LABEL: extract_store_i16_3:
1630 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1631 ; X86-NEXT: vpextrw $3, %xmm0, (%eax)
1633 %res = extractelement <8 x i16> %x, i32 3
1634 store i16 %res, i16* %y
1638 define void @extract_store_i16_4(<8 x i16> %x, i16* %y) {
1639 ; X64-LABEL: extract_store_i16_4:
1641 ; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
1644 ; X86-LABEL: extract_store_i16_4:
1646 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1647 ; X86-NEXT: vpextrw $4, %xmm0, (%eax)
1649 %res = extractelement <8 x i16> %x, i32 4
1650 store i16 %res, i16* %y
1654 define void @extract_store_i16_5(<8 x i16> %x, i16* %y) {
1655 ; X64-LABEL: extract_store_i16_5:
1657 ; X64-NEXT: vpextrw $5, %xmm0, (%rdi)
1660 ; X86-LABEL: extract_store_i16_5:
1662 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1663 ; X86-NEXT: vpextrw $5, %xmm0, (%eax)
1665 %res = extractelement <8 x i16> %x, i32 5
1666 store i16 %res, i16* %y
1670 define void @extract_store_i16_6(<8 x i16> %x, i16* %y) {
1671 ; X64-LABEL: extract_store_i16_6:
1673 ; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
1676 ; X86-LABEL: extract_store_i16_6:
1678 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1679 ; X86-NEXT: vpextrw $6, %xmm0, (%eax)
1681 %res = extractelement <8 x i16> %x, i32 6
1682 store i16 %res, i16* %y
1686 define void @extract_store_i16_7(<8 x i16> %x, i16* %y) {
1687 ; X64-LABEL: extract_store_i16_7:
1689 ; X64-NEXT: vpextrw $7, %xmm0, (%rdi)
1692 ; X86-LABEL: extract_store_i16_7:
1694 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1695 ; X86-NEXT: vpextrw $7, %xmm0, (%eax)
1697 %res = extractelement <8 x i16> %x, i32 7
1698 store i16 %res, i16* %y
1702 define i32 @extract_zext_i16_0(<8 x i16> %x) {
1703 ; CHECK-LABEL: extract_zext_i16_0:
1705 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax
1706 ; CHECK-NEXT: ret{{[l|q]}}
1707 %res = extractelement <8 x i16> %x, i32 0
1708 %res2 = zext i16 %res to i32
1712 define i32 @extract_zext_i16_1(<8 x i16> %x) {
1713 ; CHECK-LABEL: extract_zext_i16_1:
1715 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
1716 ; CHECK-NEXT: ret{{[l|q]}}
1717 %res = extractelement <8 x i16> %x, i32 1
1718 %res2 = zext i16 %res to i32
1722 define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
1723 ; X64-LABEL: build_vector_xxxxuuuu:
1725 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1726 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1727 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
1730 ; X86-LABEL: build_vector_xxxxuuuu:
1732 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
1733 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1734 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1735 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1736 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1737 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1738 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1740 %a = insertelement <8 x half> undef, half %a0, i32 0
1741 %b = insertelement <8 x half> %a, half %a1, i32 1
1742 %c = insertelement <8 x half> %b, half %a2, i32 2
1743 %d = insertelement <8 x half> %c, half %a3, i32 3
1747 define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
1748 ; X64-LABEL: build_vector_uuuuxxxx:
1750 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1751 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1752 ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1753 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0
1756 ; X86-LABEL: build_vector_uuuuxxxx:
1758 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
1759 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1760 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1761 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1762 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1763 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1764 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1765 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0
1767 %a = insertelement <8 x half> undef, half %a0, i32 4
1768 %b = insertelement <8 x half> %a, half %a1, i32 5
1769 %c = insertelement <8 x half> %b, half %a2, i32 6
1770 %d = insertelement <8 x half> %c, half %a3, i32 7
1774 define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
1775 ; X64-LABEL: build_vector_xxxxxxxx:
1777 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1778 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1779 ; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
1780 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1781 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1782 ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1783 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
1786 ; X86-LABEL: build_vector_xxxxxxxx:
1788 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
1789 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1790 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1791 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1792 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1793 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1794 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1795 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1796 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1797 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1798 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1799 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
1800 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1801 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1802 ; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1804 %a = insertelement <8 x half> undef, half %a0, i32 0
1805 %b = insertelement <8 x half> %a, half %a1, i32 1
1806 %c = insertelement <8 x half> %b, half %a2, i32 2
1807 %d = insertelement <8 x half> %c, half %a3, i32 3
1808 %e = insertelement <8 x half> %d, half %a4, i32 4
1809 %f = insertelement <8 x half> %e, half %a5, i32 5
1810 %g = insertelement <8 x half> %f, half %a6, i32 6
1811 %h = insertelement <8 x half> %g, half %a7, i32 7
1815 define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
1816 ; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
1818 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1819 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1820 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
1821 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1822 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1823 ; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1824 ; X64-NEXT: vpbroadcastq %xmm1, %xmm1
1825 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1828 ; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
1830 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
1831 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1832 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1833 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1834 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1835 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1836 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1837 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
1838 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1839 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1840 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
1841 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
1842 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1843 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
1844 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0
1845 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1847 %a = insertelement <16 x half> undef, half %a0, i32 0
1848 %b = insertelement <16 x half> %a, half %a1, i32 1
1849 %c = insertelement <16 x half> %b, half %a2, i32 2
1850 %d = insertelement <16 x half> %c, half %a3, i32 3
1851 %e = insertelement <16 x half> %d, half %a4, i32 12
1852 %f = insertelement <16 x half> %e, half %a5, i32 13
1853 %g = insertelement <16 x half> %f, half %a6, i32 14
1854 %h = insertelement <16 x half> %g, half %a7, i32 15
1858 define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
1859 ; CHECK-LABEL: regression1:
1861 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
1862 ; CHECK-NEXT: ret{{[l|q]}}
1863 %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
1867 define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, i8* %4) {
1868 ; X64-LABEL: regression2:
1870 ; X64-NEXT: vmovw (%rsi), %xmm0
1871 ; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1872 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
1873 ; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
1874 ; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1877 ; X86-LABEL: regression2:
1879 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1880 ; X86-NEXT: vmovw (%eax), %xmm0
1881 ; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1882 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
1883 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
1884 ; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
1886 %6 = getelementptr i8, i8* %4, i64 0
1887 %7 = getelementptr i8, i8* %6, i64 0
1888 %8 = getelementptr i8, i8* %7, i64 0
1889 %9 = load i8, i8* %8, align 1
1890 %10 = getelementptr i8, i8* %8, i64 1
1891 %11 = addrspacecast i8* %10 to i8 addrspace(4)*
1892 %12 = load i8, i8 addrspace(4)* %11, align 1
1893 %13 = insertelement <2 x i8> poison, i8 %9, i32 0
1894 %14 = insertelement <2 x i8> %13, i8 %12, i32 1
1895 %15 = uitofp <2 x i8> %14 to <2 x float>
1896 %16 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1897 %17 = shufflevector <4 x float> %16, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1898 %18 = fmul contract <4 x float> %17, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
; Make sure load/stores of v4f16 are handled well on 32-bit targets where
; default widening legalization can't use i64.
define void @load_store_v4f16(<4 x half>* %x, <4 x half>* %y, <4 x half>* %z) {
; X64-LABEL: load_store_v4f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovlps %xmm0, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_store_v4f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%eax)
; X86-NEXT:    retl
  %a = load <4 x half>, <4 x half>* %x
  %b = load <4 x half>, <4 x half>* %y
  %c = fadd <4 x half> %a, %b
  store <4 x half> %c, <4 x half>* %z
  ret void
}