1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
4 declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
6 ; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass, this test case was trying to use XMM16 and spill it, even though the store instruction needed for that spill is not available without VLX support. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has now been fixed, so we no longer use XMM16.
8 define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
11 ; CHECK-NEXT: subq $72, %rsp
12 ; CHECK-NEXT: .cfi_def_cfa_offset 80
13 ; CHECK-NEXT: vmovaps %xmm1, %xmm9
14 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
15 ; CHECK-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14
17 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
18 ; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
20 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
21 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
22 ; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
23 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
24 ; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
25 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
26 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
27 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
28 ; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
29 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
30 ; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
31 ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
32 ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
33 ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
34 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
35 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3]
36 ; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
37 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
38 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
39 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
40 ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
41 ; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
42 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
43 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
44 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8
45 ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
46 ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
47 ; CHECK-NEXT: vaddps %xmm14, %xmm2, %xmm2
48 ; CHECK-NEXT: vmovaps %xmm13, %xmm1
49 ; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
50 ; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10
51 ; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3
52 ; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0
53 ; CHECK-NEXT: vaddps %xmm8, %xmm0, %xmm0
54 ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0
55 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
56 ; CHECK-NEXT: vmovaps %xmm10, (%rsp)
57 ; CHECK-NEXT: vmovaps %xmm9, %xmm3
58 ; CHECK-NEXT: vzeroupper
59 ; CHECK-NEXT: callq foo
60 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
61 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
62 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
63 ; CHECK-NEXT: addq $72, %rsp
64 ; CHECK-NEXT: .cfi_def_cfa_offset 8
66 %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
67 %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
68 %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
69 %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
70 %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
71 %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
72 %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
73 %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
74 %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
75 %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
76 %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
77 %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
78 %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
79 %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
80 %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
81 %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
82 %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
83 %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
84 %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
85 %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
86 %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
87 %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
89 %r1 = fadd <4 x float> %ay10, %ay9
90 %r2 = fadd <4 x float> %ay8, %ay7
91 %r3 = fadd <4 x float> %ay6, %ay5
92 %r4 = fadd <4 x float> %ay2, %ax10
93 %r5 = fadd <4 x float> %ay9, %ax8
94 %r6 = fadd <4 x float> %r5, %r3
95 %r7 = fadd <4 x float> %a9, %r6
96 %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
97 %a12 = fadd <4 x float> %a2, %a1
98 %a13 = fadd <4 x float> %a12, %a11