; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s

declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)

; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case was
; trying to use XMM16 and spill it without VLX support for the necessary store
; instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug
; in getLargestLegalSuperClass has now been fixed so we no longer use XMM16.
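;
; For illustration only (not checked by FileCheck): without VLX, a plain 128-bit
; vmovaps spill of %xmm16 is not encodable (VEX forms cannot address xmm16-31
; and the EVEX 128-bit form requires VLX), so such a spill briefly had to go
; through the containing ZMM register, along the lines of
;   vextractf32x4 $0, %zmm16, (%rsp)
; where the stack slot shown is a placeholder, not an address taken from this test.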
define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    vmovaps %xmm1, %xmm8
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm1
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm5
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
; CHECK-NEXT:    vextractf32x4 $3, %zmm3, %xmm7
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm4, %xmm6, %xmm12
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm2
; CHECK-NEXT:    vaddps %xmm10, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm13, %xmm1, %xmm9
; CHECK-NEXT:    vaddps %xmm12, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm1, %xmm1, %xmm3
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %xmm9, (%rsp)
; CHECK-NEXT:    vmovaps %xmm8, %xmm3
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>

  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
  %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
  %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
  %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
  %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
  %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
  %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
  %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
  %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
  %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>

  %r1 = fadd <4 x float> %ay10, %ay9
  %r2 = fadd <4 x float> %ay8, %ay7
  %r3 = fadd <4 x float> %ay6, %ay5
  %r4 = fadd <4 x float> %ay2, %ax10
  %r5 = fadd <4 x float> %ay9, %ax8
  %r6 = fadd <4 x float> %r5, %r3
  %r7 = fadd <4 x float> %a9, %r6
  %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
  %a12 = fadd <4 x float> %a2, %a1
  %a13 = fadd <4 x float> %a12, %a11