1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
; External callee for @bar: takes ten <4 x float> arguments and returns a
; <4 x float>. Only declared here; its body is not part of this test.
; NOTE(review): the expected output stores two vectors to the stack just before
; the call, presumably the two arguments that do not fit in registers - confirm.
4 declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
6 ; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass, this test case used to try to use XMM16 and spill it, even though the store instruction needed for that spill requires VLX support, which this test does not enable. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has since been fixed, so XMM16 is no longer used here.
8 define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
11 ; CHECK-NEXT: subq $72, %rsp
12 ; CHECK-NEXT: .cfi_def_cfa_offset 80
13 ; CHECK-NEXT: vmovaps %xmm1, %xmm9
14 ; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17]
15 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14
16 ; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22]
17 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
18 ; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
19 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
20 ; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u>
21 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
22 ; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
23 ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
24 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
25 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
26 ; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
27 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
28 ; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
29 ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
30 ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
31 ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
32 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
33 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
34 ; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
35 ; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
36 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
37 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
38 ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
39 ; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
40 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
41 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
42 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8
43 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3]
44 ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
45 ; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2
46 ; CHECK-NEXT: vmovaps %xmm13, %xmm1
47 ; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
48 ; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10
49 ; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3
50 ; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0
51 ; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0
52 ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0
53 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
54 ; CHECK-NEXT: vmovaps %xmm10, (%rsp)
55 ; CHECK-NEXT: vmovaps %xmm9, %xmm3
56 ; CHECK-NEXT: vzeroupper
57 ; CHECK-NEXT: callq foo@PLT
58 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
59 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
60 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
61 ; CHECK-NEXT: addq $72, %rsp
62 ; CHECK-NEXT: .cfi_def_cfa_offset 8
64 %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
65 %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
66 %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
67 %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
68 %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
69 %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
70 %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
71 %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
72 %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
73 %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
74 %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
75 %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
76 %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
77 %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
78 %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
79 %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
80 %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
81 %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
82 %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
83 %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
84 %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
85 %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
87 %r1 = fadd <4 x float> %ay10, %ay9
88 %r2 = fadd <4 x float> %ay8, %ay7
89 %r3 = fadd <4 x float> %ay6, %ay5
90 %r4 = fadd <4 x float> %ay2, %ax10
91 %r5 = fadd <4 x float> %ay9, %ax8
92 %r6 = fadd <4 x float> %r5, %r3
93 %r7 = fadd <4 x float> %a9, %r6
94 %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
95 %a12 = fadd <4 x float> %a2, %a1
96 %a13 = fadd <4 x float> %a12, %a11