1 ;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
2 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
4 ; CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
6 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
; Only one argument is referenced: element 1 of %arg3 is placed in lane 0 of the
; returned vector. The other lanes of the result stay undef.
8 %i1 = extractelement <2 x float> %arg3, i32 1
9 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
; Wrap the vector in the single-member return struct.
10 %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1, 0
11 ret { <4 x float> } %ret2
14 ; CHECK-LABEL: {{^}}_amdgpu_ps_3_arg:
16 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_3_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
; Three consecutive non-inreg arguments are referenced (%arg3, %arg4, %arg5);
; one element of each is taken.
18 %i1 = extractelement <2 x float> %arg3, i32 1
19 %i2 = extractelement <2 x float> %arg4, i32 0
20 %i3 = extractelement <2 x float> %arg5, i32 1
; Pack the three scalars into lanes 0..2 of the result; lane 3 stays undef.
21 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
22 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
23 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
24 %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
25 ret { <4 x float> } %ret2
28 ; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_gap:
30 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_gap(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
; Two arguments are referenced with an unreferenced one between them:
; %arg3 and %arg5 are read, %arg4 (the "gap") is not.
32 %i1 = extractelement <2 x float> %arg3, i32 1
33 %i3 = extractelement <2 x float> %arg5, i32 1
; The two scalars fill lanes 0 and 1 of the result; lanes 2 and 3 stay undef.
34 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
35 %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
36 %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
37 ret { <4 x float> } %ret2
40 ; Using an InitialPSInputAddr of 0x2 causes the 2nd VGPR argument (%arg4) to be included in the packing. This increases the
41 ; total number of VGPRs and in turn prevents the 3rd VGPR argument (%arg5) from being packed adjacent to the 1st (%arg3); these are the only two used arguments.
42 ; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_no_pack:
44 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_no_pack(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #1 {
; Same instruction sequence as @_amdgpu_ps_2_arg_gap (reads %arg3 and %arg5, skips
; %arg4); the difference is attribute group #1, which sets "InitialPSInputAddr"="2".
46 %i1 = extractelement <2 x float> %arg3, i32 1
47 %i3 = extractelement <2 x float> %arg5, i32 1
; The two scalars fill lanes 0 and 1 of the result; lanes 2 and 3 stay undef.
48 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
49 %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
50 %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
51 ret { <4 x float> } %ret2
54 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg:
56 define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
; Every non-inreg argument (%arg3 .. %arg18) is referenced, and the 16 resulting
; scalars are returned as four <4 x float> vectors.
; One element is read from each of the seven vector arguments.
58 %i1 = extractelement <2 x float> %arg3, i32 1
59 %i2 = extractelement <2 x float> %arg4, i32 0
60 %i3 = extractelement <2 x float> %arg5, i32 1
61 %i4 = extractelement <3 x float> %arg6, i32 1
62 %i5 = extractelement <2 x float> %arg7, i32 0
63 %i6 = extractelement <2 x float> %arg8, i32 0
64 %i7 = extractelement <2 x float> %arg9, i32 1
; First result vector: elements taken from %arg3 .. %arg6.
66 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
67 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
68 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
69 %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
; Second result vector: elements taken from %arg7 .. %arg9, plus %arg10.
71 %ret2 = insertelement <4 x float> undef, float %i5, i32 0
72 %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
73 %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
74 %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
; Third result vector: the scalar floats %arg11 .. %arg14.
76 %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
77 %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
78 %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
79 %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
; The i32 arguments are reinterpreted as float so they fit the float result vector.
81 %arg15.f = bitcast i32 %arg15 to float
82 %arg16.f = bitcast i32 %arg16 to float
83 %arg17.f = bitcast i32 %arg17 to float
84 %arg18.f = bitcast i32 %arg18 to float
; Fourth result vector: the bitcast values of %arg15 .. %arg18.
86 %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
87 %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
88 %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
89 %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
; Assemble the four vectors into the return struct.
91 %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
92 %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
93 %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
94 %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
96 ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
99 ; Extra arguments have to be allocated even if they're unused
100 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra_unused:
101 ; CHECK: NumVgprs: 26
102 define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; Same body as @_amdgpu_ps_all_arg, but the signature has two additional trailing
; float arguments (%extra_arg1, %extra_arg2) that are never referenced.
; %arg3 .. %arg18 cover 24 32-bit values (15 vector elements, 5 floats, 4 i32s);
; presumably the expected count of 26 above is those 24 plus the two extras.
; One element is read from each of the seven vector arguments.
104 %i1 = extractelement <2 x float> %arg3, i32 1
105 %i2 = extractelement <2 x float> %arg4, i32 0
106 %i3 = extractelement <2 x float> %arg5, i32 1
107 %i4 = extractelement <3 x float> %arg6, i32 1
108 %i5 = extractelement <2 x float> %arg7, i32 0
109 %i6 = extractelement <2 x float> %arg8, i32 0
110 %i7 = extractelement <2 x float> %arg9, i32 1
; First result vector: elements taken from %arg3 .. %arg6.
112 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
113 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
114 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
115 %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
; Second result vector: elements taken from %arg7 .. %arg9, plus %arg10.
117 %ret2 = insertelement <4 x float> undef, float %i5, i32 0
118 %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
119 %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
120 %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
; Third result vector: the scalar floats %arg11 .. %arg14.
122 %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
123 %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
124 %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
125 %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
; The i32 arguments are reinterpreted as float so they fit the float result vector.
127 %arg15.f = bitcast i32 %arg15 to float
128 %arg16.f = bitcast i32 %arg16 to float
129 %arg17.f = bitcast i32 %arg17 to float
130 %arg18.f = bitcast i32 %arg18 to float
; Fourth result vector: the bitcast values of %arg15 .. %arg18.
132 %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
133 %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
134 %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
135 %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
; Assemble the four vectors into the return struct.
137 %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
138 %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
139 %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
140 %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
142 ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
145 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra:
146 ; CHECK: NumVgprs: 26
147 ; CHECK: NumVGPRsForWavesPerEU: 26
148 define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; Every non-inreg argument is referenced, including the two trailing extras
; (%extra_arg1, %extra_arg2).
; One element is read from each of the seven vector arguments.
150 %i1 = extractelement <2 x float> %arg3, i32 1
151 %i2 = extractelement <2 x float> %arg4, i32 0
152 %i3 = extractelement <2 x float> %arg5, i32 1
153 %i4 = extractelement <3 x float> %arg6, i32 1
154 %i5 = extractelement <2 x float> %arg7, i32 0
155 %i6 = extractelement <2 x float> %arg8, i32 0
156 %i7 = extractelement <2 x float> %arg9, i32 1
; First result vector: elements taken from %arg3 .. %arg6.
158 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
159 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
160 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
161 %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
; Second result vector: elements taken from %arg7 .. %arg9, plus %arg10.
163 %ret2 = insertelement <4 x float> undef, float %i5, i32 0
164 %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
165 %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
166 %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
; Third result vector: the scalar floats %arg11 .. %arg14.
168 %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
169 %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
170 %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
171 %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
; The i32 arguments are reinterpreted as float.
173 %arg15.f = bitcast i32 %arg15 to float
174 %arg16.f = bitcast i32 %arg16 to float
175 %arg17.f = bitcast i32 %arg17 to float
176 %arg18.f = bitcast i32 %arg18 to float
; The four bitcast values are summed pairwise so that all of %arg15 .. %arg18 are
; referenced while taking only two lanes of the last result vector.
178 %arg15_16.f = fadd float %arg15.f, %arg16.f
179 %arg17_18.f = fadd float %arg17.f, %arg18.f
; Fourth result vector: the two extra arguments followed by the two sums.
181 %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
182 %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
183 %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15_16.f, i32 2
184 %ret4.3 = insertelement <4 x float> %ret4.2, float %arg17_18.f, i32 3
; Assemble the four vectors into the return struct.
186 %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
187 %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
188 %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
189 %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
191 ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
194 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
195 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
197 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
; No argument is referenced: the function returns an undef struct directly.
; Attribute group #0 carries no "InitialPSInputAddr" attribute.
199 ret { <4 x float> } undef
202 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
203 ; Additionally set the PSInputAddr to 0 via the metadata
204 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
206 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
; Same as @_amdgpu_ps_all_unused (no argument referenced, undef returned), except
; that attribute group #3 explicitly sets "InitialPSInputAddr"="0".
208 ret { <4 x float> } undef
211 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used:
213 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; Only the two trailing extra arguments are referenced; %arg3 .. %arg18 are not.
; The extras fill lanes 0 and 1 of the result; lanes 2 and 3 stay undef.
215 %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
216 %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
218 %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
220 ret { <4 x float> } %ret.res
223 ; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_used:
225 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; One regular argument (%arg14) and both extra arguments are referenced; all other
; arguments are not. The three values fill lanes 0..2; lane 3 stays undef.
227 %ret4.1 = insertelement <4 x float> undef, float %arg14, i32 0
228 %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg1, i32 1
229 %ret4.3 = insertelement <4 x float> %ret4.2, float %extra_arg2, i32 2
231 %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
233 ret { <4 x float> } %ret.res
236 ; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_unused:
238 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; Only %arg12, %arg13 and %arg14 are referenced; the two extra arguments and all
; other regular arguments are not. The values fill lanes 0..2; lane 3 stays undef.
240 %ret4.1 = insertelement <4 x float> undef, float %arg12, i32 0
241 %ret4.2 = insertelement <4 x float> %ret4.1, float %arg13, i32 1
242 %ret4.3 = insertelement <4 x float> %ret4.2, float %arg14, i32 2
244 %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
246 ret { <4 x float> } %ret.res
249 ; Extra unused inputs are always added to the allocation
250 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused:
252 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; Neither the regular arguments nor the two extra arguments are referenced;
; the function returns an undef struct directly.
255 ret { <4 x float> } undef
258 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used_no_packing:
259 ; CHECK: NumVgprs: 26
260 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
; Same instruction sequence as @_amdgpu_ps_all_unused_extra_used (only the two
; extra arguments are referenced), but attribute group #2 sets
; "InitialPSInputAddr"="0xffff". The expectation above is 26 VGPRs.
262 %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
263 %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
265 %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
267 ret { <4 x float> } %ret.res
270 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused_no_packing:
271 ; CHECK: NumVgprs: 26
272 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
; No argument is referenced and an undef struct is returned. Attribute group #2
; sets "InitialPSInputAddr"="0xffff"; the expectation above is still 26 VGPRs.
274 ret { <4 x float> } undef
277 ; CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_arg_extra:
278 ; CHECK: NumVgprs: 24
279 ; CHECK: NumVGPRsForWavesPerEU: 24
280 define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
; All non-inreg arguments are referenced except %arg17 and %arg18. Presumably
; those two unreferenced values explain why the expectation above is 24 rather
; than the 26 expected for @_amdgpu_ps_all_arg_extra.
; One element is read from each of the seven vector arguments.
282 %i1 = extractelement <2 x float> %arg3, i32 1
283 %i2 = extractelement <2 x float> %arg4, i32 0
284 %i3 = extractelement <2 x float> %arg5, i32 1
285 %i4 = extractelement <3 x float> %arg6, i32 1
286 %i5 = extractelement <2 x float> %arg7, i32 0
287 %i6 = extractelement <2 x float> %arg8, i32 0
288 %i7 = extractelement <2 x float> %arg9, i32 1
; First result vector: elements taken from %arg3 .. %arg6.
290 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
291 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
292 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
293 %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
; Second result vector: elements taken from %arg7 .. %arg9, plus %arg10.
295 %ret2 = insertelement <4 x float> undef, float %i5, i32 0
296 %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
297 %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
298 %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
; Third result vector: the scalar floats %arg11 .. %arg14.
300 %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
301 %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
302 %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
303 %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
; Only %arg15 and %arg16 are reinterpreted as float; %arg17 and %arg18 are left unused.
305 %arg15.f = bitcast i32 %arg15 to float
306 %arg16.f = bitcast i32 %arg16 to float
; Fourth result vector: the two extra arguments followed by %arg15 and %arg16.
308 %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
309 %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
310 %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15.f, i32 2
311 %ret4.3 = insertelement <4 x float> %ret4.2, float %arg16.f, i32 3
; Assemble the four vectors into the return struct.
313 %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
314 %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
315 %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
316 %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
318 ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
321 ;CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_no_packing_arg_extra:
323 ;CHECK: NumVGPRsForWavesPerEU: 26
324 define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_no_packing_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
; %arg3 .. %arg14 and both extra arguments are referenced; %arg15 .. %arg18 are not.
; Attribute group #2 sets "InitialPSInputAddr"="0xffff"; the expectation above is 26
; even though four argument values are unreferenced.
; One element is read from each of the seven vector arguments.
326 %i1 = extractelement <2 x float> %arg3, i32 1
327 %i2 = extractelement <2 x float> %arg4, i32 0
328 %i3 = extractelement <2 x float> %arg5, i32 1
329 %i4 = extractelement <3 x float> %arg6, i32 1
330 %i5 = extractelement <2 x float> %arg7, i32 0
331 %i6 = extractelement <2 x float> %arg8, i32 0
332 %i7 = extractelement <2 x float> %arg9, i32 1
; First result vector: elements taken from %arg3 .. %arg6.
334 %ret1 = insertelement <4 x float> undef, float %i1, i32 0
335 %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
336 %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
337 %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
; Second result vector: elements taken from %arg7 .. %arg9, plus %arg10.
339 %ret2 = insertelement <4 x float> undef, float %i5, i32 0
340 %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
341 %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
342 %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
; Third result vector: the scalar floats %arg11 .. %arg14.
344 %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
345 %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
346 %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
347 %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
; Fourth result vector: only lanes 0 and 1 are filled (with the two extra
; arguments); lanes 2 and 3 stay undef.
349 %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
350 %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
; Assemble the four vectors into the return struct; member 3 is the partially
; filled %ret4.1.
352 %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
353 %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
354 %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
355 %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.1, 3
357 ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
; Attribute groups referenced by the functions in this file. All four share nounwind
; and the same "target-features" string; they differ only in "InitialPSInputAddr".
; #0: no "InitialPSInputAddr" attribute.
360 attributes #0 = { nounwind "target-features"=",+wavefrontsize64,+cumode" }
; #1: "InitialPSInputAddr"="2" (used by @_amdgpu_ps_2_arg_no_pack).
361 attributes #1 = { nounwind "InitialPSInputAddr"="2" "target-features"=",+wavefrontsize64,+cumode" }
; #2: "InitialPSInputAddr"="0xffff" (used by the *_no_packing functions).
362 attributes #2 = { nounwind "InitialPSInputAddr"="0xffff" "target-features"=",+wavefrontsize64,+cumode" }
; #3: "InitialPSInputAddr"="0" (used by @_amdgpu_ps_all_unused_ia0).
363 attributes #3 = { nounwind "InitialPSInputAddr"="0" "target-features"=",+wavefrontsize64,+cumode" }