1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
4 ; SelectionDAG builder was using the IR value kind to decide how to
5 ; split the types for copyToRegs/copyFromRegs in all contexts. This
6 ; was incorrect if the ABI-like value such as a call was used outside
7 ; of the block. The value in that case is not used directly, but
8 ; through another set of copies to potentially different register
9 ; types in the parent block.
11 ; This would then end up producing inconsistent pairs of copies with
12 ; the wrong sizes when the vector type result from the call was split
13 ; into multiple pieces, but expected to be a single register in the
14 ; cross-block copy.
16 ; This isn't exactly ideal for AMDGPU, since in reality the
17 ; intermediate vector register type is undesirable anyway, but it
18 ; requires more work to be able to split all vector copies in all
19 ; contexts.
21 ; This was only an issue if the value was used directly in another
22 ; block. If there was an intermediate operation or a phi it was fine,
23 ; since that didn't look like an ABI copy.
; Test case: call @func_v2f32 (returns <2 x float>) and extract element 0 of
; the returned vector.
;
; The GCN lines are the autogenerated expected gfx900 output. In that output
; s33 is copied to s16 and written to lane 2 of v40, s30/s31 are written to
; lanes 0 and 1, the callee is reached through s_getpc_b64 + s_swappc_b64, and
; the saved values are read back from v40 before s_setpc_b64. No vector
; register move is expected between the call and the return.
;
; NOTE(review): the block labels and terminators that separate the call from
; the extractelement are not visible in this excerpt; per the file header the
; call result is presumably consumed in a different block -- confirm against
; the full test file.
26 define float @call_split_type_used_outside_block_v2f32() #0 {
27 ; GCN-LABEL: call_split_type_used_outside_block_v2f32:
28 ; GCN: ; %bb.0: ; %bb0
29 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GCN-NEXT: s_mov_b32 s16, s33
31 ; GCN-NEXT: s_mov_b32 s33, s32
32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
33 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
34 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
35 ; GCN-NEXT: v_writelane_b32 v40, s16, 2
36 ; GCN-NEXT: s_addk_i32 s32, 0x400
37 ; GCN-NEXT: v_writelane_b32 v40, s30, 0
38 ; GCN-NEXT: v_writelane_b32 v40, s31, 1
39 ; GCN-NEXT: s_getpc_b64 s[16:17]
40 ; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
41 ; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
43 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
44 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
45 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
46 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
47 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
48 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
49 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
50 ; GCN-NEXT: s_mov_b32 s33, s4
51 ; GCN-NEXT: s_waitcnt vmcnt(0)
52 ; GCN-NEXT: s_setpc_b64 s[30:31]
54 %split.ret.type = call <2 x float> @func_v2f32()
58 %extract = extractelement <2 x float> %split.ret.type, i32 0
; Test case: same shape as the v2f32 case, but the callee @func_v3f32 returns
; <3 x float>. Element 0 of the returned vector is extracted.
;
; The expected gfx900 output is identical to the v2f32 case except for the
; callee symbol: s33 and s30/s31 are saved in lanes of v40 around the
; s_swappc_b64 call, and no vector register move follows the call.
;
; NOTE(review): block labels and terminators are not visible in this excerpt;
; the extract presumably lives in a different block than the call -- confirm.
62 define float @call_split_type_used_outside_block_v3f32() #0 {
63 ; GCN-LABEL: call_split_type_used_outside_block_v3f32:
64 ; GCN: ; %bb.0: ; %bb0
65 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GCN-NEXT: s_mov_b32 s16, s33
67 ; GCN-NEXT: s_mov_b32 s33, s32
68 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
69 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
70 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
71 ; GCN-NEXT: v_writelane_b32 v40, s16, 2
72 ; GCN-NEXT: s_addk_i32 s32, 0x400
73 ; GCN-NEXT: v_writelane_b32 v40, s30, 0
74 ; GCN-NEXT: v_writelane_b32 v40, s31, 1
75 ; GCN-NEXT: s_getpc_b64 s[16:17]
76 ; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
77 ; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
78 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
79 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
80 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
81 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
82 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
83 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
84 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
85 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
86 ; GCN-NEXT: s_mov_b32 s33, s4
87 ; GCN-NEXT: s_waitcnt vmcnt(0)
88 ; GCN-NEXT: s_setpc_b64 s[30:31]
90 %split.ret.type = call <3 x float> @func_v3f32()
94 %extract = extractelement <3 x float> %split.ret.type, i32 0
; Test case: the callee @func_v4f16 returns <4 x half>; element 0 of the
; returned vector is extracted and the function's return type is half.
;
; The expected gfx900 output matches the float-vector cases except for the
; callee symbol: s33 and s30/s31 are saved in lanes of v40 around the
; s_swappc_b64 call, and no vector register move follows the call.
;
; NOTE(review): block labels and terminators are not visible in this excerpt;
; the extract presumably lives in a different block than the call -- confirm.
98 define half @call_split_type_used_outside_block_v4f16() #0 {
99 ; GCN-LABEL: call_split_type_used_outside_block_v4f16:
100 ; GCN: ; %bb.0: ; %bb0
101 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102 ; GCN-NEXT: s_mov_b32 s16, s33
103 ; GCN-NEXT: s_mov_b32 s33, s32
104 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
105 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
106 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
107 ; GCN-NEXT: v_writelane_b32 v40, s16, 2
108 ; GCN-NEXT: s_addk_i32 s32, 0x400
109 ; GCN-NEXT: v_writelane_b32 v40, s30, 0
110 ; GCN-NEXT: v_writelane_b32 v40, s31, 1
111 ; GCN-NEXT: s_getpc_b64 s[16:17]
112 ; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
113 ; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
114 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
115 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
116 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
117 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
118 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
119 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
120 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
121 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
122 ; GCN-NEXT: s_mov_b32 s33, s4
123 ; GCN-NEXT: s_waitcnt vmcnt(0)
124 ; GCN-NEXT: s_setpc_b64 s[30:31]
126 %split.ret.type = call <4 x half> @func_v4f16()
130 %extract = extractelement <4 x half> %split.ret.type, i32 0
; Test case: the callee @func_struct returns the aggregate
; { <4 x i32>, <4 x half> }. Both members are pulled out with extractvalue,
; element 0 of each vector is taken, and the two scalars are packed into the
; returned { i32, half }.
;
; The expected gfx900 output has the same save/call/restore sequence as the
; vector-only cases, plus a single `v_mov_b32_e32 v1, v4` after the call.
;
; NOTE(review): v4 presumably holds the first register of the <4 x half>
; member, following the four registers of the <4 x i32> member -- confirm
; against the AMDGPU calling convention. Block labels and the branch between
; the call and its uses are not visible in this excerpt.
134 define { i32, half } @call_split_type_used_outside_block_struct() #0 {
135 ; GCN-LABEL: call_split_type_used_outside_block_struct:
136 ; GCN: ; %bb.0: ; %bb0
137 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; GCN-NEXT: s_mov_b32 s16, s33
139 ; GCN-NEXT: s_mov_b32 s33, s32
140 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
141 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
142 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
143 ; GCN-NEXT: v_writelane_b32 v40, s16, 2
144 ; GCN-NEXT: s_addk_i32 s32, 0x400
145 ; GCN-NEXT: v_writelane_b32 v40, s30, 0
146 ; GCN-NEXT: v_writelane_b32 v40, s31, 1
147 ; GCN-NEXT: s_getpc_b64 s[16:17]
148 ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
149 ; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
150 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
151 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
152 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
153 ; GCN-NEXT: v_mov_b32_e32 v1, v4
154 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
155 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
156 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
157 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
158 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
159 ; GCN-NEXT: s_mov_b32 s33, s4
160 ; GCN-NEXT: s_waitcnt vmcnt(0)
161 ; GCN-NEXT: s_setpc_b64 s[30:31]
163 %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
167 %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
168 %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
169 %extract0 = extractelement <4 x i32> %val0, i32 0
170 %extract1 = extractelement <4 x half> %val1, i32 0
171 %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
172 %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
173 ret { i32, half } %ins1
; Test case (kernel): branch on the i1 argument %cond to %if.then or %if.else.
; %if.else calls @func_v3i16, which returns <3 x i16>. In %if.end a phi merges
; the call result (from %if.else) with zeroinitializer (from %if.then), and the
; merged <3 x i16> is stored to an undef addrspace(1) pointer.
;
; In the expected gfx900 output the call-free path materialises the value as
; zeros in v0 and v1, and the final store is emitted as global_store_short of
; v1 followed by global_store_dword of v0.
;
; NOTE(review): the body of %if.then, the block terminators, and some check
; lines (for example the .LBB4_2 label targeted by s_cbranch_vccnz) are not
; visible in this excerpt -- confirm against the full test file.
176 define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
177 ; GCN-LABEL: v3i16_registers:
178 ; GCN: ; %bb.0: ; %entry
179 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
180 ; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
181 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
182 ; GCN-NEXT: s_add_u32 s0, s0, s17
183 ; GCN-NEXT: s_addc_u32 s1, s1, 0
184 ; GCN-NEXT: s_mov_b32 s32, 0
185 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
186 ; GCN-NEXT: s_bitcmp1_b32 s12, 0
187 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
188 ; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
189 ; GCN-NEXT: s_cbranch_vccnz .LBB4_2
190 ; GCN-NEXT: ; %bb.1: ; %if.else
191 ; GCN-NEXT: s_add_u32 s8, s8, 8
192 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
193 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
194 ; GCN-NEXT: s_addc_u32 s9, s9, 0
195 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
196 ; GCN-NEXT: s_mov_b32 s12, s14
197 ; GCN-NEXT: s_mov_b32 s13, s15
198 ; GCN-NEXT: s_mov_b32 s14, s16
199 ; GCN-NEXT: s_getpc_b64 s[18:19]
200 ; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
201 ; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
202 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
203 ; GCN-NEXT: s_branch .LBB4_3
205 ; GCN-NEXT: v_mov_b32_e32 v0, 0
206 ; GCN-NEXT: v_mov_b32_e32 v1, 0
207 ; GCN-NEXT: .LBB4_3: ; %if.end
208 ; GCN-NEXT: global_store_short v[0:1], v1, off
209 ; GCN-NEXT: global_store_dword v[0:1], v0, off
212 br i1 %cond, label %if.then, label %if.else
214 if.then: ; preds = %entry
217 if.else: ; preds = %entry
218 %call6 = tail call <3 x i16> @func_v3i16() #0
221 if.end: ; preds = %if.else, %if.then
222 %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
223 store <3 x i16> %call6.sink, ptr addrspace(1) undef
; Test case (kernel): floating-point twin of @v3i16_registers. Branch on the
; i1 argument %cond; %if.else calls @func_v3f16, which returns <3 x half>. In
; %if.end a phi merges the call result with zeroinitializer (from %if.then),
; and the merged <3 x half> is stored to an undef addrspace(1) pointer.
;
; The expected gfx900 output has the same shape as the <3 x i16> case: zeros
; in v0 and v1 on the call-free path, and the store emitted as
; global_store_short of v1 followed by global_store_dword of v0.
;
; NOTE(review): the body of %if.then, the block terminators, and some check
; lines (for example the .LBB5_2 label targeted by s_cbranch_vccnz) are not
; visible in this excerpt -- confirm against the full test file.
227 define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
228 ; GCN-LABEL: v3f16_registers:
229 ; GCN: ; %bb.0: ; %entry
230 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
231 ; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
232 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
233 ; GCN-NEXT: s_add_u32 s0, s0, s17
234 ; GCN-NEXT: s_addc_u32 s1, s1, 0
235 ; GCN-NEXT: s_mov_b32 s32, 0
236 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
237 ; GCN-NEXT: s_bitcmp1_b32 s12, 0
238 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
239 ; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
240 ; GCN-NEXT: s_cbranch_vccnz .LBB5_2
241 ; GCN-NEXT: ; %bb.1: ; %if.else
242 ; GCN-NEXT: s_add_u32 s8, s8, 8
243 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
244 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
245 ; GCN-NEXT: s_addc_u32 s9, s9, 0
246 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
247 ; GCN-NEXT: s_mov_b32 s12, s14
248 ; GCN-NEXT: s_mov_b32 s13, s15
249 ; GCN-NEXT: s_mov_b32 s14, s16
250 ; GCN-NEXT: s_getpc_b64 s[18:19]
251 ; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
252 ; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
253 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
254 ; GCN-NEXT: s_branch .LBB5_3
256 ; GCN-NEXT: v_mov_b32_e32 v0, 0
257 ; GCN-NEXT: v_mov_b32_e32 v1, 0
258 ; GCN-NEXT: .LBB5_3: ; %if.end
259 ; GCN-NEXT: global_store_short v[0:1], v1, off
260 ; GCN-NEXT: global_store_dword v[0:1], v0, off
263 br i1 %cond, label %if.then, label %if.else
265 if.then: ; preds = %entry
268 if.else: ; preds = %entry
269 %call6 = tail call <3 x half> @func_v3f16() #0
272 if.end: ; preds = %if.else, %if.then
273 %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
274 store <3 x half> %call6.sink, ptr addrspace(1) undef
; External callees used by the tests above. All are declared `hidden` and have
; no body here; only their return types matter to the tests.
;
; Attribute group #0 is `nounwind`. @func_v3i16 and @func_v3f16 carry no
; attribute group on their declarations; their call sites attach #0 instead.
;
; NOTE(review): @func_v4f32 is declared but no call to it is visible in this
; excerpt -- confirm whether it is still needed.
278 declare hidden <2 x float> @func_v2f32() #0
279 declare hidden <3 x float> @func_v3f32() #0
280 declare hidden <4 x float> @func_v4f32() #0
281 declare hidden <4 x half> @func_v4f16() #0
282 declare hidden <3 x i16> @func_v3i16()
283 declare hidden <3 x half> @func_v3f16()
285 declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
287 attributes #0 = { nounwind}