1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
6 declare void @extern_func() #2 ; external callee; each test function below calls it after its first image gather
8 define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
9 ; The vgpr tuple8 operand of the image_gather4_c_b_cl instruction need not be
10 ; preserved across the call and should get 8 scratch registers.
11 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
12 ; GFX9: ; %bb.0: ; %main_body
13 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GFX9-NEXT: s_mov_b32 s4, s33
15 ; GFX9-NEXT: s_mov_b32 s33, s32
16 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
17 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
18 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
19 ; GFX9-NEXT: v_mov_b32_e32 v36, v16
20 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
21 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
22 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
23 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
24 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
25 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
26 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
27 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
28 ; GFX9-NEXT: ;;#ASMSTART
29 ; GFX9-NEXT: ;;#ASMEND
30 ; GFX9-NEXT: ;;#ASMSTART
31 ; GFX9-NEXT: ;;#ASMEND
32 ; GFX9-NEXT: ;;#ASMSTART
33 ; GFX9-NEXT: ;;#ASMEND
34 ; GFX9-NEXT: ;;#ASMSTART
35 ; GFX9-NEXT: ;;#ASMEND
36 ; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
37 ; GFX9-NEXT: s_addk_i32 s32, 0x800
38 ; GFX9-NEXT: v_writelane_b32 v40, s4, 2
39 ; GFX9-NEXT: s_getpc_b64 s[4:5]
40 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
41 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
42 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
43 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0
44 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1
45 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
47 ; GFX9-NEXT: v_mov_b32_e32 v0, v41
48 ; GFX9-NEXT: v_mov_b32_e32 v1, v42
49 ; GFX9-NEXT: v_mov_b32_e32 v2, v43
50 ; GFX9-NEXT: v_mov_b32_e32 v3, v44
51 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
52 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
53 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
54 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
55 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
56 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
57 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
58 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
59 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
60 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
61 ; GFX9-NEXT: s_addk_i32 s32, 0xf800
62 ; GFX9-NEXT: s_mov_b32 s33, s4
63 ; GFX9-NEXT: s_waitcnt vmcnt(0)
64 ; GFX9-NEXT: s_setpc_b64 s[30:31]
66 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
67 ; GFX10: ; %bb.0: ; %main_body
68 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69 ; GFX10-NEXT: s_mov_b32 s4, s33
70 ; GFX10-NEXT: s_mov_b32 s33, s32
71 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1
72 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
73 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
74 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
75 ; GFX10-NEXT: v_mov_b32_e32 v36, v16
76 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
77 ; GFX10-NEXT: v_mov_b32_e32 v34, v14
78 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
79 ; GFX10-NEXT: v_mov_b32_e32 v32, v12
80 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
81 ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
82 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
83 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
84 ; GFX10-NEXT: ;;#ASMSTART
85 ; GFX10-NEXT: ;;#ASMEND
86 ; GFX10-NEXT: ;;#ASMSTART
87 ; GFX10-NEXT: ;;#ASMEND
88 ; GFX10-NEXT: ;;#ASMSTART
89 ; GFX10-NEXT: ;;#ASMEND
90 ; GFX10-NEXT: ;;#ASMSTART
91 ; GFX10-NEXT: ;;#ASMEND
92 ; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
93 ; GFX10-NEXT: s_addk_i32 s32, 0x400
94 ; GFX10-NEXT: v_writelane_b32 v40, s4, 2
95 ; GFX10-NEXT: s_getpc_b64 s[4:5]
96 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
97 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
98 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
99 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0
100 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1
101 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
103 ; GFX10-NEXT: v_mov_b32_e32 v0, v41
104 ; GFX10-NEXT: v_mov_b32_e32 v1, v42
105 ; GFX10-NEXT: v_mov_b32_e32 v2, v43
106 ; GFX10-NEXT: v_mov_b32_e32 v3, v44
107 ; GFX10-NEXT: s_clause 0x3
108 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33
109 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
110 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
111 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
112 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
113 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
114 ; GFX10-NEXT: v_readlane_b32 s4, v40, 2
115 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1
116 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
117 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
118 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
119 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00
120 ; GFX10-NEXT: s_mov_b32 s33, s4
121 ; GFX10-NEXT: s_waitcnt vmcnt(0)
122 ; GFX10-NEXT: s_setpc_b64 s[30:31]
124 ; GFX11-LABEL: non_preserved_vgpr_tuple8:
125 ; GFX11: ; %bb.0: ; %main_body
126 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX11-NEXT: s_mov_b32 s0, s33
128 ; GFX11-NEXT: s_mov_b32 s33, s32
129 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
130 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
131 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
132 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
133 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
134 ; GFX11-NEXT: v_mov_b32_e32 v32, v12
135 ; GFX11-NEXT: s_clause 0x3
136 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
137 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8
138 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4
139 ; GFX11-NEXT: scratch_store_b32 off, v44, s33
140 ; GFX11-NEXT: ;;#ASMSTART
141 ; GFX11-NEXT: ;;#ASMEND
142 ; GFX11-NEXT: ;;#ASMSTART
143 ; GFX11-NEXT: ;;#ASMEND
144 ; GFX11-NEXT: ;;#ASMSTART
145 ; GFX11-NEXT: ;;#ASMEND
146 ; GFX11-NEXT: ;;#ASMSTART
147 ; GFX11-NEXT: ;;#ASMEND
148 ; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
149 ; GFX11-NEXT: s_add_i32 s32, s32, 32
150 ; GFX11-NEXT: v_writelane_b32 v40, s0, 2
151 ; GFX11-NEXT: s_getpc_b64 s[0:1]
152 ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
153 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
154 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
155 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0
156 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1
157 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
159 ; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42
160 ; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44
161 ; GFX11-NEXT: s_clause 0x3
162 ; GFX11-NEXT: scratch_load_b32 v44, off, s33
163 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4
164 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8
165 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12
166 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
167 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
168 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
169 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
170 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
171 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
172 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0
173 ; GFX11-NEXT: s_mov_b32 s33, s0
174 ; GFX11-NEXT: s_waitcnt vmcnt(0)
175 ; GFX11-NEXT: s_setpc_b64 s[30:31]
187 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
188 call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
189 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
190 call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
191 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
192 call void @extern_func()
196 define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
197 ; The vgpr tuple8 operand of the image_gather4_c_b_cl instruction needs to be preserved
198 ; across the call and should get allocated to 8 CSRs.
199 ; Only the lower 5 sub-registers of the tuple are preserved.
200 ; The upper 3 sub-registers are unused.
201 ; GFX9-LABEL: call_preserved_vgpr_tuple8:
202 ; GFX9: ; %bb.0: ; %main_body
203 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX9-NEXT: s_mov_b32 s4, s33
205 ; GFX9-NEXT: s_mov_b32 s33, s32
206 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
207 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
208 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
209 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
210 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
211 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
212 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
213 ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
214 ; GFX9-NEXT: v_mov_b32_e32 v45, v16
215 ; GFX9-NEXT: v_mov_b32_e32 v44, v15
216 ; GFX9-NEXT: v_mov_b32_e32 v43, v14
217 ; GFX9-NEXT: v_mov_b32_e32 v42, v13
218 ; GFX9-NEXT: v_mov_b32_e32 v41, v12
219 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
220 ; GFX9-NEXT: s_addk_i32 s32, 0x800
221 ; GFX9-NEXT: v_writelane_b32 v40, s4, 2
222 ; GFX9-NEXT: s_getpc_b64 s[4:5]
223 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
224 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
225 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
226 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0
227 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1
228 ; GFX9-NEXT: s_waitcnt vmcnt(0)
229 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
230 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
232 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
234 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
235 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
236 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
237 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
238 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
239 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
240 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
241 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
242 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
243 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
244 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
245 ; GFX9-NEXT: s_addk_i32 s32, 0xf800
246 ; GFX9-NEXT: s_mov_b32 s33, s4
247 ; GFX9-NEXT: s_waitcnt vmcnt(0)
248 ; GFX9-NEXT: s_setpc_b64 s[30:31]
250 ; GFX10-LABEL: call_preserved_vgpr_tuple8:
251 ; GFX10: ; %bb.0: ; %main_body
252 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253 ; GFX10-NEXT: s_mov_b32 s4, s33
254 ; GFX10-NEXT: s_mov_b32 s33, s32
255 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1
256 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
257 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
259 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
260 ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
261 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
262 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
263 ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
264 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
265 ; GFX10-NEXT: s_addk_i32 s32, 0x400
266 ; GFX10-NEXT: v_writelane_b32 v40, s4, 2
267 ; GFX10-NEXT: s_getpc_b64 s[4:5]
268 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
269 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
270 ; GFX10-NEXT: v_mov_b32_e32 v41, v16
271 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
272 ; GFX10-NEXT: v_mov_b32_e32 v42, v15
273 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0
274 ; GFX10-NEXT: v_mov_b32_e32 v43, v14
275 ; GFX10-NEXT: v_mov_b32_e32 v44, v13
276 ; GFX10-NEXT: v_mov_b32_e32 v45, v12
277 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1
278 ; GFX10-NEXT: s_waitcnt vmcnt(0)
279 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
280 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
282 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
283 ; GFX10-NEXT: s_clause 0x4
284 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33
285 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
286 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
287 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
288 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
289 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
290 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
291 ; GFX10-NEXT: v_readlane_b32 s4, v40, 2
292 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1
293 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
294 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
295 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
296 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00
297 ; GFX10-NEXT: s_mov_b32 s33, s4
298 ; GFX10-NEXT: s_waitcnt vmcnt(0)
299 ; GFX10-NEXT: s_setpc_b64 s[30:31]
301 ; GFX11-LABEL: call_preserved_vgpr_tuple8:
302 ; GFX11: ; %bb.0: ; %main_body
303 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX11-NEXT: s_mov_b32 s0, s33
305 ; GFX11-NEXT: s_mov_b32 s33, s32
306 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
307 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill
308 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
309 ; GFX11-NEXT: s_clause 0x4
310 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16
311 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12
312 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8
313 ; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4
314 ; GFX11-NEXT: scratch_store_b32 off, v45, s33
315 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
316 ; GFX11-NEXT: s_add_i32 s32, s32, 32
317 ; GFX11-NEXT: v_writelane_b32 v40, s0, 2
318 ; GFX11-NEXT: s_getpc_b64 s[0:1]
319 ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
320 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
321 ; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
322 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
323 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0
324 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
325 ; GFX11-NEXT: v_mov_b32_e32 v45, v12
326 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1
327 ; GFX11-NEXT: s_waitcnt vmcnt(0)
328 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
331 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
332 ; GFX11-NEXT: s_clause 0x4
333 ; GFX11-NEXT: scratch_load_b32 v45, off, s33
334 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4
335 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8
336 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12
337 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16
338 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
339 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
340 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
341 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
342 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload
343 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
344 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0
345 ; GFX11-NEXT: s_mov_b32 s33, s0
346 ; GFX11-NEXT: s_waitcnt vmcnt(0)
347 ; GFX11-NEXT: s_setpc_b64 s[30:31]
356 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
357 store <4 x float> %v, ptr addrspace(1) undef
358 call void @extern_func()
359 %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
363 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 ; image gather intrinsic called by both test functions with the five float arguments %bias, %zcompare, %s, %t, %clamp
365 attributes #0 = { nounwind writeonly } ; used on the empty inline-asm calls that clobber v0-v31
366 attributes #1 = { nounwind readonly } ; used on the image gather intrinsic declaration
367 attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } ; used on the @extern_func declaration