1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s
5 ; Test that the VGPR spiller correctly switches to SGPR offsets when the
6 ; instruction offset field would overflow, and that it accounts for memory
9 define amdgpu_kernel void @test_inst_offset_kernel() {
10 ; MUBUF-LABEL: test_inst_offset_kernel:
11 ; MUBUF: ; %bb.0: ; %entry
12 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
13 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
14 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
15 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
16 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
17 ; MUBUF-NEXT: ;;#ASMSTART
18 ; MUBUF-NEXT: ;;#ASMEND
19 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
20 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
21 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
22 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
23 ; MUBUF-NEXT: s_endpgm
25 ; FLATSCR-LABEL: test_inst_offset_kernel:
26 ; FLATSCR: ; %bb.0: ; %entry
27 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
28 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
29 ; FLATSCR-NEXT: s_mov_b32 s0, 0
30 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
31 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
32 ; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
33 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
34 ; FLATSCR-NEXT: ;;#ASMSTART
35 ; FLATSCR-NEXT: ;;#ASMEND
36 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
37 ; FLATSCR-NEXT: s_mov_b32 s0, 0
38 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
39 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
41 ; FLATSCR-NEXT: s_endpgm
43 ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
44 ; the instruction offset field.
45 %alloca = alloca i8, i32 4088, align 4, addrspace(5)
47 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
50 %a = load volatile i32, ptr addrspace(5) %aptr
53 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
55 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
56 store volatile i32 %a, ptr addrspace(5) %outptr
61 define amdgpu_kernel void @test_sgpr_offset_kernel() {
62 ; MUBUF-LABEL: test_sgpr_offset_kernel:
63 ; MUBUF: ; %bb.0: ; %entry
64 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
65 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
66 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
67 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
68 ; MUBUF-NEXT: s_mov_b32 s4, 0x40000
69 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
70 ; MUBUF-NEXT: ;;#ASMSTART
71 ; MUBUF-NEXT: ;;#ASMEND
72 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
73 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
74 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
75 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
76 ; MUBUF-NEXT: s_endpgm
78 ; FLATSCR-LABEL: test_sgpr_offset_kernel:
79 ; FLATSCR: ; %bb.0: ; %entry
80 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
81 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
82 ; FLATSCR-NEXT: s_mov_b32 s0, 0
83 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
84 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
85 ; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
86 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
87 ; FLATSCR-NEXT: ;;#ASMSTART
88 ; FLATSCR-NEXT: ;;#ASMEND
89 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
90 ; FLATSCR-NEXT: s_mov_b32 s0, 0
91 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
92 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
93 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
94 ; FLATSCR-NEXT: s_endpgm
96 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
97 ; fit in the instruction, and has to live in the SGPR offset.
98 %alloca = alloca i8, i32 4092, align 4, addrspace(5)
100 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
101 ; 0x40000 / 64 = 4096 (for wave64)
102 %a = load volatile i32, ptr addrspace(5) %aptr
104 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
106 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
107 store volatile i32 %a, ptr addrspace(5) %outptr
112 define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
113 ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
114 ; MUBUF: ; %bb.0: ; %entry
115 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; MUBUF-NEXT: ;;#ASMSTART
117 ; MUBUF-NEXT: ;;#ASMEND
118 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
119 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
120 ; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100
121 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
122 ; MUBUF-NEXT: ;;#ASMSTART
123 ; MUBUF-NEXT: ;;#ASMEND
124 ; MUBUF-NEXT: ;;#ASMSTART
125 ; MUBUF-NEXT: ;;#ASMEND
126 ; MUBUF-NEXT: ;;#ASMSTART
127 ; MUBUF-NEXT: ;;#ASMEND
128 ; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100
129 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
130 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
131 ; MUBUF-NEXT: ;;#ASMSTART
132 ; MUBUF-NEXT: ;;#ASMEND
133 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
135 ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
136 ; FLATSCR: ; %bb.0: ; %entry
137 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; FLATSCR-NEXT: ;;#ASMSTART
139 ; FLATSCR-NEXT: ;;#ASMEND
140 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
141 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
142 ; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
143 ; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
144 ; FLATSCR-NEXT: ;;#ASMSTART
145 ; FLATSCR-NEXT: ;;#ASMEND
146 ; FLATSCR-NEXT: ;;#ASMSTART
147 ; FLATSCR-NEXT: ;;#ASMEND
148 ; FLATSCR-NEXT: ;;#ASMSTART
149 ; FLATSCR-NEXT: ;;#ASMEND
150 ; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
151 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
152 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
153 ; FLATSCR-NEXT: ;;#ASMSTART
154 ; FLATSCR-NEXT: ;;#ASMEND
155 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
157 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
158 ; fit in the instruction, and has to live in the SGPR offset.
159 %alloca = alloca i8, i32 4096, align 4, addrspace(5)
161 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
163 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
164 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
165 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
166 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
167 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
168 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
169 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
170 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
171 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
173 ; 0x40100 / 64 = 4100 (for wave64)
174 %a = load volatile i32, ptr addrspace(5) %aptr
175 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
177 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
178 %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
179 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
180 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
181 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
182 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
183 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
184 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
185 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
187 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
188 ; Force %a to spill with no free SGPRs
189 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
193 define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
194 ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
195 ; MUBUF: ; %bb.0: ; %entry
196 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
197 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
198 ; MUBUF-NEXT: ;;#ASMSTART
199 ; MUBUF-NEXT: ;;#ASMEND
200 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
201 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
202 ; MUBUF-NEXT: s_mov_b32 s10, 0x40100
203 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
204 ; MUBUF-NEXT: ;;#ASMSTART
205 ; MUBUF-NEXT: ;;#ASMEND
206 ; MUBUF-NEXT: ;;#ASMSTART
207 ; MUBUF-NEXT: ;;#ASMEND
208 ; MUBUF-NEXT: ;;#ASMSTART
209 ; MUBUF-NEXT: ;;#ASMEND
210 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
211 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
212 ; MUBUF-NEXT: ;;#ASMSTART
213 ; MUBUF-NEXT: ;;#ASMEND
214 ; MUBUF-NEXT: s_endpgm
216 ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
217 ; FLATSCR: ; %bb.0: ; %entry
218 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
219 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
220 ; FLATSCR-NEXT: s_mov_b32 s8, 0
221 ; FLATSCR-NEXT: ;;#ASMSTART
222 ; FLATSCR-NEXT: ;;#ASMEND
223 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 offset:8 glc
224 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
225 ; FLATSCR-NEXT: s_movk_i32 s8, 0x1004
226 ; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
227 ; FLATSCR-NEXT: ;;#ASMSTART
228 ; FLATSCR-NEXT: ;;#ASMEND
229 ; FLATSCR-NEXT: ;;#ASMSTART
230 ; FLATSCR-NEXT: ;;#ASMEND
231 ; FLATSCR-NEXT: ;;#ASMSTART
232 ; FLATSCR-NEXT: ;;#ASMEND
233 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
234 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
235 ; FLATSCR-NEXT: ;;#ASMSTART
236 ; FLATSCR-NEXT: ;;#ASMEND
237 ; FLATSCR-NEXT: s_endpgm
239 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
240 ; fit in the instruction, and has to live in the SGPR offset.
241 %alloca = alloca i8, i32 4096, align 4, addrspace(5)
243 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
245 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
246 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
247 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
248 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
249 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
250 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
251 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
252 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
253 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
255 ; 0x40100 / 64 = 4100 (for wave64)
256 %a = load volatile i32, ptr addrspace(5) %aptr
257 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
259 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
260 %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
261 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
262 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
263 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
264 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
265 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
266 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
267 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
269 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
270 ; Force %a to spill with no free SGPRs
271 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
275 define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
276 ; MUBUF-LABEL: test_sgpr_offset_subregs_kernel:
277 ; MUBUF: ; %bb.0: ; %entry
278 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
279 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
280 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
281 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
282 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
283 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
284 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
285 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
286 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
287 ; MUBUF-NEXT: ;;#ASMSTART
288 ; MUBUF-NEXT: ;;#ASMEND
289 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
290 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
291 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
292 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
293 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
294 ; MUBUF-NEXT: ;;#ASMSTART
295 ; MUBUF-NEXT: ; v[0:1]
296 ; MUBUF-NEXT: ;;#ASMEND
297 ; MUBUF-NEXT: s_endpgm
299 ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel:
300 ; FLATSCR: ; %bb.0: ; %entry
301 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
302 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
303 ; FLATSCR-NEXT: s_mov_b32 s0, 0
304 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
305 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
306 ; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
307 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
308 ; FLATSCR-NEXT: s_mov_b32 s0, 0
309 ; FLATSCR-NEXT: ;;#ASMSTART
310 ; FLATSCR-NEXT: ;;#ASMEND
311 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
312 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
313 ; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
314 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
315 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
316 ; FLATSCR-NEXT: ;;#ASMSTART
317 ; FLATSCR-NEXT: ; v[0:1]
318 ; FLATSCR-NEXT: ;;#ASMEND
319 ; FLATSCR-NEXT: s_endpgm
321 ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
322 ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
323 ; the instruction offset field.
324 %alloca = alloca i8, i32 4084, align 4, addrspace(5)
325 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
326 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
329 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
331 ; Ensure the alloca sticks around.
332 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
333 %b = load volatile i32, ptr addrspace(5) %bptr
335 ; Ensure the spill is of the full super-reg.
336 call void asm sideeffect "; $0", "r"(<2 x i32> %a)
341 define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
342 ; MUBUF-LABEL: test_inst_offset_subregs_kernel:
343 ; MUBUF: ; %bb.0: ; %entry
344 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
345 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
346 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
347 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
348 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
349 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
350 ; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
351 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
352 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
353 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
354 ; MUBUF-NEXT: ;;#ASMSTART
355 ; MUBUF-NEXT: ;;#ASMEND
356 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
357 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
358 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
359 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
360 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
361 ; MUBUF-NEXT: ;;#ASMSTART
362 ; MUBUF-NEXT: ; v[0:1]
363 ; MUBUF-NEXT: ;;#ASMEND
364 ; MUBUF-NEXT: s_endpgm
366 ; FLATSCR-LABEL: test_inst_offset_subregs_kernel:
367 ; FLATSCR: ; %bb.0: ; %entry
368 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
369 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
370 ; FLATSCR-NEXT: s_mov_b32 s0, 0
371 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
372 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
373 ; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
374 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
375 ; FLATSCR-NEXT: s_mov_b32 s0, 0
376 ; FLATSCR-NEXT: ;;#ASMSTART
377 ; FLATSCR-NEXT: ;;#ASMEND
378 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
379 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
380 ; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
381 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
382 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
383 ; FLATSCR-NEXT: ;;#ASMSTART
384 ; FLATSCR-NEXT: ; v[0:1]
385 ; FLATSCR-NEXT: ;;#ASMEND
386 ; FLATSCR-NEXT: s_endpgm
388 ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
389 ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
390 ; in the SGPR offset.
391 %alloca = alloca i8, i32 4088, align 4, addrspace(5)
393 ; 0x3ff00 / 64 = 4092 (for wave64)
394 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
395 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
398 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
400 ; Ensure the alloca sticks around.
401 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
402 %b = load volatile i32, ptr addrspace(5) %bptr
404 ; Ensure the spill is of the full super-reg.
405 call void asm sideeffect "; $0", "r"(<2 x i32> %a)
410 define void @test_inst_offset_function() {
411 ; MUBUF-LABEL: test_inst_offset_function:
412 ; MUBUF: ; %bb.0: ; %entry
413 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
415 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
416 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
417 ; MUBUF-NEXT: ;;#ASMSTART
418 ; MUBUF-NEXT: ;;#ASMEND
419 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
420 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
421 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
422 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
423 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
425 ; FLATSCR-LABEL: test_inst_offset_function:
426 ; FLATSCR: ; %bb.0: ; %entry
427 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
429 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
430 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
431 ; FLATSCR-NEXT: ;;#ASMSTART
432 ; FLATSCR-NEXT: ;;#ASMEND
433 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
434 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
435 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4
436 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
437 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
439 ; Occupy enough bytes of scratch, so the offset of the spill of %a
440 ; just fits in the instruction offset field when the emergency stack
441 ; slot is added. It's hard to hit the actual limit since we're also
442 ; going to insert the emergency stack slot for large frames.
443 %alloca = alloca i8, i32 4088, align 4, addrspace(5)
445 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
448 %a = load volatile i32, ptr addrspace(5) %aptr
451 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
453 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
454 store volatile i32 %a, ptr addrspace(5) %outptr
459 define void @test_sgpr_offset_function() {
460 ; MUBUF-LABEL: test_sgpr_offset_function:
461 ; MUBUF: ; %bb.0: ; %entry
462 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
464 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
465 ; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
466 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
467 ; MUBUF-NEXT: ;;#ASMSTART
468 ; MUBUF-NEXT: ;;#ASMEND
469 ; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
470 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
471 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
472 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
473 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
474 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
476 ; FLATSCR-LABEL: test_sgpr_offset_function:
477 ; FLATSCR: ; %bb.0: ; %entry
478 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
480 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
481 ; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
482 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
483 ; FLATSCR-NEXT: ;;#ASMSTART
484 ; FLATSCR-NEXT: ;;#ASMEND
485 ; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
486 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
487 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
488 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8
489 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
490 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
492 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
493 ; fit in the instruction, and has to live in the SGPR offset.
494 %alloca = alloca i8, i32 4096, align 4, addrspace(5)
496 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
497 ; 0x40100 / 64 = 4100 (for wave64)
498 %a = load volatile i32, ptr addrspace(5) %aptr
501 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
503 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
504 store volatile i32 %a, ptr addrspace(5) %outptr
509 define void @test_sgpr_offset_subregs_function() {
510 ; MUBUF-LABEL: test_sgpr_offset_subregs_function:
511 ; MUBUF: ; %bb.0: ; %entry
512 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
514 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
515 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
516 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
517 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
518 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
519 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
520 ; MUBUF-NEXT: ;;#ASMSTART
521 ; MUBUF-NEXT: ;;#ASMEND
522 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
523 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
524 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
525 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
526 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
527 ; MUBUF-NEXT: ;;#ASMSTART
528 ; MUBUF-NEXT: ; v[0:1]
529 ; MUBUF-NEXT: ;;#ASMEND
530 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
532 ; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
533 ; FLATSCR: ; %bb.0: ; %entry
534 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
536 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
537 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
538 ; FLATSCR-NEXT: ;;#ASMSTART
539 ; FLATSCR-NEXT: ;;#ASMEND
540 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
541 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
542 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
543 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
544 ; FLATSCR-NEXT: ;;#ASMSTART
545 ; FLATSCR-NEXT: ; v[0:1]
546 ; FLATSCR-NEXT: ;;#ASMEND
547 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
549 ; We want to test the spill of the last subreg of %a is the highest
550 ; valid value for the immediate offset. We enable the emergency
551 ; stack slot for large frames, so it's hard to get the frame layout
552 ; exactly as we want to test it.
553 ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
554 ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
555 ; the instruction offset field.
556 %alloca = alloca i8, i32 4084, align 4, addrspace(5)
557 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
558 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
561 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
563 ; Ensure the alloca sticks around.
564 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
565 %b = load volatile i32, ptr addrspace(5) %bptr
567 ; Ensure the spill is of the full super-reg.
568 call void asm sideeffect "; $0", "r"(<2 x i32> %a)
573 define void @test_inst_offset_subregs_function() {
574 ; MUBUF-LABEL: test_inst_offset_subregs_function:
575 ; MUBUF: ; %bb.0: ; %entry
576 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
578 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
579 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
580 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
581 ; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
582 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
583 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
584 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
585 ; MUBUF-NEXT: ;;#ASMSTART
586 ; MUBUF-NEXT: ;;#ASMEND
587 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
588 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
589 ; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
590 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
591 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
592 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
593 ; MUBUF-NEXT: ;;#ASMSTART
594 ; MUBUF-NEXT: ; v[0:1]
595 ; MUBUF-NEXT: ;;#ASMEND
596 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
598 ; FLATSCR-LABEL: test_inst_offset_subregs_function:
599 ; FLATSCR: ; %bb.0: ; %entry
600 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
602 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
603 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
604 ; FLATSCR-NEXT: ;;#ASMSTART
605 ; FLATSCR-NEXT: ;;#ASMEND
606 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
607 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
608 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
609 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
610 ; FLATSCR-NEXT: ;;#ASMSTART
611 ; FLATSCR-NEXT: ; v[0:1]
612 ; FLATSCR-NEXT: ;;#ASMEND
613 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
615 ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
616 ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
617 ; in the SGPR offset.
618 %alloca = alloca i8, i32 4088, align 4, addrspace(5)
620 ; 0x3ff00 / 64 = 4092 (for wave64)
621 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
622 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
625 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
627 ; Ensure the alloca sticks around.
628 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
629 %b = load volatile i32, ptr addrspace(5) %bptr
631 ; Ensure the spill is of the full super-reg.
632 call void asm sideeffect "; $0", "r"(<2 x i32> %a)
637 attributes #0 = { nounwind }
638 attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
639 attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
640 attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }