1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; GCN-LABEL: test_inst_offset_kernel
; Spill lands at scratch offset 4092, the last value the checks below show
; being encoded as a plain immediate offset (no SGPR offset register).
define amdgpu_kernel void @test_inst_offset_kernel() {
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled
  ; (the RUN lines restrict the allocator to 8 registers).
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_kernel
; Here the spill lands at offset 4096, one past what the immediate offset
; field holds (see test_inst_offset_kernel), so the checks require the
; offset to be materialized in an SGPR first.
define amdgpu_kernel void @test_sgpr_offset_kernel() {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_mov_b32 s4, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 s2, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.

; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
; Attribute #2 caps this function at 14 SGPRs / 8 VGPRs, and the inline asm
; keeps eight values live in SGPRs across the spill point, so no SGPR can be
; scavenged to hold the spill offset. The checks pin the fallback: for MUBUF
; the offset is added to s32 and then subtracted back; for flat scratch a
; temporary SGPR receives s32 + 0x1000.
define void @test_sgpr_offset_function_scavenge_fail() #2 {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Produce eight SGPR-resident values that stay live across the spill.
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; MUBUF: s_add_i32 s32, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; MUBUF: s_add_i32 s32, s32, 0xfffc0000
  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  ; Re-occupy all eight SGPRs for the reload side as well.
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  ; Clobber v0-v7 so %a cannot stay in a VGPR across this point.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; MUBUF: s_add_i32 s32, s32, 0x40000
  ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; MUBUF: s_add_i32 s32, s32, 0xfffc0000
  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
; GCN-LABEL: test_sgpr_offset_subregs_kernel
; NOTE(review): despite the "sgpr_offset" name, the checks below pin the
; spill of both subregs to plain immediate offsets (4088/4092) in the
; buffer case; the name looks swapped with test_inst_offset_subregs_kernel
; -- confirm against history before renaming.
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_subregs_kernel
; NOTE(review): despite the "inst_offset" name, the checks below require
; the offset to be materialized in an SGPR (s_mov_b32 s4, 0x3ff00); the
; name looks swapped with test_sgpr_offset_subregs_kernel -- confirm
; against history before renaming.
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_mov_b32 s4, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_function
; Non-kernel variant of test_inst_offset_kernel: offset 4092 is still
; encoded as an immediate, applied relative to an SGPR base register.
define void @test_inst_offset_function() {
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_function
; Non-kernel variant of test_sgpr_offset_kernel: offset 4096 does not fit
; in the instruction, so the checks require it to be added to s32 into a
; scratch SGPR first.
define void @test_sgpr_offset_function() {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_add_i32 s4, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_add_i32 s0, s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_subregs_function
; NOTE(review): as with the kernel variant, the "sgpr_offset" name does not
; match the comments/checks below (the spill fits in the immediate offset
; field) -- confirm against history before renaming.
define void @test_sgpr_offset_subregs_function() {
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4088 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_subregs_function
; NOTE(review): as with the kernel variant, the "inst_offset" name does not
; match the comments/checks below (the offset is materialized in an SGPR)
; -- confirm against history before renaming.
define void @test_inst_offset_subregs_function() {
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_add_i32 s4, s32, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
attributes #0 = { nounwind }
; NOTE(review): #1 is not referenced by any function visible in this chunk;
; it may be used by code elsewhere in the file -- verify before removing.
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
; #2 (used by test_sgpr_offset_function_scavenge_fail) caps the register
; budget so the spiller cannot scavenge a free SGPR for the offset.
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }