; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; CHECK-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel_scavenge_fail
define amdgpu_kernel void @test_sgpr_offset_kernel_scavenge_fail() #1 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Pin 8 SGPR values so the scavenger has no free SGPR for the offset
  ; (#1 limits the function to 17 SGPRs).
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; With no scavengeable SGPR, the offset is added to an in-use SGPR and
  ; subtracted again after the reload.
  ; CHECK: s_add_u32 s7, s7, 0x40000
  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Reload
  ; CHECK: s_sub_u32 s7, s7, 0x40000

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)

  ret void
}
; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Pin 8 SGPR values so the scavenger has no free SGPR for the offset
  ; (#2 limits the function to 14 SGPRs).
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; With no scavengeable SGPR, the offset is added to s32 and subtracted
  ; again around the spill.
  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s32, s32, 0x40000
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  ; Re-pin all SGPRs before the reload as well.
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; CHECK: s_sub_u32 s32, s32, 0x40000

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)

  ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
attributes #0 = { nounwind }
; Restrict register budgets so the scavenge-fail tests really run out of SGPRs.
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }