1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
2 ; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s
4 ; GCN-LABEL: {{^}}max_10_vgprs:
5 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
6 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
7 ; GFX908-NOT: SCRATCH_RSRC
8 ; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
9 ; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
10 ; GFX900: buffer_store_dword v{{[0-9]}},
11 ; GFX900: buffer_store_dword v{{[0-9]}},
12 ; GFX900: buffer_load_dword v{{[0-9]}},
13 ; GFX900: buffer_load_dword v{{[0-9]}},
15 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0
16 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
19 ; GFX900: ScratchSize: 12
20 ; GFX908: ScratchSize: 0
22 ; GCN: NumVGPRsForWavesPerEU: 10
; Kernel compiled with attribute group #0 ("amdgpu-num-vgpr"="10").
; It loads ten i32 values, feeds all ten to an empty inline asm as
; 'v'-constrained inputs, and then stores every one of them again, so all
; ten values are in use at the asm and afterwards.
23 define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
24 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Pointer chain: %p1 is %p indexed by the workitem id; each following
; pointer is a constant element offset from the previous one.
25 %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
26 %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
27 %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
28 %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
29 %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
30 %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
31 %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
32 %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
33 %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
34 %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
; Ten volatile loads, one through each pointer.
35 %v1 = load volatile i32, i32 addrspace(1)* %p1
36 %v2 = load volatile i32, i32 addrspace(1)* %p2
37 %v3 = load volatile i32, i32 addrspace(1)* %p3
38 %v4 = load volatile i32, i32 addrspace(1)* %p4
39 %v5 = load volatile i32, i32 addrspace(1)* %p5
40 %v6 = load volatile i32, i32 addrspace(1)* %p6
41 %v7 = load volatile i32, i32 addrspace(1)* %p7
42 %v8 = load volatile i32, i32 addrspace(1)* %p8
43 %v9 = load volatile i32, i32 addrspace(1)* %p9
44 %v10 = load volatile i32, i32 addrspace(1)* %p10
; Empty asm body; its only purpose here is to consume all ten values at once.
45 call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
; Each loaded value is stored again after the asm (destination is undef).
46 store volatile i32 %v1, i32 addrspace(1)* undef
47 store volatile i32 %v2, i32 addrspace(1)* undef
48 store volatile i32 %v3, i32 addrspace(1)* undef
49 store volatile i32 %v4, i32 addrspace(1)* undef
50 store volatile i32 %v5, i32 addrspace(1)* undef
51 store volatile i32 %v6, i32 addrspace(1)* undef
52 store volatile i32 %v7, i32 addrspace(1)* undef
53 store volatile i32 %v8, i32 addrspace(1)* undef
54 store volatile i32 %v9, i32 addrspace(1)* undef
55 store volatile i32 %v10, i32 addrspace(1)* undef
59 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
60 ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
61 ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
62 ; GFX908: v_accvgpr_write_b32 a9, v{{[0-9]}}
63 ; GFX908: buffer_store_dword v{{[0-9]}},
65 ; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9
66 ; GFX908: buffer_load_dword v{{[0-9]}},
69 ; GFX900: couldn't allocate input reg for constraint 'a'
71 ; GFX908: NumVgprs: 10
72 ; GFX908: ScratchSize: 8
73 ; GFX908: VGPRBlocks: 2
74 ; GFX908: NumVGPRsForWavesPerEU: 10
; Same load / asm / store pattern as @max_10_vgprs, also under attribute
; group #0, but preceded by an empty inline asm that takes nine constant
; inputs with the 'a' constraint. The gfx900 RUN line expects that asm to
; be rejected ("couldn't allocate input reg for constraint 'a'").
75 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
76 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Nine 'a'-constrained inputs holding the constants 1 through 9.
77 call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
; Pointer chain: %p1 is %p indexed by the workitem id; each following
; pointer is a constant element offset from the previous one.
78 %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
79 %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
80 %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
81 %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
82 %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
83 %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
84 %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
85 %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
86 %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
87 %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
; Ten volatile loads, one through each pointer.
88 %v1 = load volatile i32, i32 addrspace(1)* %p1
89 %v2 = load volatile i32, i32 addrspace(1)* %p2
90 %v3 = load volatile i32, i32 addrspace(1)* %p3
91 %v4 = load volatile i32, i32 addrspace(1)* %p4
92 %v5 = load volatile i32, i32 addrspace(1)* %p5
93 %v6 = load volatile i32, i32 addrspace(1)* %p6
94 %v7 = load volatile i32, i32 addrspace(1)* %p7
95 %v8 = load volatile i32, i32 addrspace(1)* %p8
96 %v9 = load volatile i32, i32 addrspace(1)* %p9
97 %v10 = load volatile i32, i32 addrspace(1)* %p10
; Empty asm consuming all ten loaded values as 'v'-constrained inputs.
98 call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
; Each loaded value is stored again after the asm (destination is undef).
99 store volatile i32 %v1, i32 addrspace(1)* undef
100 store volatile i32 %v2, i32 addrspace(1)* undef
101 store volatile i32 %v3, i32 addrspace(1)* undef
102 store volatile i32 %v4, i32 addrspace(1)* undef
103 store volatile i32 %v5, i32 addrspace(1)* undef
104 store volatile i32 %v6, i32 addrspace(1)* undef
105 store volatile i32 %v7, i32 addrspace(1)* undef
106 store volatile i32 %v8, i32 addrspace(1)* undef
107 store volatile i32 %v9, i32 addrspace(1)* undef
108 store volatile i32 %v10, i32 addrspace(1)* undef
112 ; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill:
113 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
114 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
115 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1
116 ; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
117 ; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
118 ; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
119 ; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
120 ; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
121 ; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
122 ; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
123 ; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
124 ; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
125 ; GFX900: buffer_store_dword v{{[0-9]}},
126 ; GCN-DAG: buffer_store_dword v{{[0-9]}},
127 ; GFX900: buffer_load_dword v{{[0-9]}},
128 ; GCN-DAG: buffer_load_dword v{{[0-9]}},
129 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
130 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
131 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
132 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
133 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
134 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
135 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
136 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
137 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
140 ; GFX900: ScratchSize: 44
141 ; GFX908: ScratchSize: 20
143 ; GCN: NumVGPRsForWavesPerEU: 10
; i64 variant under attribute group #0: a single 'a'-constrained asm input
; comes first, then five i64 values are loaded, passed together to an empty
; 'v'-constrained asm, and stored back through the same set of pointers in
; rotated order.
144 define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 {
145 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; One 'a'-constrained input holding the constant 1.
146 call void asm sideeffect "", "a"(i32 1)
; Pointer chain: %p1 is %p indexed by the workitem id; each following
; pointer is a constant element offset from the previous one.
147 %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid
148 %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8
149 %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16
150 %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24
151 %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32
; Five volatile i64 loads, one through each pointer.
152 %v1 = load volatile i64, i64 addrspace(1)* %p1
153 %v2 = load volatile i64, i64 addrspace(1)* %p2
154 %v3 = load volatile i64, i64 addrspace(1)* %p3
155 %v4 = load volatile i64, i64 addrspace(1)* %p4
156 %v5 = load volatile i64, i64 addrspace(1)* %p5
; Empty asm consuming all five i64 values as 'v'-constrained inputs.
157 call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5)
; Stores are rotated by one: %vN goes to the next pointer in the chain and
; %v5 wraps around to %p1, so the pointers are still used after the asm.
158 store volatile i64 %v1, i64 addrspace(1)* %p2
159 store volatile i64 %v2, i64 addrspace(1)* %p3
160 store volatile i64 %v3, i64 addrspace(1)* %p4
161 store volatile i64 %v4, i64 addrspace(1)* %p5
162 store volatile i64 %v5, i64 addrspace(1)* %p1
166 ; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
167 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
168 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
169 ; GFX908-DAG: v_accvgpr_write_b32 a0,
170 ; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
172 ; GCN: buffer_store_dword v{{[0-9]}},
174 ; GFX908: NumVgprs: 10
175 ; GFX900: ScratchSize: 100
176 ; GFX908: ScratchSize: 68
177 ; GFX908: VGPRBlocks: 2
178 ; GFX908: NumVGPRsForWavesPerEU: 10
; Under attribute group #0, loads a single <32 x float> from %p indexed by
; the workitem id and stores it again. The vector has 32 elements while the
; attribute sets "amdgpu-num-vgpr" to "10".
179 define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
180 %tid = call i32 @llvm.amdgcn.workitem.id.x()
181 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
; One volatile 32-element load followed by one volatile store to undef.
182 %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
183 store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
187 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
188 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
189 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
190 ; GFX908-NOT: SCRATCH_RSRC
191 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
192 ; GFX900: buffer_store_dword v
193 ; GFX900: buffer_load_dword v
194 ; GFX908-NOT: buffer_
195 ; GFX908-DAG: v_accvgpr_read_b32
198 ; GFX900: ScratchSize: 148
199 ; GFX908: ScratchSize: 0
200 ; GCN: VGPRBlocks: 63
201 ; GCN: NumVGPRsForWavesPerEU: 256
; No attribute group is attached to this kernel. It loads nine <32 x float>
; values (9 x 32 = 288 float elements) and only then stores them, so all
; nine vectors are still needed when the first store is reached.
202 define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
203 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Pointer chain: every step indexes the previous pointer by the workitem id.
204 %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
205 %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
206 %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
207 %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
208 %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
209 %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
210 %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
211 %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
212 %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
; Nine volatile vector loads, one through each pointer.
213 %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
214 %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
215 %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
216 %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
217 %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
218 %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
219 %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
220 %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
221 %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
; All nine vectors are stored after the last load (destination is undef).
222 store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
223 store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
224 store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
225 store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
226 store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
227 store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
228 store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
229 store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
230 store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
234 ; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
235 ; produces unnecessary copies and we still have some amount
236 ; of conventional spilling.
238 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
239 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
240 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
241 ; GFX908-FIXME-NOT: SCRATCH_RSRC
242 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
243 ; GFX900: buffer_store_dword v
244 ; GFX900: buffer_load_dword v
245 ; GFX908-FIXME-NOT: buffer_
246 ; GFX908-DAG: v_accvgpr_read_b32
249 ; GFX900: ScratchSize: 580
250 ; GFX908-FIXME: ScratchSize: 0
251 ; GCN: VGPRBlocks: 63
252 ; GCN: NumVGPRsForWavesPerEU: 256
; No attribute group is attached to this kernel. It loads nine <32 x float>
; values and then stores all nine, using the same pointer chain as
; @max_256_vgprs_spill_9x32.
253 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
254 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Pointer chain: every step indexes the previous pointer by the workitem id.
255 %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
256 %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
257 %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
258 %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
259 %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
260 %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
261 %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
262 %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
263 %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
; Nine volatile vector loads, one through each pointer.
264 %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
265 %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
266 %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
267 %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
268 %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
269 %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
270 %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
271 %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
272 %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
; All nine vectors are stored after the last load (destination is undef).
276 store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
277 store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
278 store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
279 store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
280 store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
281 store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
282 store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
283 store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
284 store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
; Intrinsic called at the top of every kernel in this file to obtain %tid.
288 declare i32 @llvm.amdgcn.workitem.id.x()
; Attribute group #0, used by the four @max_10_vgprs* kernels: nounwind plus
; the string attribute "amdgpu-num-vgpr" set to "10".
290 attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }