1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s
5 ; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py
7 ; Define four variables and four non-kernel functions which access exactly one variable each
8 @v0 = addrspace(3) global float poison
9 @v1 = addrspace(3) global i16 poison, align 16
10 @v2 = addrspace(3) global i64 poison
11 @v3 = addrspace(3) global i8 poison
12 @unused = addrspace(3) global i16 poison
14 ; OPT: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 16, !absolute_symbol !0
15 ; OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
16 ; OPT: @llvm.amdgcn.kernel.kernel_no_table.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_no_table.lds.t poison, align 8, !absolute_symbol !0
17 ; OPT: @llvm.amdgcn.kernel.k01.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k01.lds.t poison, align 4, !absolute_symbol !1
18 ; OPT: @llvm.amdgcn.kernel.k23.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k23.lds.t poison, align 8, !absolute_symbol !0
19 ; OPT: @llvm.amdgcn.kernel.k123.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k123.lds.t poison, align 8, !absolute_symbol !2
20 ; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]]
24 ; OPT-NEXT: [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
25 ; OPT-NEXT: [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00
26 ; OPT-NEXT: store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
31 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32 ; GCN-NEXT: v_mov_b32_e32 v0, 0
33 ; GCN-NEXT: s_mov_b32 m0, -1
34 ; GCN-NEXT: ds_read_b32 v1, v0 offset:4
35 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
36 ; GCN-NEXT: v_add_f32_e32 v1, v1, v1
37 ; GCN-NEXT: ds_write_b32 v0, v1 offset:4
38 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
39 ; GCN-NEXT: s_setpc_b64 s[30:31]
40 %ld = load float, ptr addrspace(3) @v0
41 %mul = fmul float %ld, 2.
42 store float %mul, ptr addrspace(3) @v0
48 ; OPT-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
49 ; OPT-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3
50 ; OPT-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
55 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GCN-NEXT: v_mov_b32_e32 v0, 0
57 ; GCN-NEXT: s_mov_b32 m0, -1
58 ; GCN-NEXT: ds_read_u16 v1, v0
59 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
60 ; GCN-NEXT: v_mul_lo_u32 v1, v1, 3
61 ; GCN-NEXT: ds_write_b16 v0, v1
62 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
63 ; GCN-NEXT: s_setpc_b64 s[30:31]
64 %ld = load i16, ptr addrspace(3) @v1
66 store i16 %mul, ptr addrspace(3) @v1
72 ; OPT-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
73 ; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
74 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
75 ; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
76 ; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
77 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4
78 ; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
79 ; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
80 ; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
81 ; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
86 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GCN-NEXT: s_mov_b32 s4, s15
88 ; GCN-NEXT: s_ashr_i32 s5, s15, 31
89 ; GCN-NEXT: s_getpc_b64 s[6:7]
90 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
91 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
92 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
93 ; GCN-NEXT: s_add_u32 s4, s4, s6
94 ; GCN-NEXT: s_addc_u32 s5, s5, s7
95 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
96 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
97 ; GCN-NEXT: v_mov_b32_e32 v2, s4
98 ; GCN-NEXT: s_mov_b32 m0, -1
99 ; GCN-NEXT: ds_read_b64 v[0:1], v2
100 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
101 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
102 ; GCN-NEXT: ds_write_b64 v2, v[0:1]
103 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
104 ; GCN-NEXT: s_setpc_b64 s[30:31]
105 %ld = load i64, ptr addrspace(3) @v2
106 %mul = mul i64 %ld, 4
107 store i64 %mul, ptr addrspace(3) @v2
113 ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
114 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 5
115 ; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
120 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GCN-NEXT: v_mov_b32_e32 v0, 0
122 ; GCN-NEXT: s_mov_b32 m0, -1
123 ; GCN-NEXT: ds_read_u8 v1, v0 offset:8
124 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
125 ; GCN-NEXT: v_mul_lo_u32 v1, v1, 5
126 ; GCN-NEXT: ds_write_b8 v0, v1 offset:8
127 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
128 ; GCN-NEXT: s_setpc_b64 s[30:31]
129 %ld = load i8, ptr addrspace(3) @v3
131 store i8 %mul, ptr addrspace(3) @v3
; Doesn't access any LDS variable via a non-kernel function, so it won't get an entry in the lookup table
136 define amdgpu_kernel void @kernel_no_table() {
137 ; OPT-LABEL: @kernel_no_table(
138 ; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
139 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8
140 ; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
143 ; GCN-LABEL: kernel_no_table:
145 ; GCN-NEXT: v_mov_b32_e32 v2, 0
146 ; GCN-NEXT: s_mov_b32 m0, -1
147 ; GCN-NEXT: ds_read_b64 v[0:1], v2
148 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
149 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
150 ; GCN-NEXT: ds_write_b64 v2, v[0:1]
152 %ld = load i64, ptr addrspace(3) @v2
153 %mul = mul i64 %ld, 8
154 store i64 %mul, ptr addrspace(3) @v2
158 ; Access two variables, will allocate those two
159 define amdgpu_kernel void @k01() {
161 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ]
162 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
163 ; OPT-NEXT: call void @f0()
164 ; OPT-NEXT: call void @f1()
169 ; GCN-NEXT: s_mov_b32 s32, 0
170 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
171 ; GCN-NEXT: s_add_i32 s6, s6, s9
172 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
173 ; GCN-NEXT: s_add_u32 s0, s0, s9
174 ; GCN-NEXT: s_addc_u32 s1, s1, 0
175 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
176 ; GCN-NEXT: s_getpc_b64 s[4:5]
177 ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
178 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
179 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
180 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
181 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
182 ; GCN-NEXT: s_getpc_b64 s[4:5]
183 ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
184 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
185 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
186 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
187 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
194 define amdgpu_kernel void @k23() {
196 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope !4, !noalias !7
197 ; OPT-NEXT: call void @f2()
198 ; OPT-NEXT: call void @f3()
203 ; GCN-NEXT: s_mov_b32 s32, 0
204 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
205 ; GCN-NEXT: s_add_i32 s6, s6, s9
206 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
207 ; GCN-NEXT: s_add_u32 s0, s0, s9
208 ; GCN-NEXT: s_addc_u32 s1, s1, 0
209 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
210 ; GCN-NEXT: s_getpc_b64 s[4:5]
211 ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
212 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
213 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
214 ; GCN-NEXT: s_mov_b32 s15, 1
215 ; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
216 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
217 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
218 ; GCN-NEXT: s_getpc_b64 s[4:5]
219 ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
220 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
221 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
222 ; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
223 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
224 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
231 ; Access and allocate three variables
232 define amdgpu_kernel void @k123() {
234 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !10, !noalias !13
235 ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
236 ; OPT-NEXT: call void @f1()
237 ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10
238 ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8
239 ; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10
240 ; OPT-NEXT: call void @f2()
245 ; GCN-NEXT: s_mov_b32 s32, 0
246 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
247 ; GCN-NEXT: s_add_i32 s6, s6, s9
248 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
249 ; GCN-NEXT: s_add_u32 s0, s0, s9
250 ; GCN-NEXT: s_addc_u32 s1, s1, 0
251 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
252 ; GCN-NEXT: s_getpc_b64 s[4:5]
253 ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
254 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
255 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
256 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
257 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
258 ; GCN-NEXT: v_mov_b32_e32 v0, 0
259 ; GCN-NEXT: s_mov_b32 m0, -1
260 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16
261 ; GCN-NEXT: s_getpc_b64 s[4:5]
262 ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
263 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
264 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
265 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
266 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1
267 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16
268 ; GCN-NEXT: s_mov_b32 s15, 0
269 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
272 %ld = load i8, ptr addrspace(3) @v3
274 store i8 %mul, ptr addrspace(3) @v3
280 ; OPT: declare i32 @llvm.amdgcn.lds.kernel.id()
287 ; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
288 ; OPT: attributes #1 = { "amdgpu-lds-size"="16" }
289 ; OPT: attributes #2 = { "amdgpu-lds-size"="24" }
290 ; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
291 ; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
293 ; OPT: !0 = !{i32 0, i32 1}
294 ; OPT: !1 = !{i32 4, i32 5}
295 ; OPT: !2 = !{i32 8, i32 9}
298 ; OPT: !5 = distinct !{!5, !6}
299 ; OPT: !6 = distinct !{!6}
301 ; OPT: !8 = distinct !{!8, !6}
304 ; OPT: !11 = distinct !{!11, !12}
305 ; OPT: !12 = distinct !{!12}
307 ; OPT: !14 = distinct !{!14, !12}
; Table size is number-of-kernels * number-of-variables * sizeof(uint32_t); here 2 * 1 * 4 = 8 bytes
310 ; GCN: .type llvm.amdgcn.lds.offset.table,@object
311 ; GCN-NEXT: .section .data.rel.ro,#alloc,#write
312 ; GCN-NEXT: .p2align 2, 0x0
313 ; GCN-NEXT: llvm.amdgcn.lds.offset.table:
316 ; GCN-NEXT: .size llvm.amdgcn.lds.offset.table, 8