1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
3 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
4 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s
; Test: a kernel receives an addrspace(1) pointer argument, loads a flat
; pointer from it (indexed by the workitem id), loads a second flat pointer
; through the first, and stores 0.0 through the second.
; The expected IR tags both pointer loads with !amdgpu.noclobber and casts
; each loaded flat pointer to addrspace(1); the expected ISA contains two
; global_load_dwordx2 followed by a global_store_dword.
6 ; GCN-LABEL: ptr_nest_3:
7 ; GCN-COUNT-2: global_load_dwordx2
8 ; GCN: global_store_dword
9 define amdgpu_kernel void @ptr_nest_3(ptr addrspace(1) nocapture readonly %Arg) {
10 ; CHECK-LABEL: @ptr_nest_3(
12 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
13 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
14 ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
15 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
16 ; CHECK-NEXT: [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
17 ; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
18 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
19 ; CHECK-NEXT: ret void
22 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
23 %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
; First level: flat pointer read out of the addrspace(1) argument.
24 %p2 = load ptr, ptr addrspace(1) %p1, align 8
; Second level: flat pointer read through the flat pointer loaded above.
25 %p3 = load ptr, ptr %p2, align 8
26 store float 0.000000e+00, ptr %p3, align 4
; Test: the kernel argument itself is a flat pointer. A pointer is loaded
; from it (indexed by the workitem id) and an i32 zero is stored through the
; loaded pointer.
; The expected IR casts the argument to addrspace(1) up front, tags the
; pointer load with !amdgpu.noclobber, and casts the loaded pointer to
; addrspace(1) before the store.
; NOTE(review): no bitcast instruction is visible in this body; presumably the
; test name predates opaque pointers -- confirm against the file history.
30 ; GCN-LABEL: ptr_bitcast:
31 ; GCN: global_load_dwordx2
32 ; GCN: global_store_dword
33 define amdgpu_kernel void @ptr_bitcast(ptr nocapture readonly %Arg) {
34 ; CHECK-LABEL: @ptr_bitcast(
36 ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
37 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
38 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I]]
39 ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
40 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
41 ; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P2_GLOBAL]], align 4
42 ; CHECK-NEXT: ret void
45 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
46 %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
47 %p2 = load ptr, ptr %p1, align 8
48 store i32 0, ptr %p2, align 4
; Named struct type wrapping a single flat pointer. It is not referenced by
; any instruction in the visible part of this test.
52 %struct.S = type { ptr }
; Test: a flat pointer is loaded directly from the addrspace(1) argument and
; then indexed by the workitem id; 0.0 is stored to the indexed element.
; The expected IR tags the pointer load with !amdgpu.noclobber and casts the
; loaded pointer to addrspace(1) so that the GEP and the store use
; addrspace(1).
54 ; GCN-LABEL: ptr_in_struct:
56 ; GCN: global_store_dword
57 define amdgpu_kernel void @ptr_in_struct(ptr addrspace(1) nocapture readonly %Arg) {
58 ; CHECK-LABEL: @ptr_in_struct(
60 ; CHECK-NEXT: [[P1:%.*]] = load ptr, ptr addrspace(1) [[ARG:%.*]], align 8, !amdgpu.noclobber !0
61 ; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(1)
62 ; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
63 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[P1_GLOBAL]], i32 [[ID]]
64 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[ARRAYIDX]], align 4
65 ; CHECK-NEXT: ret void
68 %p1 = load ptr, ptr addrspace(1) %Arg, align 8
69 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
70 %arrayidx = getelementptr inbounds float, ptr %p1, i32 %id
71 store float 0.000000e+00, ptr %arrayidx, align 4
; Four-element float array in addrspace(3), used as scratch storage by the
; kernels that follow.
75 @LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16
; Test: both kernel arguments are flat, noalias pointers. A pointer is loaded
; from %Arg[workitem id]; four consecutive floats are read through it and
; written to @LDS at indices X..X+3. Then @LDS[X-1] is read back, a pointer is
; loaded from %Out[workitem id], and the value is stored through it at index X.
; The expected IR casts both arguments and both loaded pointers to
; addrspace(1). !amdgpu.noclobber is expected on the two pointer loads and on
; the first float load only; the float loads that follow the first @LDS store
; are expected without it.
77 ; GCN-LABEL: flat_ptr_arg:
78 ; GCN-COUNT-2: global_load_dwordx2
79 ; GCN: global_load_dwordx4
80 ; GCN: global_store_dword
81 define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr nocapture noalias %Out, i32 %X) {
82 ; CHECK-LABEL: @flat_ptr_arg(
84 ; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1)
85 ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
86 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
87 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
88 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
89 ; CHECK-NEXT: [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
90 ; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
91 ; CHECK-NEXT: [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
92 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
93 ; CHECK-NEXT: store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
94 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
95 ; CHECK-NEXT: [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
96 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
97 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
98 ; CHECK-NEXT: store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
99 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
100 ; CHECK-NEXT: [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
101 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
102 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
103 ; CHECK-NEXT: store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
104 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
105 ; CHECK-NEXT: [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
106 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
107 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
108 ; CHECK-NEXT: store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
109 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
110 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
111 ; CHECK-NEXT: [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
112 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[OUT_GLOBAL]], i64 [[IDXPROM]]
113 ; CHECK-NEXT: [[I7:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX11]], align 8, !amdgpu.noclobber !0
114 ; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast ptr [[I7]] to ptr addrspace(1)
115 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
116 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I7_GLOBAL]], i64 [[IDXPROM8]]
117 ; CHECK-NEXT: store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
118 ; CHECK-NEXT: ret void
121 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
122 %idxprom = zext i32 %i to i64
123 %arrayidx10 = getelementptr inbounds ptr, ptr %Arg, i64 %idxprom
; Source pointer: flat pointer read from the flat %Arg array.
124 %i1 = load ptr, ptr %arrayidx10, align 8
; Copy four floats from *%i1 into @LDS[X .. X+3], one element at a time.
125 %i2 = load float, ptr %i1, align 4
126 %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
127 store float %i2, ptr addrspace(3) %arrayidx512, align 4
128 %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
129 %i3 = load float, ptr %arrayidx3.1, align 4
130 %add.1 = add nsw i32 %X, 1
131 %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
132 store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
133 %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
134 %i4 = load float, ptr %arrayidx3.2, align 4
135 %add.2 = add nsw i32 %X, 2
136 %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
137 store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
138 %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
139 %i5 = load float, ptr %arrayidx3.3, align 4
140 %add.3 = add nsw i32 %X, 3
141 %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
142 store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
; Read back @LDS[X-1] and write it through the pointer found in %Out.
143 %sub = add nsw i32 %X, -1
144 %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
145 %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
146 %arrayidx11 = getelementptr inbounds ptr, ptr %Out, i64 %idxprom
147 %i7 = load ptr, ptr %arrayidx11, align 8
148 %idxprom8 = sext i32 %X to i64
149 %arrayidx9 = getelementptr inbounds float, ptr %i7, i64 %idxprom8
150 store float %i6, ptr %arrayidx9, align 4
; Test: the argument is already an addrspace(1) pointer. A flat pointer is
; loaded from %Arg[workitem id]; four consecutive floats are read through it
; into @LDS[X..X+3]; @LDS[X-1] is then read back and stored through the same
; loaded pointer at index X.
; The expected IR tags the pointer load and the first float load with
; !amdgpu.noclobber (the later float loads appear without it) and rewrites
; every access through the loaded pointer to addrspace(1).
154 ; GCN-LABEL: global_ptr_arg:
155 ; GCN: global_load_dwordx2
156 ; GCN: global_load_dwordx4
157 ; GCN: global_store_dword
158 define amdgpu_kernel void @global_ptr_arg(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
159 ; CHECK-LABEL: @global_ptr_arg(
161 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
162 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
163 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
164 ; CHECK-NEXT: [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
165 ; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
166 ; CHECK-NEXT: [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
167 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
168 ; CHECK-NEXT: store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
169 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
170 ; CHECK-NEXT: [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
171 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
172 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
173 ; CHECK-NEXT: store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
174 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
175 ; CHECK-NEXT: [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
176 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
177 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
178 ; CHECK-NEXT: store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
179 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
180 ; CHECK-NEXT: [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
181 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
182 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
183 ; CHECK-NEXT: store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
184 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
185 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
186 ; CHECK-NEXT: [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
187 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
188 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
189 ; CHECK-NEXT: store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
190 ; CHECK-NEXT: ret void
193 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
194 %idxprom = zext i32 %i to i64
195 %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
; Flat pointer read from the addrspace(1) argument; all later float accesses
; in this kernel go through it.
196 %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
197 %i2 = load float, ptr %i1, align 4
198 %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
199 store float %i2, ptr addrspace(3) %arrayidx512, align 4
200 %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
201 %i3 = load float, ptr %arrayidx3.1, align 4
202 %add.1 = add nsw i32 %X, 1
203 %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
204 store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
205 %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
206 %i4 = load float, ptr %arrayidx3.2, align 4
207 %add.2 = add nsw i32 %X, 2
208 %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
209 store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
210 %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
211 %i5 = load float, ptr %arrayidx3.3, align 4
212 %add.3 = add nsw i32 %X, 3
213 %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
214 store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
215 %sub = add nsw i32 %X, -1
216 %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
217 %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
218 %idxprom8 = sext i32 %X to i64
219 %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
220 store float %i6, ptr %arrayidx9, align 4
; Negative test: a null pointer is stored to %arrayidx10 + X *before* the
; pointer is loaded from %arrayidx10.
; The expected IR is unchanged from the input: the pointer load carries no
; !amdgpu.noclobber, the loaded pointer stays flat, and the float load and
; final store use the flat pointer. The expected ISA accordingly uses
; flat_load_dword / flat_store_dword for those accesses.
224 ; GCN-LABEL: global_ptr_arg_clobbered:
225 ; GCN: global_store_dwordx2
226 ; GCN: global_load_dwordx2
227 ; GCN: flat_load_dword
228 ; GCN: flat_store_dword
229 define amdgpu_kernel void @global_ptr_arg_clobbered(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
230 ; CHECK-LABEL: @global_ptr_arg_clobbered(
232 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
233 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
234 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
235 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
236 ; CHECK-NEXT: store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
237 ; CHECK-NEXT: [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
238 ; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[I1]], align 4
239 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
240 ; CHECK-NEXT: store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
241 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
242 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
243 ; CHECK-NEXT: [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
244 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
245 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 [[IDXPROM8]]
246 ; CHECK-NEXT: store float [[I6]], ptr [[ARRAYIDX9]], align 4
247 ; CHECK-NEXT: ret void
250 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
251 %idxprom = zext i32 %i to i64
252 %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
253 %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
; Store into the argument's memory ahead of the pointer load below; X is a
; runtime value, so the stored-to slot is not a fixed offset from the loaded one.
254 store ptr null, ptr addrspace(1) %arrayidx11, align 4
255 %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
256 %i2 = load float, ptr %i1, align 4
257 %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
258 store float %i2, ptr addrspace(3) %arrayidx512, align 4
259 %sub = add nsw i32 %X, -1
260 %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
261 %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
262 %idxprom8 = sext i32 %X to i64
263 %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
264 store float %i6, ptr %arrayidx9, align 4
; Test: same shape as the clobbered case, but the null-pointer store to
; %arrayidx10 + X happens *after* the pointer has been loaded from %arrayidx10.
; The expected IR tags the pointer load with !amdgpu.noclobber and casts the
; loaded pointer to addrspace(1). The float load that follows the store is
; expected in addrspace(1) but without !amdgpu.noclobber. The expected ISA
; uses global (not flat) load/store instructions throughout.
268 ; GCN-LABEL: global_ptr_arg_clobbered_after_load:
269 ; GCN: global_load_dwordx2
270 ; GCN: global_store_dwordx2
271 ; GCN: global_load_dword
272 ; GCN: global_store_dword
273 define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
274 ; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
276 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
277 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
278 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
279 ; CHECK-NEXT: [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
280 ; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
281 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
282 ; CHECK-NEXT: store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
283 ; CHECK-NEXT: [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4
284 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
285 ; CHECK-NEXT: store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
286 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
287 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
288 ; CHECK-NEXT: [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
289 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
290 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
291 ; CHECK-NEXT: store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
292 ; CHECK-NEXT: ret void
295 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
296 %idxprom = zext i32 %i to i64
297 %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
; The pointer is loaded first ...
298 %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
299 %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
; ... and only afterwards is the argument's memory written.
300 store ptr null, ptr addrspace(1) %arrayidx11, align 4
301 %i2 = load float, ptr %i1, align 4
302 %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
303 store float %i2, ptr addrspace(3) %arrayidx512, align 4
304 %sub = add nsw i32 %X, -1
305 %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
306 %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
307 %idxprom8 = sext i32 %X to i64
308 %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
309 store float %i6, ptr %arrayidx9, align 4
; Test: same two-level pointer chase as @ptr_nest_3, with a call to
; @llvm.amdgcn.s.barrier placed before the first pointer load.
; The expected IR is the same as without the barrier: both pointer loads carry
; !amdgpu.noclobber and both loaded pointers are cast to addrspace(1).
313 ; GCN-LABEL: ptr_nest_3_barrier:
314 ; GCN-COUNT-2: global_load_dwordx2
315 ; GCN: global_store_dword
316 define amdgpu_kernel void @ptr_nest_3_barrier(ptr addrspace(1) nocapture readonly %Arg) {
317 ; CHECK-LABEL: @ptr_nest_3_barrier(
319 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
320 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
321 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
322 ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
323 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
324 ; CHECK-NEXT: [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
325 ; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
326 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
327 ; CHECK-NEXT: ret void
330 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
331 %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
; Barrier sits between the address computation and the loads under test.
332 tail call void @llvm.amdgcn.s.barrier()
333 %p2 = load ptr, ptr addrspace(1) %p1, align 8
334 %p3 = load ptr, ptr %p2, align 8
335 store float 0.000000e+00, ptr %p3, align 4
; Test: flat pointer argument indexed by the kernel argument %i (not by the
; workitem id); one pointer is loaded and 0.0 is stored through it.
; The expected IR casts the argument to addrspace(1), tags the pointer load
; with !amdgpu.noclobber, and casts the loaded pointer to addrspace(1). The
; visible ISA checks expect an s_load_dwordx2 and a global_store_dword.
339 ; GCN-LABEL: flat_ptr_nest_2:
341 ; GCN: s_load_dwordx2
342 ; GCN: global_store_dword
343 define amdgpu_kernel void @flat_ptr_nest_2(ptr nocapture readonly %Arg, i32 %i) {
344 ; CHECK-LABEL: @flat_ptr_nest_2(
346 ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
347 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
348 ; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
349 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
350 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
351 ; CHECK-NEXT: ret void
354 %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
355 %p2 = load ptr, ptr %p1, align 8
356 store float 0.000000e+00, ptr %p2, align 4
; Test: the argument and the first loaded pointer are both addrspace(4); the
; second load yields a flat pointer through which 0.0 is stored.
; The expected IR tags both loads with !amdgpu.noclobber, leaves the
; addrspace(4) accesses as they are, and casts only the final flat pointer to
; addrspace(1) for the store.
360 ; GCN-LABEL: const_ptr_nest_3:
362 ; GCN: s_load_dwordx2
363 ; GCN: s_load_dwordx2
364 ; GCN: global_store_dword
365 define amdgpu_kernel void @const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
366 ; CHECK-LABEL: @const_ptr_nest_3(
368 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
369 ; CHECK-NEXT: [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber !0
370 ; CHECK-NEXT: [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber !0
371 ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
372 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[TMP0]], align 4
373 ; CHECK-NEXT: ret void
376 %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
377 %p2 = load ptr addrspace(4), ptr addrspace(4) %p1, align 8
378 %p3 = load ptr, ptr addrspace(4) %p2, align 8
379 store float 0.000000e+00, ptr %p3, align 4
; Test: like @const_ptr_nest_3, but each addrspace(4) pointer is explicitly
; addrspacecast to a flat pointer before being loaded from.
; The expected IR has no such casts left: both loads read directly from
; addrspace(4) and carry !amdgpu.noclobber, and the final flat pointer is cast
; to addrspace(1) for the store.
383 ; GCN-LABEL: cast_from_const_const_ptr_nest_3:
385 ; GCN: s_load_dwordx2
386 ; GCN: s_load_dwordx2
387 ; GCN: global_store_dword
388 define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
389 ; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
391 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
392 ; CHECK-NEXT: [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber !0
393 ; CHECK-NEXT: [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber !0
394 ; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
395 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
396 ; CHECK-NEXT: ret void
399 %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
; addrspace(4) -> flat cast ahead of the first load.
400 %a1 = addrspacecast ptr addrspace(4) %p1 to ptr
401 %p2 = load ptr addrspace(4), ptr %a1, align 8
; addrspace(4) -> flat cast ahead of the second load.
402 %a2 = addrspacecast ptr addrspace(4) %p2 to ptr
403 %p3 = load ptr, ptr %a2, align 8
404 store float 0.000000e+00, ptr %p3, align 4
; Test: the pointer is read from the flat argument with a volatile load.
; The expected IR keeps the volatile load on a flat pointer (the addrspace(1)
; GEP result is cast back to flat for it) and adds no !amdgpu.noclobber. The
; loaded pointer is still cast to addrspace(1) for the store. The expected ISA
; uses flat_load_dwordx2 for the load and global_store_dword for the store.
408 ; GCN-LABEL: flat_ptr_volatile_load:
410 ; GCN: flat_load_dwordx2
411 ; GCN: global_store_dword
412 define amdgpu_kernel void @flat_ptr_volatile_load(ptr nocapture readonly %Arg, i32 %i) {
413 ; CHECK-LABEL: @flat_ptr_volatile_load(
415 ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
416 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
417 ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
418 ; CHECK-NEXT: [[P2:%.*]] = load volatile ptr, ptr [[TMP0]], align 8
419 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
420 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
421 ; CHECK-NEXT: ret void
424 %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
425 %p2 = load volatile ptr, ptr %p1, align 8
426 store float 0.000000e+00, ptr %p2, align 4
; Test: the pointer is read from the flat argument with a monotonic atomic
; load.
; The expected IR performs the atomic load from addrspace(1) but adds no
; !amdgpu.noclobber to it; the loaded pointer is cast to addrspace(1) for the
; store. The expected ISA uses global_load_dwordx2 and global_store_dword.
430 ; GCN-LABEL: flat_ptr_atomic_load:
432 ; GCN: global_load_dwordx2
433 ; GCN: global_store_dword
434 define amdgpu_kernel void @flat_ptr_atomic_load(ptr nocapture readonly %Arg, i32 %i) {
435 ; CHECK-LABEL: @flat_ptr_atomic_load(
437 ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
438 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
439 ; CHECK-NEXT: [[P2:%.*]] = load atomic ptr, ptr addrspace(1) [[P1]] monotonic, align 8
440 ; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
441 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
442 ; CHECK-NEXT: ret void
445 %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
446 %p2 = load atomic ptr, ptr %p1 monotonic, align 8
447 store float 0.000000e+00, ptr %p2, align 4
; Test: addrspace(1) argument and addrspace(1) first-level pointer, each
; explicitly addrspacecast to flat before being loaded from; the second load
; yields a flat pointer through which 0.0 is stored.
; The expected IR has the casts to flat removed: both loads read from
; addrspace(1) with !amdgpu.noclobber, and the final flat pointer is cast to
; addrspace(1) for the store.
; NOTE(review): with opaque pointers no pointee type change is visible in this
; body; presumably the name is historical -- confirm against the file history.
451 ; GCN-LABEL: cast_changing_pointee_type:
453 ; GCN: s_load_dwordx2
454 ; GCN: s_load_dwordx2
455 ; GCN: global_store_dword
456 define amdgpu_kernel void @cast_changing_pointee_type(ptr addrspace(1) nocapture readonly %Arg, i32 %i) {
457 ; CHECK-LABEL: @cast_changing_pointee_type(
459 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]
460 ; CHECK-NEXT: [[P2:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
461 ; CHECK-NEXT: [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2]], align 8, !amdgpu.noclobber !0
462 ; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
463 ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
464 ; CHECK-NEXT: ret void
467 %p1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %Arg, i32 %i
468 %a1 = addrspacecast ptr addrspace(1) %p1 to ptr
469 %p2 = load ptr addrspace(1), ptr %a1, align 8
470 %a2 = addrspacecast ptr addrspace(1) %p2 to ptr
471 %p3 = load ptr, ptr %a2, align 8
472 store float 0.000000e+00, ptr %p3, align 4
; Declarations of the AMDGPU intrinsics called by the kernels in this file:
; the workitem-id query used to index per-lane, and the barrier used by
; @ptr_nest_3_barrier.
476 declare i32 @llvm.amdgcn.workitem.id.x()
477 declare void @llvm.amdgcn.s.barrier()