; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Test that doing a shift of a pointer with a constant add will be
; folded into the constant offset addressing mode even if the add has
; multiple uses. This is relevant to accessing 2 separate, adjacent
; LDS globals with a constant offset from the same base pointer.

declare i32 @llvm.amdgcn.workitem.id.x() #1

@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [512 x float] undef, align 4


; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
; GCN-LABEL: {{^}}load_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
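; The DS offset field is an unsigned 16-bit byte offset, so the 8-byte
; displacement (2 extra elements * 4 bytes) is encoded directly in the
; ds_read_b32 above instead of being added to the pointer.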
define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  store float %val0, float addrspace(1)* %out
  ret void
}

; Make sure once the first use is folded into the addressing mode, the
; remaining add use goes through the normal shl + add constant fold.

; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]
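; The second use of %idx.0 follows the same algebra:
; (%tid.x + 2) << 2 == (%tid.x << 2) + 8, which is why the remaining use
; is checked above as an add of 8 to the shifted value rather than a
; separate shift of the sum.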
define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %shl_add_use = shl i32 %idx.0, 2
  store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
  store float %val0, float addrspace(1)* %out
  ret void
}

@maxlds = addrspace(3) global [65536 x i8] undef, align 4

; GCN-LABEL: {{^}}load_shl_base_lds_max_offset:
; GCN: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
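; 65535 is the largest value that fits in the 16-bit DS offset field, so
; the whole constant ends up as the immediate on the ds_read_u8.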
define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 65535
  %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
  %val0 = load i8, i8 addrspace(3)* %arrayidx0
  store i32 %idx.0, i32 addrspace(1)* %add_use
  store i8 %val0, i8 addrspace(1)* %out
  ret void
}

; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.

; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN: s_mov_b32 m0, -1

; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
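; The constant part of the index is 64 elements, i.e. 64 * 4 = 256 bytes,
; which shows up as the offset on both ds_read_b32 instructions above.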
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 64
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}store_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
  store float 1.0, float addrspace(3)* %arrayidx0, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; --------------------------------------------------------------------------------
; Atomics.
; --------------------------------------------------------------------------------

@lds2 = addrspace(3) global [512 x i32] undef, align 4

; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
; store i32 %val, i32 addrspace(1)* %out, align 4
; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
; ret void
; }

; GCN-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
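; The DS atomics below all use the same addressing mode, so the
; (%tid.x + 2) * 4 == %tid.x * 4 + 8 displacement should fold into the
; offset field of each atomic instruction just like the plain loads and
; stores above.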
define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
  %result = extractvalue { i32, i1 } %pair, 0
  store i32 %result, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_swap_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_add_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_sub_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_and_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_or_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_xor_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
; store i32 %val, i32 addrspace(1)* %out, align 4
; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
; ret void
; }

; GCN-LABEL: {{^}}atomic_min_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_max_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_umin_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}atomic_umax_shl_base_lds_0:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8

define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
  %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
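; With the add of 4 folded through the shifts, the byte displacements are
; 4 << 3 = 32 and 4 << 4 = 64, both of which fit in the DS offset field.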
define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 4
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_lds_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 0x1fff0, [[SCALE1]]
; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
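; 8191 << 3 = 65528 still fits in the 16-bit DS offset field, but
; 8191 << 4 = 131056 (0x1fff0) does not, so the second constant has to be
; materialized with a separate v_add.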
define void @shl_add_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 8191
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_lds_offset:
; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1000, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+$}}
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+$}}
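; 4096 << 4 = 65536 already exceeds the maximum DS offset of 65535 (and
; 4096 << 5 is larger still), so neither constant can be folded and the
; add is kept ahead of both shifts.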
define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
  %idx.add = add nuw i32 %idx, 4096
  %shl0 = shl i32 %idx.add, 4
  %shl1 = shl i32 %idx.add, 5
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32
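; For private (scratch) accesses the buffer instruction offset is a
; 12-bit unsigned immediate (maximum 4095); 4 << 2 = 16 and 4 << 3 = 32
; both fit, so they fold into the buffer_store_dword encodings.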
define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 4
  %shl0 = shl i32 %idx.add, 2
  %shl1 = shl i32 %idx.add, 3
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:4088
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]]
; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s33 offen{{$}}
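; 511 << 3 = 4088 fits in the 12-bit buffer offset, but 511 << 4 = 8176
; (0x1ff0) does not, so the second store needs the explicit v_add.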
define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 511
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_private_offset:
; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen{{$}}
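; 256 << 4 = 4096 is one past the 4095 maximum buffer offset (and
; 256 << 5 = 8192 is larger still), so the add stays ahead of both shifts
; and neither store gets an immediate offset.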
define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 {
  %idx = zext i16 %idx.arg to i32
  %idx.add = add nuw i32 %idx, 256
  %shl0 = shl i32 %idx.add, 4
  %shl1 = shl i32 %idx.add, 5
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(5)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(5)*
  store volatile i32 9, i32 addrspace(5)* %ptr0
  store volatile i32 10, i32 addrspace(5)* %ptr1
  ret void
}

; FIXME: This or should fold into an offset on the write
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}

; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
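; Here (%idx | 4) << 3 is emitted as a shift followed by a v_or of 32
; (see the FIXME above), while the << 4 form does get its 64-byte
; displacement folded into the ds_write offset.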
define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
  %idx.add = or i32 %idx, 4
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_max_lds_offset:
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
; GCN-DAG: v_or_b32_e32 [[ADD1:v[0-9]+]], 0x1fff0, [[SCALE1]]
; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
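; As in the add-based test above, the 65528-byte displacement fits the DS
; offset field, while 0x1fff0 is too large and remains a separate v_or.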
define void @shl_or_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
  %idx.add = or i32 %idx, 8191
  %shl0 = shl i32 %idx.add, 3
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
  store volatile i32 9, i32 addrspace(3)* %ptr0
  store volatile i32 10, i32 addrspace(3)* %ptr1
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }