; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
; LDS scratch arrays used by the f32 and f64 write2 tests below.
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
; Two LDS stores of the same value 8 elements apart should merge into one
; ds_write2_b32 with offset1:8.
; GCN-LABEL: {{^}}simple_write2_one_val_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %val = load float, float addrspace(1)* %in.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; Two different loaded values stored 8 elements apart merge into one
; ds_write2_b32.
; GCN-LABEL: {{^}}simple_write2_two_val_f32:
; CI-DAG: s_mov_b32 m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; A volatile first store must not be merged into a ds_write2.
; GCN-LABEL: @simple_write2_two_val_f32_volatile_0
; CI-DAG: s_mov_b32 m0

; GCN-NOT: ds_write2_b32
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; A volatile second store must not be merged into a ds_write2.
; GCN-LABEL: @simple_write2_two_val_f32_volatile_1
; CI-DAG: s_mov_b32 m0

; GCN-NOT: ds_write2_b32
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; 2 data subregisters from different super registers.
; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
; CI-DAG: s_mov_b32 m0

; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]

; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
; early legalization of the constant bus constraint on the v_lshl_add_u32,
; and then SIFoldOperands folds in an unlucky order.
; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]

; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}

; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  %val0.0 = extractelement <2 x float> %val0, i32 0
  %val1.1 = extractelement <2 x float> %val1, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; Both stored values come from subregisters of the same <2 x float> load.
; GCN-LABEL: @simple_write2_two_val_subreg2_f32
; CI-DAG: s_mov_b32 m0

; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}

; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
  %val0 = extractelement <2 x float> %val, i32 0
  %val1 = extractelement <2 x float> %val, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; Stored values are the first and last elements of a <4 x float> load.
; GCN-LABEL: @simple_write2_two_val_subreg4_f32
; CI-DAG: s_mov_b32 m0

; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}

; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  %val0 = extractelement <4 x float> %val, i32 0
  %val1 = extractelement <4 x float> %val, i32 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; 255 is the maximum encodable offset1 for ds_write2_b32, so the pair still
; merges.
; GCN-LABEL: @simple_write2_two_val_max_offset_f32
; CI-DAG: s_mov_b32 m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; 257 elements exceeds the ds_write2 offset range, so two separate
; ds_write_b32 instructions remain.
; GCN-LABEL: @simple_write2_two_val_too_far_f32
; CI-DAG: s_mov_b32 m0

; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}
; Four stores pair up into two ds_write2_b32 instructions.
; GCN-LABEL: @simple_write2_two_val_f32_x2
; CI-DAG: s_mov_b32 m0

; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4
  ret void
}
; Same as x2 above, but the first store has a nonzero base offset (3).
; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
; CI-DAG: s_mov_b32 m0

; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4
  ret void
}
; Pointers extracted from a vector-of-pointers argument cannot be proven to
; share a base, so no ds_write2 forms.
; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32
; CI-DAG: s_mov_b32 m0

; GCN-NOT: ds_write2_b32
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
  store float %val0, float addrspace(3)* %gep.0, align 4

  %add.x = add nsw i32 %x.i, 8
  store float %val1, float addrspace(3)* %gep.1.offset, align 4
  ret void
}
; 64-bit variant: the pair of f64 stores merges into ds_write2_b64.
; GCN-LABEL: @simple_write2_one_val_f64
; CI-DAG: s_mov_b32 m0

; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 8
  ret void
}
; align-4 f64 stores split into pairs of 32-bit halves, each pair merging
; into a ds_write2_b32.
; GCN-LABEL: @misaligned_simple_write2_one_val_f64
; CI-DAG: s_mov_b32 m0

; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 7
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 4
  ret void
}
; Two distinct f64 values 8 elements apart merge into one ds_write2_b64.
; GCN-LABEL: @simple_write2_two_val_f64
; CI-DAG: s_mov_b32 m0

; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8

; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}
@foo = addrspace(3) global [4 x i32] undef, align 4

; Constant-offset stores to adjacent LDS slots merge with offset1:1.
; GCN-LABEL: @store_constant_adjacent_offsets
; CI-DAG: s_mov_b32 m0

; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define amdgpu_kernel void @store_constant_adjacent_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}
; Constant-offset stores two slots apart merge with offset1:2.
; GCN-LABEL: @store_constant_disjoint_offsets
; CI-DAG: s_mov_b32 m0

; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
define amdgpu_kernel void @store_constant_disjoint_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}
@bar = addrspace(3) global [4 x i64] undef, align 4

; align-4 i64 stores split into 32-bit halves; the four halves pair into two
; ds_write2_b32 instructions.
; GCN-LABEL: @store_misaligned64_constant_offsets
; CI-DAG: s_mov_b32 m0

; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}
@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; Offsets too large for the write2 encoding get materialized into separate
; base pointers; each i64 still splits into a ds_write2_b32 pair.
; GCN-LABEL: @store_misaligned64_constant_large_offsets
; CI-DAG: s_mov_b32 m0

; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; A longer SGEMM-like sequence of LDS stores exercising multiple write2
; merges across two LDS arrays.
define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  %val = load float, float addrspace(1)* %in
  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx44, align 4
  %add47 = add nsw i32 %x.i, 1
  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
  store float %val, float addrspace(3)* %arrayidx48, align 4
  %add51 = add nsw i32 %x.i, 16
  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
  store float %val, float addrspace(3)* %arrayidx52, align 4
  %add55 = add nsw i32 %x.i, 17
  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
  store float %val, float addrspace(3)* %arrayidx56, align 4
  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
  store float %val, float addrspace(3)* %arrayidx60, align 4
  %add63 = add nsw i32 %y.i, 1
  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
  store float %val, float addrspace(3)* %arrayidx64, align 4
  %add67 = add nsw i32 %y.i, 32
  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
  store float %val, float addrspace(3)* %arrayidx68, align 4
  %add71 = add nsw i32 %y.i, 33
  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
  store float %val, float addrspace(3)* %arrayidx72, align 4
  %add75 = add nsw i32 %y.i, 64
  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
  store float %val, float addrspace(3)* %arrayidx76, align 4
  %add79 = add nsw i32 %y.i, 65
  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
  store float %val, float addrspace(3)* %arrayidx80, align 4
  ret void
}
; An align-4 <4 x float> store splits into four dword stores which pair into
; two ds_write2_b32 instructions.
; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
  ret void
}
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }