1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
6 ; FIXME: We don't get cases where the address was an SGPR because we
7 ; get a copy to the address register for each one.
9 @lds = addrspace(3) global [512 x float] undef, align 4
10 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
; Baseline case: loads two floats from @lds at element indices id.x and
; id.x + 8, adds them and stores the sum to %out[id.x]. The checks for both
; targets expect the two LDS loads to be merged into one ds_read2_b32 with
; offset1:8.
12 define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
13 ; CI-LABEL: simple_read2_f32:
15 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16 ; CI-NEXT: s_mov_b32 m0, -1
17 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
18 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
19 ; CI-NEXT: s_mov_b32 s3, 0xf000
20 ; CI-NEXT: s_mov_b32 s2, 0
21 ; CI-NEXT: s_waitcnt lgkmcnt(0)
22 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
23 ; CI-NEXT: v_mov_b32_e32 v1, 0
24 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
27 ; GFX9-LABEL: simple_read2_f32:
29 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
30 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
31 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
34 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
36 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
37 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
38 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
39 %add.x = add nsw i32 %x.i, 8
40 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
41 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
42 %sum = fadd float %val0, %val1
43 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
44 store float %sum, ptr addrspace(1) %out.gep, align 4
; Same shape as the basic two-load case, but the second load is 255 elements
; past the first. The checks still expect one ds_read2_b32, now with
; offset1:255 (presumably the largest element offset the instruction can
; encode, going by the test name -- TODO confirm against the ISA docs).
48 define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 {
49 ; CI-LABEL: simple_read2_f32_max_offset:
51 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
52 ; CI-NEXT: s_mov_b32 m0, -1
53 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
54 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
55 ; CI-NEXT: s_mov_b32 s3, 0xf000
56 ; CI-NEXT: s_mov_b32 s2, 0
57 ; CI-NEXT: s_waitcnt lgkmcnt(0)
58 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
59 ; CI-NEXT: v_mov_b32_e32 v1, 0
60 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
63 ; GFX9-LABEL: simple_read2_f32_max_offset:
65 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
66 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
67 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
70 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
74 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
75 %add.x = add nsw i32 %x.i, 255
76 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
77 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
78 %sum = fadd float %val0, %val1
79 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
80 store float %sum, ptr addrspace(1) %out.gep, align 4
; Negative case: the second load is 257 elements (1028 bytes) past the first.
; The checks expect the loads to stay as two separate ds_read_b32
; instructions, the second one using a plain byte offset of 1028.
84 define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
85 ; CI-LABEL: simple_read2_f32_too_far:
87 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
88 ; CI-NEXT: s_mov_b32 m0, -1
89 ; CI-NEXT: ds_read_b32 v1, v0
90 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028
91 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
92 ; CI-NEXT: s_mov_b32 s3, 0xf000
93 ; CI-NEXT: s_mov_b32 s2, 0
94 ; CI-NEXT: s_waitcnt lgkmcnt(0)
95 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
96 ; CI-NEXT: v_mov_b32_e32 v1, 0
97 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
100 ; GFX9-LABEL: simple_read2_f32_too_far:
102 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
103 ; GFX9-NEXT: ds_read_b32 v1, v0
104 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
105 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
106 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
108 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
109 ; GFX9-NEXT: s_endpgm
110 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
111 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
112 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
113 %add.x = add nsw i32 %x.i, 257
114 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
115 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
116 %sum = fadd float %val0, %val1
117 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
118 store float %sum, ptr addrspace(1) %out.gep, align 4
; Four loads from @lds at element indices id.x + {0, 8, 11, 27}. The loads
; are summed pairwise, the two partial sums are added, and the result is
; stored to %out[id.x]. The checks expect two ds_read2_b32 instructions: one
; for offsets (0, 8) and one for offsets (11, 27).
122 define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
123 ; CI-LABEL: simple_read2_f32_x2:
125 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
126 ; CI-NEXT: s_mov_b32 m0, -1
127 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
128 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
129 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
130 ; CI-NEXT: s_mov_b32 s3, 0xf000
131 ; CI-NEXT: s_mov_b32 s2, 0
132 ; CI-NEXT: s_waitcnt lgkmcnt(0)
133 ; CI-NEXT: v_add_f32_e32 v1, v1, v2
134 ; CI-NEXT: v_add_f32_e32 v2, v3, v4
135 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
136 ; CI-NEXT: v_mov_b32_e32 v1, 0
137 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
140 ; GFX9-LABEL: simple_read2_f32_x2:
142 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
143 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
144 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
145 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
146 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
148 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
149 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
150 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
151 ; GFX9-NEXT: s_endpgm
152 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
153 %idx.0 = add nsw i32 %tid.x, 0
154 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
155 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
157 %idx.1 = add nsw i32 %tid.x, 8
158 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
159 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
160 %sum.0 = fadd float %val0, %val1
162 %idx.2 = add nsw i32 %tid.x, 11
163 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
164 %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
166 %idx.3 = add nsw i32 %tid.x, 27
167 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
168 %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
169 %sum.1 = fadd float %val2, %val3
171 %sum = fadd float %sum.0, %sum.1
172 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
173 store float %sum, ptr addrspace(1) %out.gep, align 4
177 ; Make sure there is an instruction between the two sets of reads.
; Four loads at element indices id.x + {0, 8, 11, 27}, with a call to
; @llvm.amdgcn.s.barrier between the first pair and the second pair. Each
; pair is still expected to become its own ds_read2_b32; the gfx900 checks
; show s_barrier sitting between the two merged reads.
178 define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 {
179 ; CI-LABEL: simple_read2_f32_x2_barrier:
181 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
182 ; CI-NEXT: s_mov_b32 m0, -1
183 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
184 ; CI-NEXT: s_waitcnt lgkmcnt(0)
186 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
187 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
188 ; CI-NEXT: s_mov_b32 s3, 0xf000
189 ; CI-NEXT: v_add_f32_e32 v1, v1, v2
190 ; CI-NEXT: s_mov_b32 s2, 0
191 ; CI-NEXT: s_waitcnt lgkmcnt(0)
192 ; CI-NEXT: v_add_f32_e32 v2, v3, v4
193 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
194 ; CI-NEXT: v_mov_b32_e32 v1, 0
195 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
198 ; GFX9-LABEL: simple_read2_f32_x2_barrier:
200 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
201 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
202 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX9-NEXT: s_barrier
204 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
205 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
206 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
209 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
210 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
211 ; GFX9-NEXT: s_endpgm
212 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
213 %idx.0 = add nsw i32 %tid.x, 0
214 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
215 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
217 %idx.1 = add nsw i32 %tid.x, 8
218 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
219 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
220 %sum.0 = fadd float %val0, %val1
222 call void @llvm.amdgcn.s.barrier() #2
224 %idx.2 = add nsw i32 %tid.x, 11
225 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
226 %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
228 %idx.3 = add nsw i32 %tid.x, 27
229 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
230 %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
231 %sum.1 = fadd float %val2, %val3
233 %sum = fadd float %sum.0, %sum.1
234 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
235 store float %sum, ptr addrspace(1) %out.gep, align 4
239 ; Adding something to the base address for the first element once resulted
240 ; in only the inner pair being folded; the checks below expect both pairs to merge.
; Four loads at element indices id.x + {2, 8, 11, 27}, i.e. the first load
; is not at offset 0. The result is stored to %out[id.x + 2], which shows up
; as offset:8 on the expected store. The checks expect two ds_read2_b32
; instructions, covering offsets (2, 8) and (11, 27).
241 define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 {
242 ; CI-LABEL: simple_read2_f32_x2_nonzero_base:
244 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
245 ; CI-NEXT: s_mov_b32 m0, -1
246 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
247 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
248 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
249 ; CI-NEXT: s_mov_b32 s3, 0xf000
250 ; CI-NEXT: s_mov_b32 s2, 0
251 ; CI-NEXT: s_waitcnt lgkmcnt(0)
252 ; CI-NEXT: v_add_f32_e32 v1, v1, v2
253 ; CI-NEXT: v_add_f32_e32 v2, v3, v4
254 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
255 ; CI-NEXT: v_mov_b32_e32 v1, 0
256 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
259 ; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
261 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
262 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
263 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
264 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
265 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
267 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
268 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
269 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1] offset:8
270 ; GFX9-NEXT: s_endpgm
271 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
272 %idx.0 = add nsw i32 %tid.x, 2
273 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
274 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
276 %idx.1 = add nsw i32 %tid.x, 8
277 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
278 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
279 %sum.0 = fadd float %val0, %val1
281 %idx.2 = add nsw i32 %tid.x, 11
282 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
283 %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
285 %idx.3 = add nsw i32 %tid.x, 27
286 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
287 %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
288 %sum.1 = fadd float %val2, %val3
290 %sum = fadd float %sum.0, %sum.1
291 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
292 store float %sum, ptr addrspace(1) %out.gep, align 4
296 ; Be careful of vectors of pointers. We don't know if the 2 pointers
297 ; in the vectors are really the same base, so this is not safe to merge.
299 ; Base pointers come from different subregister of same super
300 ; register. We can't safely merge this.
; The two load addresses are the two lanes of a <2 x ptr addrspace(3)> kernel
; argument after a vector GEP. The checks expect two separate ds_read_b32
; instructions (no ds_read2), one addressed from s2 with offset:32 and one
; addressed from s3 with no offset.
; NOTE(review): both insertelement instructions below write lane 0 of the
; index vector (first %x.i, then the constant 8), so lane 1 stays undef --
; confirm this is intended. %add.x is computed but has no visible use.
301 define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
302 ; CI-LABEL: read2_ptr_is_subreg_arg_f32:
304 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
305 ; CI-NEXT: s_mov_b32 m0, -1
306 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
307 ; CI-NEXT: s_waitcnt lgkmcnt(0)
308 ; CI-NEXT: v_mov_b32_e32 v1, s2
309 ; CI-NEXT: v_mov_b32_e32 v2, s3
310 ; CI-NEXT: ds_read_b32 v1, v1 offset:32
311 ; CI-NEXT: ds_read_b32 v2, v2
312 ; CI-NEXT: s_mov_b32 s3, 0xf000
313 ; CI-NEXT: s_mov_b32 s2, 0
314 ; CI-NEXT: s_waitcnt lgkmcnt(0)
315 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
316 ; CI-NEXT: v_mov_b32_e32 v1, 0
317 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
320 ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
322 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
323 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
324 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
325 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
326 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
327 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
328 ; GFX9-NEXT: ds_read_b32 v2, v2
329 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
331 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
332 ; GFX9-NEXT: s_endpgm
333 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
334 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
335 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
336 %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
337 %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
338 %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
339 %val0 = load float, ptr addrspace(3) %gep.0, align 4
340 %val1 = load float, ptr addrspace(3) %gep.1, align 4
341 %add.x = add nsw i32 %x.i, 8
342 %sum = fadd float %val0, %val1
343 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
344 store float %sum, ptr addrspace(1) %out.gep, align 4
348 ; Apply a constant scalar offset after the pointer vector extract. We
349 ; are rejecting merges that have the same, constant 0 offset, so make
350 ; sure we are really rejecting it because of the different subregisters.
; The two load addresses come from the two lanes of a <2 x ptr addrspace(3)>
; kernel argument, and the second extracted pointer gets an extra scalar GEP
; of 8 floats. Both expected ds_read_b32 instructions therefore carry
; offset:32, yet the checks still expect them to stay separate because their
; base registers differ (s2 vs s3).
; NOTE(review): both insertelement instructions below write lane 0 of the
; index vector, so lane 1 stays undef -- confirm this is intended. %add.x is
; computed but has no visible use.
352 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
353 ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
355 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
356 ; CI-NEXT: s_mov_b32 m0, -1
357 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
358 ; CI-NEXT: s_waitcnt lgkmcnt(0)
359 ; CI-NEXT: v_mov_b32_e32 v1, s2
360 ; CI-NEXT: v_mov_b32_e32 v2, s3
361 ; CI-NEXT: ds_read_b32 v1, v1 offset:32
362 ; CI-NEXT: ds_read_b32 v2, v2 offset:32
363 ; CI-NEXT: s_mov_b32 s3, 0xf000
364 ; CI-NEXT: s_mov_b32 s2, 0
365 ; CI-NEXT: s_waitcnt lgkmcnt(0)
366 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
367 ; CI-NEXT: v_mov_b32_e32 v1, 0
368 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
371 ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
373 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
374 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
375 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
376 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
377 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
378 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
379 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
380 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
382 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
383 ; GFX9-NEXT: s_endpgm
384 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
385 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
386 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
387 %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
388 %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
389 %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
391 ; Apply an additional offset after the vector that will be more obviously folded.
392 %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
394 %val0 = load float, ptr addrspace(3) %gep.0, align 4
395 %val1 = load float, ptr addrspace(3) %gep.1.offset, align 4
396 %add.x = add nsw i32 %x.i, 8
397 %sum = fadd float %val0, %val1
398 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
399 store float %sum, ptr addrspace(1) %out.gep, align 4
; Vector-of-pointers case where both lanes are built from the same global
; (@lds) and the index vector is <id.x, id.x> + <0, 8>. Since both lanes
; share a base, the checks expect the two loads to be merged into one
; ds_read2_b32 with offset1:8. %add.x is computed but has no visible use.
403 define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
404 ; CI-LABEL: read2_ptr_is_subreg_f32:
406 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
407 ; CI-NEXT: s_mov_b32 m0, -1
408 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
409 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
410 ; CI-NEXT: s_mov_b32 s3, 0xf000
411 ; CI-NEXT: s_mov_b32 s2, 0
412 ; CI-NEXT: s_waitcnt lgkmcnt(0)
413 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
414 ; CI-NEXT: v_mov_b32_e32 v1, 0
415 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
418 ; GFX9-LABEL: read2_ptr_is_subreg_f32:
420 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
421 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
422 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
423 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
425 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
426 ; GFX9-NEXT: s_endpgm
427 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
428 %ptr.0 = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds, i32 0
429 %ptr.1 = insertelement <2 x ptr addrspace(3)> %ptr.0, ptr addrspace(3) @lds, i32 1
430 %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
431 %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
432 %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
433 %gep = getelementptr inbounds [512 x float], <2 x ptr addrspace(3)> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
434 %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
435 %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
436 %val0 = load float, ptr addrspace(3) %gep.0, align 4
437 %val1 = load float, ptr addrspace(3) %gep.1, align 4
438 %add.x = add nsw i32 %x.i, 8
439 %sum = fadd float %val0, %val1
440 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
441 store float %sum, ptr addrspace(1) %out.gep, align 4
; Same addresses as the basic two-load case (id.x and id.x + 8), but the
; FIRST load is volatile. The checks expect the loads to remain two separate
; ds_read_b32 instructions (offsets 0 and 32) rather than a ds_read2_b32.
445 define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 {
446 ; CI-LABEL: simple_read2_f32_volatile_0:
448 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
449 ; CI-NEXT: s_mov_b32 m0, -1
450 ; CI-NEXT: ds_read_b32 v1, v0
451 ; CI-NEXT: ds_read_b32 v2, v0 offset:32
452 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
453 ; CI-NEXT: s_mov_b32 s3, 0xf000
454 ; CI-NEXT: s_mov_b32 s2, 0
455 ; CI-NEXT: s_waitcnt lgkmcnt(0)
456 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
457 ; CI-NEXT: v_mov_b32_e32 v1, 0
458 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
461 ; GFX9-LABEL: simple_read2_f32_volatile_0:
463 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
464 ; GFX9-NEXT: ds_read_b32 v1, v0
465 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
466 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
467 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
469 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
470 ; GFX9-NEXT: s_endpgm
471 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
472 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
473 %val0 = load volatile float, ptr addrspace(3) %arrayidx0, align 4
474 %add.x = add nsw i32 %x.i, 8
475 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
476 %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
477 %sum = fadd float %val0, %val1
478 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
479 store float %sum, ptr addrspace(1) %out.gep, align 4
; Same addresses as the basic two-load case (id.x and id.x + 8), but the
; SECOND load is volatile. The checks expect the loads to remain two
; separate ds_read_b32 instructions (offsets 0 and 32) rather than a
; ds_read2_b32.
483 define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 {
484 ; CI-LABEL: simple_read2_f32_volatile_1:
486 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
487 ; CI-NEXT: s_mov_b32 m0, -1
488 ; CI-NEXT: ds_read_b32 v1, v0
489 ; CI-NEXT: ds_read_b32 v2, v0 offset:32
490 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
491 ; CI-NEXT: s_mov_b32 s3, 0xf000
492 ; CI-NEXT: s_mov_b32 s2, 0
493 ; CI-NEXT: s_waitcnt lgkmcnt(0)
494 ; CI-NEXT: v_add_f32_e32 v2, v1, v2
495 ; CI-NEXT: v_mov_b32_e32 v1, 0
496 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
499 ; GFX9-LABEL: simple_read2_f32_volatile_1:
501 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
502 ; GFX9-NEXT: ds_read_b32 v1, v0
503 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
504 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
505 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
506 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
507 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
508 ; GFX9-NEXT: s_endpgm
509 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
510 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
511 %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
512 %add.x = add nsw i32 %x.i, 8
513 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
514 %val1 = load volatile float, ptr addrspace(3) %arrayidx1, align 4
515 %sum = fadd float %val0, %val1
516 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
517 store float %sum, ptr addrspace(1) %out.gep, align 4
521 ; Can't fold since not correctly aligned, unless unaligned access mode is enabled.
; Two float loads at %lds[id.x] and %lds[id.x + 8], both marked align 1.
; Expected output differs per run line:
;  - bonaire and gfx900 with -unaligned-access-mode: each float is assembled
;    from four ds_read_u8 loads combined with shifts and ORs;
;  - gfx900 with +unaligned-access-mode: the loads are merged into a single
;    ds_read2_b32 with offset1:8.
522 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
523 ; CI-LABEL: unaligned_read2_f32:
525 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
526 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
527 ; CI-NEXT: s_mov_b32 m0, -1
528 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
529 ; CI-NEXT: s_mov_b32 s3, 0xf000
530 ; CI-NEXT: s_waitcnt lgkmcnt(0)
531 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
532 ; CI-NEXT: ds_read_u8 v2, v1 offset:34
533 ; CI-NEXT: ds_read_u8 v3, v1 offset:32
534 ; CI-NEXT: ds_read_u8 v4, v1 offset:3
535 ; CI-NEXT: ds_read_u8 v5, v1 offset:2
536 ; CI-NEXT: ds_read_u8 v6, v1 offset:1
537 ; CI-NEXT: ds_read_u8 v7, v1
538 ; CI-NEXT: ds_read_u8 v8, v1 offset:33
539 ; CI-NEXT: ds_read_u8 v1, v1 offset:35
540 ; CI-NEXT: s_waitcnt lgkmcnt(5)
541 ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
542 ; CI-NEXT: s_waitcnt lgkmcnt(3)
543 ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
544 ; CI-NEXT: v_or_b32_e32 v4, v4, v5
545 ; CI-NEXT: s_waitcnt lgkmcnt(1)
546 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
547 ; CI-NEXT: s_waitcnt lgkmcnt(0)
548 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
549 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
550 ; CI-NEXT: v_or_b32_e32 v6, v6, v7
551 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
552 ; CI-NEXT: v_or_b32_e32 v3, v5, v3
553 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
554 ; CI-NEXT: v_or_b32_e32 v4, v4, v6
555 ; CI-NEXT: v_or_b32_e32 v1, v1, v3
556 ; CI-NEXT: v_add_f32_e32 v2, v4, v1
557 ; CI-NEXT: s_mov_b32 s2, 0
558 ; CI-NEXT: v_mov_b32_e32 v1, 0
559 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
562 ; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
563 ; GFX9-ALIGNED: ; %bb.0:
564 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8
565 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
566 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
567 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
569 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1
570 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
571 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
572 ; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:3
573 ; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:32
574 ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33
575 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34
576 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35
577 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
578 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
579 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
580 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
581 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
582 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
583 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
584 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
585 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
586 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
587 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
588 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
589 ; GFX9-ALIGNED-NEXT: s_endpgm
591 ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
592 ; GFX9-UNALIGNED: ; %bb.0:
593 ; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
594 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
595 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
596 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
597 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
598 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
599 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
600 ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
601 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
602 ; GFX9-UNALIGNED-NEXT: s_endpgm
603 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
604 %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
605 %val0 = load float, ptr addrspace(3) %arrayidx0, align 1
606 %add.x = add nsw i32 %x.i, 8
607 %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
608 %val1 = load float, ptr addrspace(3) %arrayidx1, align 1
609 %sum = fadd float %val0, %val1
610 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
611 store float %sum, ptr addrspace(1) %out.gep, align 4
; Two align-1 float loads at byte offsets 5 and 9 from %lds[id.x], i.e. two
; adjacent floats starting at an address that is not a multiple of 4 from
; the base. Expected output differs per run line:
;  - bonaire and gfx900 with -unaligned-access-mode: eight ds_read_u8 loads
;    (byte offsets 5 through 12) combined with shifts and ORs;
;  - gfx900 with +unaligned-access-mode: a single ds_read_b64 at offset:5.
615 define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
616 ; CI-LABEL: unaligned_offset_read2_f32:
618 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
619 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
620 ; CI-NEXT: s_mov_b32 m0, -1
621 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
622 ; CI-NEXT: s_mov_b32 s3, 0xf000
623 ; CI-NEXT: s_waitcnt lgkmcnt(0)
624 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
625 ; CI-NEXT: ds_read_u8 v2, v1 offset:11
626 ; CI-NEXT: ds_read_u8 v3, v1 offset:9
627 ; CI-NEXT: ds_read_u8 v4, v1 offset:8
628 ; CI-NEXT: ds_read_u8 v5, v1 offset:7
629 ; CI-NEXT: ds_read_u8 v6, v1 offset:6
630 ; CI-NEXT: ds_read_u8 v7, v1 offset:5
631 ; CI-NEXT: ds_read_u8 v8, v1 offset:10
632 ; CI-NEXT: ds_read_u8 v1, v1 offset:12
633 ; CI-NEXT: s_waitcnt lgkmcnt(5)
634 ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
635 ; CI-NEXT: s_waitcnt lgkmcnt(3)
636 ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
637 ; CI-NEXT: v_or_b32_e32 v4, v4, v5
638 ; CI-NEXT: s_waitcnt lgkmcnt(1)
639 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
640 ; CI-NEXT: s_waitcnt lgkmcnt(0)
641 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
642 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
643 ; CI-NEXT: v_or_b32_e32 v6, v6, v7
644 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
645 ; CI-NEXT: v_or_b32_e32 v3, v5, v3
646 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
647 ; CI-NEXT: v_or_b32_e32 v4, v4, v6
648 ; CI-NEXT: v_or_b32_e32 v1, v1, v3
649 ; CI-NEXT: v_add_f32_e32 v2, v4, v1
650 ; CI-NEXT: s_mov_b32 s2, 0
651 ; CI-NEXT: v_mov_b32_e32 v1, 0
652 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
655 ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
656 ; GFX9-ALIGNED: ; %bb.0:
657 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8
658 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
659 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
660 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
661 ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
662 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
663 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
664 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
665 ; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:8
666 ; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:9
667 ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10
668 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11
669 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12
670 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
671 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
672 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
673 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
674 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
675 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
676 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
677 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
678 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
679 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
680 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
681 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3]
682 ; GFX9-ALIGNED-NEXT: s_endpgm
684 ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
685 ; GFX9-UNALIGNED: ; %bb.0:
686 ; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
687 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
688 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
689 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
691 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5
692 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
693 ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
694 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
695 ; GFX9-UNALIGNED-NEXT: s_endpgm
696 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
697 %base = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
698 %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
699 %val0 = load float, ptr addrspace(3) %addr0.i8, align 1
700 %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
701 %val1 = load float, ptr addrspace(3) %addr1.i8, align 1
702 %sum = fadd float %val0, %val1
703 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
704 store float %sum, ptr addrspace(1) %out.gep, align 4
; Two float loads at %lds[id.x] and %lds[id.x + 8], both marked align 2.
; Expected output differs per run line:
;  - bonaire and gfx900 with -unaligned-access-mode: each float is assembled
;    from two ds_read_u16 loads;
;  - gfx900 with +unaligned-access-mode: the loads are merged into a single
;    ds_read2_b32 with offset1:8.
708 define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
709 ; CI-LABEL: misaligned_2_simple_read2_f32:
711 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
712 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
713 ; CI-NEXT: s_mov_b32 m0, -1
714 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
715 ; CI-NEXT: s_mov_b32 s3, 0xf000
716 ; CI-NEXT: s_waitcnt lgkmcnt(0)
717 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
718 ; CI-NEXT: ds_read_u16 v2, v1 offset:32
719 ; CI-NEXT: ds_read_u16 v3, v1 offset:2
720 ; CI-NEXT: ds_read_u16 v4, v1
721 ; CI-NEXT: ds_read_u16 v1, v1 offset:34
722 ; CI-NEXT: s_mov_b32 s2, 0
723 ; CI-NEXT: s_waitcnt lgkmcnt(2)
724 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
725 ; CI-NEXT: s_waitcnt lgkmcnt(1)
726 ; CI-NEXT: v_or_b32_e32 v3, v3, v4
727 ; CI-NEXT: s_waitcnt lgkmcnt(0)
728 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
729 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
730 ; CI-NEXT: v_add_f32_e32 v2, v3, v1
731 ; CI-NEXT: v_mov_b32_e32 v1, 0
732 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
735 ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
736 ; GFX9-ALIGNED: ; %bb.0:
737 ; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
738 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
739 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
740 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0
742 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1
743 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
744 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
745 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34
746 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
747 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
748 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
749 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4
750 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
751 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1]
752 ; GFX9-ALIGNED-NEXT: s_endpgm
754 ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
755 ; GFX9-UNALIGNED: ; %bb.0:
756 ; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8
757 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
758 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
759 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
760 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
761 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
762 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
764 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
765 ; GFX9-UNALIGNED-NEXT: s_endpgm
766 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
767 %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
768 %val0 = load float, ptr addrspace(3) %arrayidx0, align 2
769 %add.x = add nsw i32 %x.i, 8
770 %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
771 %val1 = load float, ptr addrspace(3) %arrayidx1, align 2
772 %sum = fadd float %val0, %val1
773 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
774 store float %sum, ptr addrspace(1) %out.gep, align 4
; Two align-8 double loads from @lds.f64 at indices x and x+8, summed and
; stored to %out. Both the CI and GFX9 expectations contain a single
; ds_read2_b64 with offset1:8.
778 define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
779 ; CI-LABEL: simple_read2_f64:
781 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
782 ; CI-NEXT: s_mov_b32 m0, -1
783 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
784 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
785 ; CI-NEXT: s_mov_b32 s3, 0xf000
786 ; CI-NEXT: s_mov_b32 s2, 0
787 ; CI-NEXT: v_mov_b32_e32 v5, 0
788 ; CI-NEXT: s_waitcnt lgkmcnt(0)
789 ; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
790 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
793 ; GFX9-LABEL: simple_read2_f64:
795 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
796 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
797 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
798 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
799 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
800 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
801 ; GFX9-NEXT: s_endpgm
802 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
803 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
804 %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
805 %add.x = add nsw i32 %x.i, 8
806 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
807 %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
808 %sum = fadd double %val0, %val1
809 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
810 store double %sum, ptr addrspace(1) %out.gep, align 8
; Same shape as @simple_read2_f64, but the second double is 255 elements
; after the first. The expectations for both targets still contain a single
; ds_read2_b64, now with offset1:255.
814 define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 {
815 ; CI-LABEL: simple_read2_f64_max_offset:
817 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
818 ; CI-NEXT: s_mov_b32 m0, -1
819 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
820 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
821 ; CI-NEXT: s_mov_b32 s3, 0xf000
822 ; CI-NEXT: s_mov_b32 s2, 0
823 ; CI-NEXT: v_mov_b32_e32 v5, 0
824 ; CI-NEXT: s_waitcnt lgkmcnt(0)
825 ; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
826 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
829 ; GFX9-LABEL: simple_read2_f64_max_offset:
831 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
832 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
833 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
834 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
835 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
836 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
837 ; GFX9-NEXT: s_endpgm
838 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
839 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
840 %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
841 %add.x = add nsw i32 %x.i, 255
842 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
843 %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
844 %sum = fadd double %val0, %val1
845 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
846 store double %sum, ptr addrspace(1) %out.gep, align 8
; Same shape as @simple_read2_f64, but the second double is 257 elements
; after the first. Here the expectations keep two separate ds_read_b64
; instructions, the second at byte offset 2056 (257 * 8), rather than a
; merged ds_read2_b64.
850 define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
851 ; CI-LABEL: simple_read2_f64_too_far:
853 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
854 ; CI-NEXT: s_mov_b32 m0, -1
855 ; CI-NEXT: ds_read_b64 v[1:2], v0
856 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056
857 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
858 ; CI-NEXT: s_mov_b32 s3, 0xf000
859 ; CI-NEXT: s_mov_b32 s2, 0
860 ; CI-NEXT: s_waitcnt lgkmcnt(0)
861 ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4]
862 ; CI-NEXT: v_mov_b32_e32 v1, 0
863 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
866 ; GFX9-LABEL: simple_read2_f64_too_far:
868 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
869 ; GFX9-NEXT: ds_read_b64 v[0:1], v4
870 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056
871 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
872 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
874 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
875 ; GFX9-NEXT: s_endpgm
876 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
877 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
878 %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
879 %add.x = add nsw i32 %x.i, 257
880 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
881 %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
882 %sum = fadd double %val0, %val1
883 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
884 store double %sum, ptr addrspace(1) %out.gep, align 8
; Two double loads from the %lds argument, 7 elements apart, both marked
; align 4. The expectations read each double as a pair of dwords with
; ds_read2_b32: offset1:1 for the first value and offset0:14 offset1:15
; (7 * 8 bytes = 14 dwords) for the second.
889 define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
890 ; CI-LABEL: misaligned_read2_f64:
892 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
893 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
894 ; CI-NEXT: s_mov_b32 m0, -1
895 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
896 ; CI-NEXT: s_mov_b32 s3, 0xf000
897 ; CI-NEXT: s_waitcnt lgkmcnt(0)
898 ; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0
899 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1
900 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15
901 ; CI-NEXT: s_mov_b32 s2, 0
902 ; CI-NEXT: s_waitcnt lgkmcnt(0)
903 ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4]
904 ; CI-NEXT: v_mov_b32_e32 v1, 0
905 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
908 ; GFX9-LABEL: misaligned_read2_f64:
910 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8
911 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
912 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
913 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
914 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4
915 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
916 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15
917 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
918 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
919 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
920 ; GFX9-NEXT: s_endpgm
921 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
922 %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
923 %val0 = load double, ptr addrspace(3) %arrayidx0, align 4
924 %add.x = add nsw i32 %x.i, 7
925 %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
926 %val1 = load double, ptr addrspace(3) %arrayidx1, align 4
927 %sum = fadd double %val0, %val1
928 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
929 store double %sum, ptr addrspace(1) %out.gep, align 4
; Four-element i32 array in addrspace(3), read at constant indices by
; @load_constant_adjacent_offsets and @load_constant_disjoint_offsets.
933 @foo = addrspace(3) global [4 x i32] undef, align 4
; Loads elements 0 and 1 of @foo (constant addresses, align 4) and stores
; their sum. Both targets are expected to fetch the adjacent pair with a
; single ds_read_b64.
935 define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) {
936 ; CI-LABEL: load_constant_adjacent_offsets:
938 ; CI-NEXT: v_mov_b32_e32 v0, 0
939 ; CI-NEXT: s_mov_b32 m0, -1
940 ; CI-NEXT: ds_read_b64 v[0:1], v0
941 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
942 ; CI-NEXT: s_mov_b32 s3, 0xf000
943 ; CI-NEXT: s_mov_b32 s2, -1
944 ; CI-NEXT: s_waitcnt lgkmcnt(0)
945 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
946 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
949 ; GFX9-LABEL: load_constant_adjacent_offsets:
951 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
952 ; GFX9-NEXT: ds_read_b64 v[0:1], v2
953 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
954 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
955 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
956 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
957 ; GFX9-NEXT: s_endpgm
958 %val0 = load i32, ptr addrspace(3) @foo, align 4
959 %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
960 %sum = add i32 %val0, %val1
961 store i32 %sum, ptr addrspace(1) %out, align 4
; Loads elements 0 and 2 of @foo (constant addresses, align 4) and stores
; their sum. Both targets are expected to use one ds_read2_b32 with
; offset1:2 for the non-adjacent pair.
965 define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) {
966 ; CI-LABEL: load_constant_disjoint_offsets:
968 ; CI-NEXT: v_mov_b32_e32 v0, 0
969 ; CI-NEXT: s_mov_b32 m0, -1
970 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2
971 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
972 ; CI-NEXT: s_mov_b32 s3, 0xf000
973 ; CI-NEXT: s_mov_b32 s2, -1
974 ; CI-NEXT: s_waitcnt lgkmcnt(0)
975 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
976 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
979 ; GFX9-LABEL: load_constant_disjoint_offsets:
981 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
982 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2
983 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
984 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
986 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
987 ; GFX9-NEXT: s_endpgm
988 %val0 = load i32, ptr addrspace(3) @foo, align 4
989 %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
990 %sum = add i32 %val0, %val1
991 store i32 %sum, ptr addrspace(1) %out, align 4
; Four-element i64 array in addrspace(3) that is only 4-byte aligned; read
; by @load_misaligned64_constant_offsets.
995 @bar = addrspace(3) global [4 x i64] undef, align 4
; Loads elements 0 and 1 of @bar as i64 with align 4 and stores their sum.
; Both targets are expected to fetch the 16 contiguous bytes with a single
; ds_read_b128 and then perform a 64-bit add with carry.
997 define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) {
998 ; CI-LABEL: load_misaligned64_constant_offsets:
1000 ; CI-NEXT: v_mov_b32_e32 v0, 0
1001 ; CI-NEXT: s_mov_b32 m0, -1
1002 ; CI-NEXT: ds_read_b128 v[0:3], v0
1003 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1004 ; CI-NEXT: s_mov_b32 s3, 0xf000
1005 ; CI-NEXT: s_mov_b32 s2, -1
1006 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1007 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1008 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
1009 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1012 ; GFX9-LABEL: load_misaligned64_constant_offsets:
1014 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1015 ; GFX9-NEXT: ds_read_b128 v[0:3], v4
1016 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1017 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
1019 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1020 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1021 ; GFX9-NEXT: s_endpgm
1022 %val0 = load i64, ptr addrspace(3) @bar, align 4
1023 %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
1024 %sum = add i64 %val0, %val1
1025 store i64 %sum, ptr addrspace(1) %out, align 8
; 4096-element i64 array in addrspace(3), 4-byte aligned; read at large
; constant indices by @load_misaligned64_constant_large_offsets.
1029 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
; Loads elements 2048 and 4095 of @bar.large as i64 with align 4 and stores
; their sum. The expectations keep two separate ds_read_b64 instructions at
; byte offsets 16384 (2048 * 8) and 32760 (4095 * 8).
1031 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) {
1032 ; CI-LABEL: load_misaligned64_constant_large_offsets:
1034 ; CI-NEXT: v_mov_b32_e32 v2, 0
1035 ; CI-NEXT: s_mov_b32 m0, -1
1036 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384
1037 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760
1038 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1039 ; CI-NEXT: s_mov_b32 s3, 0xf000
1040 ; CI-NEXT: s_mov_b32 s2, -1
1041 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1042 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1043 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
1044 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1047 ; GFX9-LABEL: load_misaligned64_constant_large_offsets:
1049 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1050 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384
1051 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760
1052 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1053 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1054 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
1055 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1056 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1057 ; GFX9-NEXT: s_endpgm
1058 %val0 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
1059 %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
1060 %sum = add i64 %val0, %val1
1061 store i64 %sum, ptr addrspace(1) %out, align 8
; Internal float arrays in addrspace(3) used by
; @sgemm_inner_loop_read2_sequence: @sgemm.lA is indexed from the workgroup
; id and @sgemm.lB from the workitem y id.
1065 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
1066 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
; Ten align-4 float loads accumulated with a chain of fadds and stored to %C:
;   - @sgemm.lA at workgroup.id.x + {0, 1, 16, 17}
;   - @sgemm.lB at workitem.id.y  + {0, 1, 32, 33, 64, 65}
; Both targets are expected to cover them with five ds_read2_b32
; instructions, each fetching one adjacent pair of floats.
1068 define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 {
1069 ; CI-LABEL: sgemm_inner_loop_read2_sequence:
1071 ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
1072 ; CI-NEXT: s_lshl_b32 s0, s2, 2
1073 ; CI-NEXT: s_add_i32 s1, s0, 0xc20
1074 ; CI-NEXT: s_addk_i32 s0, 0xc60
1075 ; CI-NEXT: v_mov_b32_e32 v0, s1
1076 ; CI-NEXT: v_mov_b32_e32 v2, s0
1077 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1
1078 ; CI-NEXT: s_mov_b32 m0, -1
1079 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1080 ; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
1081 ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1
1082 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1083 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1084 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1085 ; CI-NEXT: v_add_f32_e32 v0, v0, v1
1086 ; CI-NEXT: v_add_f32_e32 v0, v0, v2
1087 ; CI-NEXT: v_add_f32_e32 v0, v0, v3
1088 ; CI-NEXT: v_add_f32_e32 v0, v0, v4
1089 ; CI-NEXT: v_add_f32_e32 v0, v0, v5
1090 ; CI-NEXT: v_add_f32_e32 v0, v0, v6
1091 ; CI-NEXT: v_add_f32_e32 v0, v0, v7
1092 ; CI-NEXT: v_add_f32_e32 v0, v0, v8
1093 ; CI-NEXT: s_mov_b32 s7, 0xf000
1094 ; CI-NEXT: s_mov_b32 s6, -1
1095 ; CI-NEXT: v_add_f32_e32 v0, v0, v9
1096 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1099 ; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
1101 ; GFX9-NEXT: s_lshl_b32 s2, s2, 2
1102 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
1103 ; GFX9-NEXT: s_addk_i32 s2, 0xc60
1104 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1105 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1106 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1
1107 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1108 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
1109 ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1
1110 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1111 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1112 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
1113 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
1114 ; GFX9-NEXT: s_waitcnt lgkmcnt(3)
1115 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
1116 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
1117 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
1118 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
1119 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1120 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5
1121 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1122 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
1123 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
1124 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
1125 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1126 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v9
1127 ; GFX9-NEXT: global_store_dword v10, v0, s[0:1]
1128 ; GFX9-NEXT: s_endpgm
1129 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
1130 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
1131 %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
1132 %tmp16 = load float, ptr addrspace(3) %arrayidx44, align 4
1133 %add47 = add nsw i32 %x.i, 1
1134 %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
1135 %tmp17 = load float, ptr addrspace(3) %arrayidx48, align 4
1136 %add51 = add nsw i32 %x.i, 16
1137 %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
1138 %tmp18 = load float, ptr addrspace(3) %arrayidx52, align 4
1139 %add55 = add nsw i32 %x.i, 17
1140 %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
1141 %tmp19 = load float, ptr addrspace(3) %arrayidx56, align 4
1142 %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
1143 %tmp20 = load float, ptr addrspace(3) %arrayidx60, align 4
1144 %add63 = add nsw i32 %y.i, 1
1145 %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
1146 %tmp21 = load float, ptr addrspace(3) %arrayidx64, align 4
1147 %add67 = add nsw i32 %y.i, 32
1148 %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
1149 %tmp22 = load float, ptr addrspace(3) %arrayidx68, align 4
1150 %add71 = add nsw i32 %y.i, 33
1151 %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
1152 %tmp23 = load float, ptr addrspace(3) %arrayidx72, align 4
1153 %add75 = add nsw i32 %y.i, 64
1154 %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
1155 %tmp24 = load float, ptr addrspace(3) %arrayidx76, align 4
1156 %add79 = add nsw i32 %y.i, 65
1157 %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
1158 %tmp25 = load float, ptr addrspace(3) %arrayidx80, align 4
1159 %sum.0 = fadd float %tmp16, %tmp17
1160 %sum.1 = fadd float %sum.0, %tmp18
1161 %sum.2 = fadd float %sum.1, %tmp19
1162 %sum.3 = fadd float %sum.2, %tmp20
1163 %sum.4 = fadd float %sum.3, %tmp21
1164 %sum.5 = fadd float %sum.4, %tmp22
1165 %sum.6 = fadd float %sum.5, %tmp23
1166 %sum.7 = fadd float %sum.6, %tmp24
1167 %sum.8 = fadd float %sum.7, %tmp25
1168 store float %sum.8, ptr addrspace(1) %C, align 4
; A single <2 x i32> load from the %in argument with only align 4, copied
; to %out. Both targets are expected to read it as two dwords with one
; ds_read2_b32 offset1:1.
1172 define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
1173 ; CI-LABEL: misaligned_read2_v2i32:
1175 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
1176 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1177 ; CI-NEXT: s_mov_b32 m0, -1
1178 ; CI-NEXT: s_mov_b32 s3, 0xf000
1179 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1180 ; CI-NEXT: v_mov_b32_e32 v0, s2
1181 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1182 ; CI-NEXT: s_mov_b32 s2, -1
1183 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1184 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1187 ; GFX9-LABEL: misaligned_read2_v2i32:
1189 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8
1190 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1191 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1192 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1194 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1195 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1196 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1197 ; GFX9-NEXT: s_endpgm
1198 %load = load <2 x i32>, ptr addrspace(3) %in, align 4
1199 store <2 x i32> %load, ptr addrspace(1) %out, align 8
; A single i64 load from the %in argument with only align 4, copied to
; %out. The expectations match @misaligned_read2_v2i32: one ds_read2_b32
; offset1:1 on both targets.
1203 define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
1204 ; CI-LABEL: misaligned_read2_i64:
1206 ; CI-NEXT: s_load_dword s2, s[0:1], 0x2
1207 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1208 ; CI-NEXT: s_mov_b32 m0, -1
1209 ; CI-NEXT: s_mov_b32 s3, 0xf000
1210 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1211 ; CI-NEXT: v_mov_b32_e32 v0, s2
1212 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1213 ; CI-NEXT: s_mov_b32 s2, -1
1214 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1215 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1218 ; GFX9-LABEL: misaligned_read2_i64:
1220 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8
1221 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1222 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1223 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1224 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1225 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
1226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1227 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1228 ; GFX9-NEXT: s_endpgm
1229 %load = load i64, ptr addrspace(3) %in, align 4
1230 store i64 %load, ptr addrspace(1) %out, align 8
; Eight float loads from four different addrspace(3) base pointers
; (%arg1..%arg4), issued in an order that alternates between the bases and is
; interleaved with the fmul/fadd/fsub chain. Both targets are expected to
; pair the two loads of each base into one ds_read2_b32: offset1:1 for
; %arg1/%arg3 (row selected by workitem.id.y, columns 0 and 1) and offset1:4
; for %arg2/%arg4 (rows 0 and 1, column selected by workitem.id.x).
1234 define amdgpu_kernel void @ds_read_diff_base_interleaving(
1235 ; CI-LABEL: ds_read_diff_base_interleaving:
1236 ; CI: ; %bb.0: ; %bb
1237 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2
1238 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1239 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1240 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1241 ; CI-NEXT: s_mov_b32 m0, -1
1242 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1243 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1
1244 ; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v0
1245 ; CI-NEXT: v_add_i32_e32 v4, vcc, s6, v1
1246 ; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v0
1247 ; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
1248 ; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
1249 ; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
1250 ; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4
1251 ; CI-NEXT: s_mov_b32 s3, 0xf000
1252 ; CI-NEXT: s_mov_b32 s2, -1
1253 ; CI-NEXT: s_waitcnt lgkmcnt(2)
1254 ; CI-NEXT: v_mul_f32_e32 v0, v0, v2
1255 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0
1256 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1257 ; CI-NEXT: v_mul_f32_e32 v2, v4, v6
1258 ; CI-NEXT: v_sub_f32_e32 v0, v0, v2
1259 ; CI-NEXT: v_mul_f32_e32 v1, v1, v3
1260 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1
1261 ; CI-NEXT: v_mul_f32_e32 v1, v5, v7
1262 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1
1263 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
1266 ; GFX9-LABEL: ds_read_diff_base_interleaving:
1267 ; GFX9: ; %bb.0: ; %bb
1268 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
1269 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1270 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1271 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1272 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
1273 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1274 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1
1275 ; GFX9-NEXT: v_add_u32_e32 v3, s5, v0
1276 ; GFX9-NEXT: v_add_u32_e32 v4, s6, v1
1277 ; GFX9-NEXT: v_add_u32_e32 v6, s7, v0
1278 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
1279 ; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
1280 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
1281 ; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4
1282 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
1283 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
1284 ; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0
1285 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1286 ; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6
1287 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
1288 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
1289 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
1290 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
1291 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
1292 ; GFX9-NEXT: global_store_dword v8, v0, s[0:1] offset:40
1293 ; GFX9-NEXT: s_endpgm
1294 ptr addrspace(1) nocapture %arg,
1295 ptr addrspace(3) %arg1,
1296 ptr addrspace(3) %arg2,
1297 ptr addrspace(3) %arg3,
1298 ptr addrspace(3) %arg4) #1 {
1300 %tmp = getelementptr float, ptr addrspace(1) %arg, i64 10
1301 %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
1302 %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
1303 %tmp7 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 0
1304 %tmp8 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 0, i32 %tmp5
1305 %tmp9 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 0
1306 %tmp10 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 0, i32 %tmp5
1307 %tmp11 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 1
1308 %tmp12 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 1, i32 %tmp5
1309 %tmp13 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 1
1310 %tmp14 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 1, i32 %tmp5
1311 %tmp15 = load float, ptr addrspace(3) %tmp7
1312 %tmp16 = load float, ptr addrspace(3) %tmp8
1313 %tmp17 = fmul float %tmp15, %tmp16
1314 %tmp18 = fadd float 2.000000e+00, %tmp17
1315 %tmp19 = load float, ptr addrspace(3) %tmp9
1316 %tmp20 = load float, ptr addrspace(3) %tmp10
1317 %tmp21 = fmul float %tmp19, %tmp20
1318 %tmp22 = fsub float %tmp18, %tmp21
1319 %tmp23 = load float, ptr addrspace(3) %tmp11
1320 %tmp24 = load float, ptr addrspace(3) %tmp12
1321 %tmp25 = fmul float %tmp23, %tmp24
1322 %tmp26 = fsub float %tmp22, %tmp25
1323 %tmp27 = load float, ptr addrspace(3) %tmp13
1324 %tmp28 = load float, ptr addrspace(3) %tmp14
1325 %tmp29 = fmul float %tmp27, %tmp28
1326 %tmp30 = fsub float %tmp26, %tmp29
1327 store float %tmp30, ptr addrspace(1) %tmp
; Loads %arg[x], calls @void_func_void, then loads %arg[x + 1] and stores
; the sum. The two loads sit on opposite sides of the call; the expectations
; keep them as separate ds_read_b32 instructions, one before s_swappc_b64
; and one after it (offset:4), rather than combining them into a read2.
1331 define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) {
1332 ; CI-LABEL: ds_read_call_read:
1334 ; CI-NEXT: s_getpc_b64 s[40:41]
1335 ; CI-NEXT: s_mov_b32 s40, s0
1336 ; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
1337 ; CI-NEXT: s_mov_b32 s14, s10
1338 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
1339 ; CI-NEXT: s_mov_b32 m0, -1
1340 ; CI-NEXT: s_mov_b32 s12, s8
1341 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1342 ; CI-NEXT: s_add_u32 s40, s40, s11
1343 ; CI-NEXT: s_mov_b64 s[10:11], s[6:7]
1344 ; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
1345 ; CI-NEXT: s_load_dword s6, s[4:5], 0x2
1346 ; CI-NEXT: s_addc_u32 s41, s41, 0
1347 ; CI-NEXT: s_add_u32 s8, s4, 12
1348 ; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1349 ; CI-NEXT: s_mov_b32 s13, s9
1350 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1351 ; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3
1352 ; CI-NEXT: ds_read_b32 v41, v40
1353 ; CI-NEXT: s_addc_u32 s9, s5, 0
1354 ; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1355 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
1356 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
1357 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
1358 ; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
1359 ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
1360 ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
1361 ; CI-NEXT: v_or_b32_e32 v31, v0, v2
1362 ; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
1363 ; CI-NEXT: s_mov_b32 s32, 0
1364 ; CI-NEXT: s_mov_b32 s39, 0xf000
1365 ; CI-NEXT: s_mov_b32 s38, -1
1366 ; CI-NEXT: s_swappc_b64 s[30:31], s[16:17]
1367 ; CI-NEXT: ds_read_b32 v0, v40 offset:4
1368 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1369 ; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0
1370 ; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0
1373 ; GFX9-LABEL: ds_read_call_read:
1375 ; GFX9-NEXT: s_getpc_b64 s[36:37]
1376 ; GFX9-NEXT: s_mov_b32 s36, s0
1377 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
1378 ; GFX9-NEXT: s_mov_b32 s14, s10
1379 ; GFX9-NEXT: s_mov_b32 s12, s8
1380 ; GFX9-NEXT: s_mov_b32 s13, s9
1381 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1382 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1383 ; GFX9-NEXT: s_add_u32 s36, s36, s11
1384 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
1385 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8
1386 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
1387 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
1388 ; GFX9-NEXT: s_add_u32 s8, s4, 12
1389 ; GFX9-NEXT: s_addc_u32 s9, s5, 0
1390 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1391 ; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6
1392 ; GFX9-NEXT: ds_read_b32 v42, v41
1393 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1394 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
1395 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
1396 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
1397 ; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
1398 ; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
1399 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
1400 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
1401 ; GFX9-NEXT: s_mov_b32 s32, 0
1402 ; GFX9-NEXT: v_mov_b32_e32 v40, 0
1403 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
1404 ; GFX9-NEXT: ds_read_b32 v0, v41 offset:4
1405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1406 ; GFX9-NEXT: v_add_u32_e32 v0, v42, v0
1407 ; GFX9-NEXT: global_store_dword v40, v0, s[34:35]
1408 ; GFX9-NEXT: s_endpgm
1409 %x = call i32 @llvm.amdgcn.workitem.id.x()
1410 %arrayidx0 = getelementptr i32, ptr addrspace(3) %arg, i32 %x
1411 %arrayidx1 = getelementptr i32, ptr addrspace(3) %arrayidx0, i32 1
1412 %v0 = load i32, ptr addrspace(3) %arrayidx0, align 4
1413 call void @void_func_void()
1414 %v1 = load i32, ptr addrspace(3) %arrayidx1, align 4
1415 %r = add i32 %v0, %v1
1416 store i32 %r, ptr addrspace(1) %out, align 4
; Pixel-shader entry that loads %inptr[0], calls llvm.amdgcn.interp.mov with
; %prims, then loads %inptr[4]. In the CI expectations m0 is set to -1 before
; each ds_read_b32 and to s0 before v_interp_mov_f32, so the two reads stay
; separate. The GFX9 expectations contain no m0 = -1 setup and use a single
; ds_read2_b32 offset1:4 ahead of the interp instruction.
1420 define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspace(3) %inptr) {
1421 ; CI-LABEL: ds_read_interp_read:
1423 ; CI-NEXT: s_mov_b32 m0, -1
1424 ; CI-NEXT: ds_read_b32 v2, v0
1425 ; CI-NEXT: s_mov_b32 m0, s0
1426 ; CI-NEXT: v_interp_mov_f32 v1, p10, attr0.x
1427 ; CI-NEXT: s_mov_b32 m0, -1
1428 ; CI-NEXT: ds_read_b32 v0, v0 offset:16
1429 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1430 ; CI-NEXT: v_add_f32_e32 v1, v0, v1
1431 ; CI-NEXT: v_mov_b32_e32 v0, v2
1432 ; CI-NEXT: ; return to shader part epilog
1434 ; GFX9-LABEL: ds_read_interp_read:
1436 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:4
1437 ; GFX9-NEXT: s_mov_b32 m0, s0
1438 ; GFX9-NEXT: s_nop 0
1439 ; GFX9-NEXT: v_interp_mov_f32_e32 v2, p10, attr0.x
1440 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
1442 ; GFX9-NEXT: ; return to shader part epilog
1443 %v0 = load float, ptr addrspace(3) %inptr, align 4
1444 %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
1445 %ptr1 = getelementptr float, ptr addrspace(3) %inptr, i32 4
1446 %v1 = load float, ptr addrspace(3) %ptr1, align 4
1447 %v1b = fadd float %v1, %intrp
1448 %r0 = insertelement <2 x float> undef, float %v0, i32 0
1449 %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
; Byte-aligned (align 1) array of <2 x i32> in addrspace(3); read at an odd
; byte offset by @read2_v2i32_align1_odd_offset.
1453 @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
; Loads a <2 x i32> with align 1 from byte offset 65 of @v2i32_align1 and
; stores it to %out. The CI and GFX9-ALIGNED expectations assemble the value
; from eight ds_read_u8 reads at offsets 65..72; the GFX9-UNALIGNED
; expectations (run with +unaligned-access-mode) use one ds_read_b64 at
; offset:65.
1455 define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
1456 ; CI-LABEL: read2_v2i32_align1_odd_offset:
1457 ; CI: ; %bb.0: ; %entry
1458 ; CI-NEXT: v_mov_b32_e32 v0, 0
1459 ; CI-NEXT: s_mov_b32 m0, -1
1460 ; CI-NEXT: ds_read_u8 v1, v0 offset:70
1461 ; CI-NEXT: ds_read_u8 v2, v0 offset:72
1462 ; CI-NEXT: ds_read_u8 v3, v0 offset:71
1463 ; CI-NEXT: ds_read_u8 v4, v0 offset:69
1464 ; CI-NEXT: ds_read_u8 v5, v0 offset:68
1465 ; CI-NEXT: s_waitcnt lgkmcnt(4)
1466 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1467 ; CI-NEXT: s_waitcnt lgkmcnt(3)
1468 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1469 ; CI-NEXT: s_waitcnt lgkmcnt(2)
1470 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
1471 ; CI-NEXT: s_waitcnt lgkmcnt(1)
1472 ; CI-NEXT: v_or_b32_e32 v1, v1, v4
1473 ; CI-NEXT: ds_read_u8 v4, v0 offset:67
1474 ; CI-NEXT: ds_read_u8 v6, v0 offset:66
1475 ; CI-NEXT: ds_read_u8 v0, v0 offset:65
1476 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1477 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1478 ; CI-NEXT: v_or_b32_e32 v1, v2, v1
1479 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1480 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
1481 ; CI-NEXT: v_or_b32_e32 v0, v2, v0
1482 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
1483 ; CI-NEXT: v_or_b32_e32 v2, v2, v4
1484 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1485 ; CI-NEXT: s_mov_b32 s3, 0xf000
1486 ; CI-NEXT: s_mov_b32 s2, -1
1487 ; CI-NEXT: v_or_b32_e32 v0, v2, v0
1488 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1491 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1492 ; GFX9-ALIGNED: ; %bb.0: ; %entry
1493 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
1494 ; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:70
1495 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:65
1496 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:66
1497 ; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:67
1498 ; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:68
1499 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69
1500 ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72
1501 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71
1502 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7)
1503 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1504 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1505 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
1506 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
1507 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7
1508 ; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1509 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v0
1510 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v4
1511 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
1512 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v6
1513 ; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1514 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
1515 ; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1516 ; GFX9-ALIGNED-NEXT: s_endpgm
1518 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1519 ; GFX9-UNALIGNED: ; %bb.0: ; %entry
1520 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
1521 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1522 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65
1523 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
1524 ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1525 ; GFX9-UNALIGNED-NEXT: s_endpgm
1527 %load = load <2 x i32>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
1528 store <2 x i32> %load, ptr addrspace(1) %out
; External callee used by @ds_read_call_read; only declared here, so its
; body is not visible to the test.
1532 declare void @void_func_void() #3
; Work-group / work-item id intrinsics used to build the per-lane indices.
1534 declare i32 @llvm.amdgcn.workgroup.id.x() #1
1535 declare i32 @llvm.amdgcn.workgroup.id.y() #1
1536 declare i32 @llvm.amdgcn.workitem.id.x() #1
1537 declare i32 @llvm.amdgcn.workitem.id.y() #1
; Interpolation intrinsic called by @ds_read_interp_read.
1539 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
1541 declare void @llvm.amdgcn.s.barrier() #2
; Attribute groups referenced by the definitions and declarations in this file.
1543 attributes #0 = { nounwind }
1544 attributes #1 = { nounwind readnone speculatable }
1545 attributes #2 = { convergent nounwind }
1546 attributes #3 = { nounwind noinline }