1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
6 @lds = addrspace(3) global [512 x float] undef, align 4
7 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
; Loads one float from %in[workitem.id.x] and stores that same value twice into
; @lds, at element index x and at element index x + 8.
; Both check prefixes expect the two LDS stores to be emitted as one
; ds_write2_b32 that uses the same data register for both operands and
; offset1:8.
9 define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
10 ; CI-LABEL: simple_write2_one_val_f32:
12 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
13 ; CI-NEXT: s_mov_b32 s3, 0xf000
14 ; CI-NEXT: s_mov_b32 s2, 0
15 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16 ; CI-NEXT: v_mov_b32_e32 v1, 0
17 ; CI-NEXT: s_waitcnt lgkmcnt(0)
18 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19 ; CI-NEXT: s_mov_b32 m0, -1
20 ; CI-NEXT: s_waitcnt vmcnt(0)
21 ; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
24 ; GFX9-LABEL: simple_write2_one_val_f32:
26 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
27 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
28 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
30 ; GFX9-NEXT: s_waitcnt vmcnt(0)
31 ; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
33 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34 %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
35 %val = load float, ptr addrspace(1) %in.gep, align 4
36 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
37 store float %val, ptr addrspace(3) %arrayidx0, align 4
38 %add.x = add nsw i32 %x.i, 8
39 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
40 store float %val, ptr addrspace(3) %arrayidx1, align 4
; Loads two adjacent floats (%in[x] and %in[x + 1]) with volatile loads and
; stores the first to @lds[x] and the second to @lds[x + 8].
; The checks expect each volatile load to carry glc and be followed by its own
; s_waitcnt vmcnt(0), and the two LDS stores to be emitted as one ds_write2_b32
; with two distinct data registers and offset1:8.
44 define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
45 ; CI-LABEL: simple_write2_two_val_f32:
47 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
48 ; CI-NEXT: s_mov_b32 s3, 0xf000
49 ; CI-NEXT: s_mov_b32 s2, 0
50 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
51 ; CI-NEXT: v_mov_b32_e32 v1, 0
52 ; CI-NEXT: s_waitcnt lgkmcnt(0)
53 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54 ; CI-NEXT: s_waitcnt vmcnt(0)
55 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56 ; CI-NEXT: s_waitcnt vmcnt(0)
57 ; CI-NEXT: s_mov_b32 m0, -1
58 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
61 ; GFX9-LABEL: simple_write2_two_val_f32:
63 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
64 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
65 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
67 ; GFX9-NEXT: s_waitcnt vmcnt(0)
68 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
69 ; GFX9-NEXT: s_waitcnt vmcnt(0)
70 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
74 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
75 %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
76 %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
77 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
78 store float %val0, ptr addrspace(3) %arrayidx0, align 4
79 %add.x = add nsw i32 %x.i, 8
80 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
81 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Negative test: the FIRST of the two LDS stores is volatile.
; Values come from volatile loads of %in0[x] and %in1[x]; they are stored to
; @lds[x] (volatile store) and @lds[x + 8] (plain store).
; The checks expect the stores to stay separate: two ds_write_b32
; instructions, the second with offset:32, and no ds_write2_b32.
85 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
86 ; CI-LABEL: simple_write2_two_val_f32_volatile_0:
88 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
89 ; CI-NEXT: s_mov_b32 s7, 0xf000
90 ; CI-NEXT: s_mov_b32 s6, 0
91 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
92 ; CI-NEXT: v_mov_b32_e32 v1, 0
93 ; CI-NEXT: s_waitcnt lgkmcnt(0)
94 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
95 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
96 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
97 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
98 ; CI-NEXT: s_waitcnt vmcnt(0)
99 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
100 ; CI-NEXT: s_waitcnt vmcnt(0)
101 ; CI-NEXT: s_mov_b32 m0, -1
102 ; CI-NEXT: ds_write_b32 v0, v2
103 ; CI-NEXT: ds_write_b32 v0, v1 offset:32
106 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
108 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
109 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
113 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
115 ; GFX9-NEXT: ds_write_b32 v0, v1
116 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
117 ; GFX9-NEXT: s_endpgm
118 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
119 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
120 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
121 %val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
122 %val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
123 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
; The volatile qualifier on this store is the property under test.
124 store volatile float %val0, ptr addrspace(3) %arrayidx0, align 4
125 %add.x = add nsw i32 %x.i, 8
126 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
127 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Negative test: the SECOND of the two LDS stores is volatile.
; Values come from volatile loads of %in0[x] and %in1[x]; they are stored to
; @lds[x] (plain store) and @lds[x + 8] (volatile store).
; The checks expect the stores to stay separate: two ds_write_b32
; instructions, the second with offset:32, and no ds_write2_b32.
131 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
132 ; CI-LABEL: simple_write2_two_val_f32_volatile_1:
134 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
135 ; CI-NEXT: s_mov_b32 s7, 0xf000
136 ; CI-NEXT: s_mov_b32 s6, 0
137 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
138 ; CI-NEXT: v_mov_b32_e32 v1, 0
139 ; CI-NEXT: s_waitcnt lgkmcnt(0)
140 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
141 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
142 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
143 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
144 ; CI-NEXT: s_waitcnt vmcnt(0)
145 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
146 ; CI-NEXT: s_waitcnt vmcnt(0)
147 ; CI-NEXT: s_mov_b32 m0, -1
148 ; CI-NEXT: ds_write_b32 v0, v2
149 ; CI-NEXT: ds_write_b32 v0, v1 offset:32
152 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
154 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
155 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
156 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
159 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
161 ; GFX9-NEXT: ds_write_b32 v0, v1
162 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
163 ; GFX9-NEXT: s_endpgm
164 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
166 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
167 %val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
168 %val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
169 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
170 store float %val0, ptr addrspace(3) %arrayidx0, align 4
171 %add.x = add nsw i32 %x.i, 8
172 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
; The volatile qualifier on this store is the property under test.
173 store volatile float %val1, ptr addrspace(3) %arrayidx1, align 4
177 ; The two data operands are subregisters taken from different super registers.
178 ; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
179 ; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
180 ; early legalization of the constant bus constraint on the v_lshl_add_u32,
181 ; and then SIFoldOperands folds in an unlucky order.
; Performs two volatile <2 x float> loads (%in[x] and the following vector),
; then stores element 0 of the first vector to @lds[x] and element 1 of the
; second vector to @lds[x + 8].
; The checks expect one ds_write2_b32 with offset1:8 whose two data operands
; are single registers picked out of the two different loaded register ranges.
182 define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
183 ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
185 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
186 ; CI-NEXT: s_mov_b32 s3, 0xf000
187 ; CI-NEXT: s_mov_b32 s2, 0
188 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
189 ; CI-NEXT: v_mov_b32_e32 v2, 0
190 ; CI-NEXT: s_waitcnt lgkmcnt(0)
191 ; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192 ; CI-NEXT: s_waitcnt vmcnt(0)
193 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194 ; CI-NEXT: s_waitcnt vmcnt(0)
195 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
196 ; CI-NEXT: s_mov_b32 m0, -1
197 ; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
200 ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
202 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
203 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
204 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
205 ; GFX9-NEXT: ; kill: killed $vgpr4
206 ; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc
209 ; GFX9-NEXT: s_waitcnt vmcnt(0)
210 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
212 ; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
213 ; GFX9-NEXT: s_endpgm
214 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
215 %in.gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
216 %in.gep.1 = getelementptr <2 x float>, ptr addrspace(1) %in.gep.0, i32 1
217 %val0 = load volatile <2 x float>, ptr addrspace(1) %in.gep.0, align 8
218 %val1 = load volatile <2 x float>, ptr addrspace(1) %in.gep.1, align 8
; Low element of the first vector, high element of the second vector.
219 %val0.0 = extractelement <2 x float> %val0, i32 0
220 %val1.1 = extractelement <2 x float> %val1, i32 1
221 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
222 store float %val0.0, ptr addrspace(3) %arrayidx0, align 4
223 %add.x = add nsw i32 %x.i, 8
224 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
225 store float %val1.1, ptr addrspace(3) %arrayidx1, align 4
; Performs a single <2 x float> load from %in[x] and stores element 0 to
; @lds[x] and element 1 to @lds[x + 8].
; The checks expect one ds_write2_b32 with offset1:8 whose data operands are
; the two registers of the single loaded register pair.
229 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
230 ; CI-LABEL: simple_write2_two_val_subreg2_f32:
232 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
233 ; CI-NEXT: s_mov_b32 s3, 0xf000
234 ; CI-NEXT: s_mov_b32 s2, 0
235 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
236 ; CI-NEXT: v_mov_b32_e32 v2, 0
237 ; CI-NEXT: s_waitcnt lgkmcnt(0)
238 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
239 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
240 ; CI-NEXT: s_mov_b32 m0, -1
241 ; CI-NEXT: s_waitcnt vmcnt(0)
242 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
245 ; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
247 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
248 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
249 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
250 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
252 ; GFX9-NEXT: s_waitcnt vmcnt(0)
253 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
254 ; GFX9-NEXT: s_endpgm
255 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
256 %in.gep = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
257 %val = load <2 x float>, ptr addrspace(1) %in.gep, align 8
258 %val0 = extractelement <2 x float> %val, i32 0
259 %val1 = extractelement <2 x float> %val, i32 1
260 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
261 store float %val0, ptr addrspace(3) %arrayidx0, align 4
262 %add.x = add nsw i32 %x.i, 8
263 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
264 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Performs a single <4 x float> load from %in[x] and stores element 0 to
; @lds[x] and element 3 to @lds[x + 8].
; The checks expect one ds_write2_b32 with offset1:8 whose data operands are
; the first and last registers of the loaded four-register range.
268 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
269 ; CI-LABEL: simple_write2_two_val_subreg4_f32:
271 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
272 ; CI-NEXT: s_mov_b32 s3, 0xf000
273 ; CI-NEXT: s_mov_b32 s2, 0
274 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
275 ; CI-NEXT: v_mov_b32_e32 v2, 0
276 ; CI-NEXT: s_waitcnt lgkmcnt(0)
277 ; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
278 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
279 ; CI-NEXT: s_mov_b32 m0, -1
280 ; CI-NEXT: s_waitcnt vmcnt(0)
281 ; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
284 ; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
286 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
287 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
288 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
291 ; GFX9-NEXT: s_waitcnt vmcnt(0)
292 ; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
293 ; GFX9-NEXT: s_endpgm
294 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
295 %in.gep = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %x.i
296 %val = load <4 x float>, ptr addrspace(1) %in.gep, align 16
297 %val0 = extractelement <4 x float> %val, i32 0
298 %val1 = extractelement <4 x float> %val, i32 3
299 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
300 store float %val0, ptr addrspace(3) %arrayidx0, align 4
301 %add.x = add nsw i32 %x.i, 8
302 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
303 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Same shape as the two-value f32 test, but the second store goes to
; @lds[x + 255] instead of @lds[x + 8].
; The checks expect the stores to still be emitted as one ds_write2_b32, now
; with offset1:255.
; NOTE(review): the function name presents 255 as the largest offset that can
; be combined; that limit is not stated anywhere in this file - confirm
; against the ISA documentation.
307 define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
308 ; CI-LABEL: simple_write2_two_val_max_offset_f32:
310 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
311 ; CI-NEXT: s_mov_b32 s3, 0xf000
312 ; CI-NEXT: s_mov_b32 s2, 0
313 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
314 ; CI-NEXT: v_mov_b32_e32 v1, 0
315 ; CI-NEXT: s_waitcnt lgkmcnt(0)
316 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
317 ; CI-NEXT: s_waitcnt vmcnt(0)
318 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
319 ; CI-NEXT: s_waitcnt vmcnt(0)
320 ; CI-NEXT: s_mov_b32 m0, -1
321 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
324 ; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
326 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
327 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
328 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
331 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
332 ; GFX9-NEXT: s_waitcnt vmcnt(0)
333 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
334 ; GFX9-NEXT: s_endpgm
335 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
336 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
337 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
338 %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
339 %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
340 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
341 store float %val0, ptr addrspace(3) %arrayidx0, align 4
342 %add.x = add nsw i32 %x.i, 255
343 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
344 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Negative test: the second store goes to @lds[x + 257].
; Values are plain (non-volatile) loads of %in0[x] and %in1[x].
; The checks expect the stores to stay separate: ds_write_b32 with no offset
; followed by ds_write_b32 with offset:1028 (257 floats * 4 bytes), and no
; ds_write2_b32.
348 define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
349 ; CI-LABEL: simple_write2_two_val_too_far_f32:
351 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
352 ; CI-NEXT: s_mov_b32 s7, 0xf000
353 ; CI-NEXT: s_mov_b32 s6, 0
354 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; CI-NEXT: v_mov_b32_e32 v1, 0
356 ; CI-NEXT: s_waitcnt lgkmcnt(0)
357 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
358 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
359 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
360 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
361 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
362 ; CI-NEXT: s_mov_b32 m0, -1
363 ; CI-NEXT: s_waitcnt vmcnt(1)
364 ; CI-NEXT: ds_write_b32 v0, v2
365 ; CI-NEXT: s_waitcnt vmcnt(0)
366 ; CI-NEXT: ds_write_b32 v0, v1 offset:1028
369 ; GFX9-LABEL: simple_write2_two_val_too_far_f32:
371 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
372 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
375 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
376 ; GFX9-NEXT: s_waitcnt vmcnt(1)
377 ; GFX9-NEXT: ds_write_b32 v0, v1
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
379 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028
380 ; GFX9-NEXT: s_endpgm
381 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
382 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
383 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
384 %val0 = load float, ptr addrspace(1) %in0.gep, align 4
385 %val1 = load float, ptr addrspace(1) %in1.gep, align 4
386 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
387 store float %val0, ptr addrspace(3) %arrayidx0, align 4
388 %add.x = add nsw i32 %x.i, 257
389 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
390 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Loads %in0[tid] and %in1[tid] and issues four LDS stores into @lds at
; element indices tid + 0, tid + 8, tid + 11 and tid + 27, alternating
; %val0, %val1, %val0, %val1.
; The checks expect the four stores to be emitted as two ds_write2_b32
; instructions: one with offset1:8 and one with offset0:11 offset1:27.
394 define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
395 ; CI-LABEL: simple_write2_two_val_f32_x2:
397 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
398 ; CI-NEXT: s_mov_b32 s7, 0xf000
399 ; CI-NEXT: s_mov_b32 s6, 0
400 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
401 ; CI-NEXT: v_mov_b32_e32 v1, 0
402 ; CI-NEXT: s_waitcnt lgkmcnt(0)
403 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
404 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
405 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
406 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
407 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
408 ; CI-NEXT: s_mov_b32 m0, -1
409 ; CI-NEXT: s_waitcnt vmcnt(0)
410 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
411 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
414 ; GFX9-LABEL: simple_write2_two_val_f32_x2:
416 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
417 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
418 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
419 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
420 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
422 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
423 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
424 ; GFX9-NEXT: s_endpgm
425 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
426 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
427 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
428 %val0 = load float, ptr addrspace(1) %in0.gep, align 4
429 %val1 = load float, ptr addrspace(1) %in1.gep, align 4
; First pair of stores: element indices tid + 0 and tid + 8.
431 %idx.0 = add nsw i32 %tid.x, 0
432 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
433 store float %val0, ptr addrspace(3) %arrayidx0, align 4
435 %idx.1 = add nsw i32 %tid.x, 8
436 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
437 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Second pair of stores: element indices tid + 11 and tid + 27.
439 %idx.2 = add nsw i32 %tid.x, 11
440 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
441 store float %val0, ptr addrspace(3) %arrayidx2, align 4
443 %idx.3 = add nsw i32 %tid.x, 27
444 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
445 store float %val1, ptr addrspace(3) %arrayidx3, align 4
; Variant of the four-store test in which the first store is at a nonzero
; element index: the stores go to @lds at tid + 3, tid + 8, tid + 11 and
; tid + 27, alternating %val0, %val1, %val0, %val1.
; The checks expect two ds_write2_b32 instructions: one with
; offset0:3 offset1:8 and one with offset0:11 offset1:27.
450 define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
451 ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
453 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
454 ; CI-NEXT: s_mov_b32 s7, 0xf000
455 ; CI-NEXT: s_mov_b32 s6, 0
456 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
457 ; CI-NEXT: v_mov_b32_e32 v1, 0
458 ; CI-NEXT: s_waitcnt lgkmcnt(0)
459 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
460 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
461 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
462 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
463 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
464 ; CI-NEXT: s_mov_b32 m0, -1
465 ; CI-NEXT: s_waitcnt vmcnt(0)
466 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
467 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
470 ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
472 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
473 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
476 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
478 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
479 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
480 ; GFX9-NEXT: s_endpgm
481 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
482 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
483 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
484 %val0 = load float, ptr addrspace(1) %in0.gep, align 4
485 %val1 = load float, ptr addrspace(1) %in1.gep, align 4
; First pair of stores: element indices tid + 3 and tid + 8.
487 %idx.0 = add nsw i32 %tid.x, 3
488 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
489 store float %val0, ptr addrspace(3) %arrayidx0, align 4
491 %idx.1 = add nsw i32 %tid.x, 8
492 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
493 store float %val1, ptr addrspace(3) %arrayidx1, align 4
; Second pair of stores: element indices tid + 11 and tid + 27.
495 %idx.2 = add nsw i32 %tid.x, 11
496 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
497 store float %val0, ptr addrspace(3) %arrayidx2, align 4
499 %idx.3 = add nsw i32 %tid.x, 27
500 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
501 store float %val1, ptr addrspace(3) %arrayidx3, align 4
; The two LDS destinations are the two lanes of the kernel argument
; %lds.ptr, a <2 x ptr addrspace(3)>, indexed through a vector getelementptr,
; so the two stores use different base pointers.
; The checks expect the stores to stay separate: two ds_write_b32
; instructions, each with offset:32 and each using a different address
; register, and no ds_write2_b32.
506 define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 {
507 ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
509 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
510 ; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x6
511 ; CI-NEXT: s_mov_b32 s7, 0xf000
512 ; CI-NEXT: s_mov_b32 s6, 0
513 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
514 ; CI-NEXT: s_waitcnt lgkmcnt(0)
515 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
516 ; CI-NEXT: v_mov_b32_e32 v1, 0
517 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
518 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
519 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
520 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
521 ; CI-NEXT: v_mov_b32_e32 v1, s8
522 ; CI-NEXT: s_mov_b32 m0, -1
523 ; CI-NEXT: v_mov_b32_e32 v3, s9
524 ; CI-NEXT: s_waitcnt vmcnt(1)
525 ; CI-NEXT: ds_write_b32 v1, v2 offset:32
526 ; CI-NEXT: s_waitcnt vmcnt(0)
527 ; CI-NEXT: ds_write_b32 v3, v0 offset:32
530 ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
532 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
533 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
534 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
535 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
537 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
538 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
539 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
540 ; GFX9-NEXT: s_waitcnt vmcnt(1)
541 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32
542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
543 ; GFX9-NEXT: ds_write_b32 v3, v2 offset:32
544 ; GFX9-NEXT: s_endpgm
545 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
546 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
547 %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
548 %val0 = load float, ptr addrspace(1) %in0.gep, align 4
549 %val1 = load float, ptr addrspace(1) %in1.gep, align 4
; NOTE(review): both insertelement instructions below write lane 0, so the
; constant 8 replaces %x.i and lane 1 of the index vector remains undef.
; The expected output above was generated from this exact IR; confirm the
; repeated lane index is intentional before changing it.
551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
553 %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
554 %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
555 %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
557 ; Apply an additional offset after the vector that will be more obviously folded.
558 %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
559 store float %val0, ptr addrspace(3) %gep.0, align 4
; NOTE(review): %add.x has no users in the lines of this function shown here.
561 %add.x = add nsw i32 %x.i, 8
562 store float %val1, ptr addrspace(3) %gep.1.offset, align 4
; 64-bit counterpart of the one-value test: loads one double from %in[x] and
; stores that same value to @lds.f64[x] and @lds.f64[x + 8], both stores with
; align 8.
; The checks expect one ds_write2_b64 that uses the same register pair for
; both data operands and offset1:8.
566 define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
567 ; CI-LABEL: simple_write2_one_val_f64:
569 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
570 ; CI-NEXT: s_mov_b32 s3, 0xf000
571 ; CI-NEXT: s_mov_b32 s2, 0
572 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
573 ; CI-NEXT: v_mov_b32_e32 v1, 0
574 ; CI-NEXT: s_waitcnt lgkmcnt(0)
575 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
576 ; CI-NEXT: s_mov_b32 m0, -1
577 ; CI-NEXT: s_waitcnt vmcnt(0)
578 ; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
581 ; GFX9-LABEL: simple_write2_one_val_f64:
583 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
584 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
587 ; GFX9-NEXT: s_waitcnt vmcnt(0)
588 ; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
589 ; GFX9-NEXT: s_endpgm
590 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
591 %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
592 %val = load double, ptr addrspace(1) %in.gep, align 8
593 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
594 store double %val, ptr addrspace(3) %arrayidx0, align 8
595 %add.x = add nsw i32 %x.i, 8
596 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
597 store double %val, ptr addrspace(3) %arrayidx1, align 8
; Stores one loaded double twice through the LDS pointer argument %lds, at
; element indices x and x + 7, with both stores marked align 4 rather than
; align 8.
; The checks expect no ds_write2_b64; instead each double store is emitted as
; a ds_write2_b32 of the value's two 32-bit halves: offset1:1 for the first
; store and offset0:14 offset1:15 for the second.
601 define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
602 ; CI-LABEL: misaligned_simple_write2_one_val_f64:
604 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
605 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
606 ; CI-NEXT: s_mov_b32 s3, 0xf000
607 ; CI-NEXT: s_mov_b32 s2, 0
608 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
609 ; CI-NEXT: v_mov_b32_e32 v1, 0
610 ; CI-NEXT: s_waitcnt lgkmcnt(0)
611 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
612 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
613 ; CI-NEXT: s_mov_b32 m0, -1
614 ; CI-NEXT: s_waitcnt vmcnt(0)
615 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
616 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
619 ; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
621 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
622 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10
623 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
624 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
625 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
626 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v2
627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
628 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
629 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
630 ; GFX9-NEXT: s_endpgm
631 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
632 %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
633 %val = load double, ptr addrspace(1) %in.gep, align 8
634 %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
; The align 4 on the two double stores is the property under test.
635 store double %val, ptr addrspace(3) %arrayidx0, align 4
636 %add.x = add nsw i32 %x.i, 7
637 %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
638 store double %val, ptr addrspace(3) %arrayidx1, align 4
; Stores one loaded double twice with align 1, at byte offsets 5 and 9 from
; the address of %lds[x] (double-indexed).
; Expected output differs by run configuration:
;  - the CI run and the GFX9 run with -unaligned-access-mode expect the stores
;    to be emitted as individual byte stores (ds_write_b8, plus
;    ds_write_b8_d16_hi on GFX9);
;  - the GFX9 run with +unaligned-access-mode expects two ds_write_b64
;    instructions with offset:5 and offset:9.
; No configuration expects a ds_write2 instruction.
642 define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
643 ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
645 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
646 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
647 ; CI-NEXT: s_mov_b32 s3, 0xf000
648 ; CI-NEXT: s_mov_b32 s2, 0
649 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
650 ; CI-NEXT: v_mov_b32_e32 v1, 0
651 ; CI-NEXT: s_waitcnt lgkmcnt(0)
652 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
653 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
654 ; CI-NEXT: s_mov_b32 m0, -1
655 ; CI-NEXT: s_waitcnt vmcnt(0)
656 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
657 ; CI-NEXT: ds_write_b8 v0, v1 offset:5
658 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
659 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
660 ; CI-NEXT: ds_write_b8 v0, v1 offset:9
661 ; CI-NEXT: ds_write_b8 v0, v2 offset:13
662 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
663 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
664 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
665 ; CI-NEXT: ds_write_b8 v0, v3 offset:8
666 ; CI-NEXT: ds_write_b8 v0, v4 offset:7
667 ; CI-NEXT: ds_write_b8 v0, v5 offset:6
668 ; CI-NEXT: ds_write_b8 v0, v3 offset:12
669 ; CI-NEXT: ds_write_b8 v0, v4 offset:11
670 ; CI-NEXT: ds_write_b8 v0, v5 offset:10
671 ; CI-NEXT: ds_write_b8 v0, v1 offset:16
672 ; CI-NEXT: ds_write_b8 v0, v6 offset:15
673 ; CI-NEXT: ds_write_b8 v0, v2 offset:14
676 ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
677 ; GFX9-ALIGNED: ; %bb.0:
678 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
679 ; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
680 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
681 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
683 ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s2, v2
684 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
685 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
686 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5
687 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
688 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0
689 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11
690 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9
691 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15
692 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13
693 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1
694 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
695 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8
696 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6
697 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12
698 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10
699 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16
700 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14
701 ; GFX9-ALIGNED-NEXT: s_endpgm
703 ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
704 ; GFX9-UNALIGNED: ; %bb.0:
705 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
706 ; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
707 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
708 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
710 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s2, v2
711 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
712 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
713 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
714 ; GFX9-UNALIGNED-NEXT: s_endpgm
715 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
716 %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
717 %val = load double, ptr addrspace(1) %in.gep, align 8
718 %base = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
; Byte-granular offsets (5 and 9) from %base; the two 8-byte stores overlap.
719 %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
720 store double %val, ptr addrspace(3) %addr0.i8, align 1
721 %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
722 store double %val, ptr addrspace(3) %addr1.i8, align 1
; Two doubles are loaded with volatile loads from consecutive elements of %in
; and stored to @lds.f64 at element indices %x.i and %x.i + 8. The checks for
; both targets expect the two stores to be emitted as one ds_write2_b64 with
; offset1:8.
726 define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
727 ; CI-LABEL: simple_write2_two_val_f64:
729 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
730 ; CI-NEXT: s_mov_b32 s3, 0xf000
731 ; CI-NEXT: s_mov_b32 s2, 0
732 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
733 ; CI-NEXT: v_mov_b32_e32 v1, 0
734 ; CI-NEXT: s_waitcnt lgkmcnt(0)
735 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
736 ; CI-NEXT: s_waitcnt vmcnt(0)
737 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
738 ; CI-NEXT: s_waitcnt vmcnt(0)
739 ; CI-NEXT: s_mov_b32 m0, -1
740 ; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
743 ; GFX9-LABEL: simple_write2_two_val_f64:
745 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
746 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
747 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
749 ; GFX9-NEXT: s_waitcnt vmcnt(0)
750 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
751 ; GFX9-NEXT: s_waitcnt vmcnt(0)
752 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
753 ; GFX9-NEXT: s_endpgm
754 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
755 %in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
756 %in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
757 %val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
758 %val1 = load volatile double, ptr addrspace(1) %in.gep.1, align 8
759 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
760 store double %val0, ptr addrspace(3) %arrayidx0, align 8
761 %add.x = add nsw i32 %x.i, 8
762 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
763 store double %val1, ptr addrspace(3) %arrayidx1, align 8
; Four-element i32 array in addrspace(3), the target of the constant stores
; in the kernel below.
767 @foo = addrspace(3) global [4 x i32] undef, align 4
; Stores the constant 123 to elements 0 and 1 of @foo. The checks expect the
; two adjacent i32 stores to be emitted as a single ds_write_b64.
769 define amdgpu_kernel void @store_constant_adjacent_offsets() {
770 ; CI-LABEL: store_constant_adjacent_offsets:
772 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
773 ; CI-NEXT: v_mov_b32_e32 v1, v0
774 ; CI-NEXT: v_mov_b32_e32 v2, 0
775 ; CI-NEXT: s_mov_b32 m0, -1
776 ; CI-NEXT: ds_write_b64 v2, v[0:1]
779 ; GFX9-LABEL: store_constant_adjacent_offsets:
781 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
782 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
783 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
784 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
785 ; GFX9-NEXT: s_endpgm
786 store i32 123, ptr addrspace(3) @foo, align 4
787 store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
; Stores the constant 123 to elements 0 and 2 of @foo, leaving element 1
; untouched. The checks expect one ds_write2_b32 with offset1:2 that writes the
; same register twice.
791 define amdgpu_kernel void @store_constant_disjoint_offsets() {
792 ; CI-LABEL: store_constant_disjoint_offsets:
794 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
795 ; CI-NEXT: v_mov_b32_e32 v1, 0
796 ; CI-NEXT: s_mov_b32 m0, -1
797 ; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
800 ; GFX9-LABEL: store_constant_disjoint_offsets:
802 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
803 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
804 ; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
805 ; GFX9-NEXT: s_endpgm
806 store i32 123, ptr addrspace(3) @foo, align 4
807 store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
; Four-element i64 array in addrspace(3) declared with only 4-byte alignment.
811 @bar = addrspace(3) global [4 x i64] undef, align 4
; Stores the i64 constant 123 to elements 0 and 1 of @bar, each store marked
; align 4. The checks expect both stores to be emitted as a single
; ds_write_b128 of the register quad v[0:3].
813 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
814 ; CI-LABEL: store_misaligned64_constant_offsets:
816 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
817 ; CI-NEXT: v_mov_b32_e32 v1, 0
818 ; CI-NEXT: v_mov_b32_e32 v2, v0
819 ; CI-NEXT: v_mov_b32_e32 v3, v1
820 ; CI-NEXT: s_mov_b32 m0, -1
821 ; CI-NEXT: ds_write_b128 v1, v[0:3]
824 ; GFX9-LABEL: store_misaligned64_constant_offsets:
826 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
827 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
828 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
829 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
830 ; GFX9-NEXT: ds_write_b128 v1, v[0:3]
831 ; GFX9-NEXT: s_endpgm
832 store i64 123, ptr addrspace(3) @bar, align 4
833 store i64 123, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
; 4096-element i64 array in addrspace(3) declared with only 4-byte alignment.
837 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
; Stores the i64 constant 123 to elements 2048 and 4095 of @bar.large, i.e.
; byte offsets 16384 and 32760. The checks expect the stores to stay separate:
; two ds_write_b64 instructions carrying those byte offsets.
839 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
840 ; CI-LABEL: store_misaligned64_constant_large_offsets:
842 ; CI-NEXT: s_mov_b64 s[0:1], 0x7b
843 ; CI-NEXT: v_mov_b32_e32 v0, s0
844 ; CI-NEXT: v_mov_b32_e32 v2, 0
845 ; CI-NEXT: v_mov_b32_e32 v1, s1
846 ; CI-NEXT: s_mov_b32 m0, -1
847 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
848 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
851 ; GFX9-LABEL: store_misaligned64_constant_large_offsets:
853 ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
854 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
855 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
856 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
857 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
858 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
859 ; GFX9-NEXT: s_endpgm
860 store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
861 store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
; Two float arrays in addrspace(3) written by the kernel below. The expected
; code addresses @sgemm.lA at byte offset 0xc20 (776 floats * 4 bytes) and
; @sgemm.lB with no added base offset.
865 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
866 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
; Stores one loaded float to ten LDS slots:
;   @sgemm.lA at indices x, x+1, x+16, x+17   (x = workgroup id x)
;   @sgemm.lB at indices y, y+1, y+32, y+33, y+64, y+65   (y = workitem id y)
; The checks expect the ten stores to be paired into five ds_write2_b32
; instructions, each covering two consecutive elements.
868 define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 {
869 ; CI-LABEL: write2_sgemm_sequence:
871 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
872 ; CI-NEXT: s_mov_b32 m0, -1
873 ; CI-NEXT: s_waitcnt lgkmcnt(0)
874 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
875 ; CI-NEXT: s_lshl_b32 s1, s8, 2
876 ; CI-NEXT: s_add_i32 s2, s1, 0xc20
877 ; CI-NEXT: s_addk_i32 s1, 0xc60
878 ; CI-NEXT: v_mov_b32_e32 v0, s2
879 ; CI-NEXT: s_waitcnt lgkmcnt(0)
880 ; CI-NEXT: v_mov_b32_e32 v2, s0
881 ; CI-NEXT: v_mov_b32_e32 v3, s0
882 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
883 ; CI-NEXT: v_mov_b32_e32 v0, s1
884 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
885 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
886 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
887 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
888 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
891 ; GFX9-LABEL: write2_sgemm_sequence:
893 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
894 ; GFX9-NEXT: s_lshl_b32 s2, s8, 2
895 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
896 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
897 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
898 ; GFX9-NEXT: s_addk_i32 s2, 0xc60
899 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
900 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
901 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
902 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
903 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
904 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
905 ; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
906 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
907 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
908 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
909 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
910 ; GFX9-NEXT: s_endpgm
911 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
912 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
913 %val = load float, ptr addrspace(1) %in
914 %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
915 store float %val, ptr addrspace(3) %arrayidx44, align 4
916 %add47 = add nsw i32 %x.i, 1
917 %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
918 store float %val, ptr addrspace(3) %arrayidx48, align 4
919 %add51 = add nsw i32 %x.i, 16
920 %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
921 store float %val, ptr addrspace(3) %arrayidx52, align 4
922 %add55 = add nsw i32 %x.i, 17
923 %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
924 store float %val, ptr addrspace(3) %arrayidx56, align 4
925 %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
926 store float %val, ptr addrspace(3) %arrayidx60, align 4
927 %add63 = add nsw i32 %y.i, 1
928 %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
929 store float %val, ptr addrspace(3) %arrayidx64, align 4
930 %add67 = add nsw i32 %y.i, 32
931 %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
932 store float %val, ptr addrspace(3) %arrayidx68, align 4
933 %add71 = add nsw i32 %y.i, 33
934 %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
935 store float %val, ptr addrspace(3) %arrayidx72, align 4
936 %add75 = add nsw i32 %y.i, 64
937 %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
938 store float %val, ptr addrspace(3) %arrayidx76, align 4
939 %add79 = add nsw i32 %y.i, 65
940 %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
941 store float %val, ptr addrspace(3) %arrayidx80, align 4
; Loads a <4 x float> from %in and stores it to %out[%x.i] in addrspace(3)
; with only 4-byte alignment. All three check variants expect the vector store
; to be emitted as two ds_write2_b32 instructions (dwords 0/1 and dwords 2/3).
; The two GFX9 variants differ only in register assignment and in which pair
; is written first.
945 define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 {
946 ; CI-LABEL: simple_write2_v4f32_superreg_align4:
948 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
949 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
950 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
951 ; CI-NEXT: s_mov_b32 m0, -1
952 ; CI-NEXT: s_waitcnt lgkmcnt(0)
953 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
954 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
955 ; CI-NEXT: s_waitcnt lgkmcnt(0)
956 ; CI-NEXT: v_mov_b32_e32 v1, s0
957 ; CI-NEXT: v_mov_b32_e32 v2, s1
958 ; CI-NEXT: v_mov_b32_e32 v3, s2
959 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
960 ; CI-NEXT: v_mov_b32_e32 v1, s3
961 ; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
964 ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
965 ; GFX9-ALIGNED: ; %bb.0:
966 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
967 ; GFX9-ALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
968 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
969 ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8
970 ; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
971 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
972 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
973 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
974 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
975 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
976 ; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
977 ; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
978 ; GFX9-ALIGNED-NEXT: s_endpgm
980 ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
981 ; GFX9-UNALIGNED: ; %bb.0:
982 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
983 ; GFX9-UNALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
984 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8
986 ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
987 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
988 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
989 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
990 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s0
991 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
992 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
993 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
994 ; GFX9-UNALIGNED-NEXT: s_endpgm
995 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
996 %in.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %in
997 %val0 = load <4 x float>, ptr addrspace(1) %in.gep, align 4
998 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(3) %out, i32 %x.i
999 store <4 x float> %val0, ptr addrspace(3) %out.gep, align 4
; Array of <2 x i32> in addrspace(3) declared with byte alignment.
1003 @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
; Stores <i32 123, i32 456> at byte offset 65 of @v2i32_align1 with align 1.
; The bonaire and GFX9 aligned-mode checks expect the store to be split into
; eight ds_write_b8 instructions covering offsets 65 through 72. The GFX9
; unaligned-access-mode checks expect a single ds_write_b64 at offset 65.
1005 define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1006 ; CI-LABEL: write2_v2i32_align1_odd_offset:
1007 ; CI: ; %bb.0: ; %entry
1008 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
1009 ; CI-NEXT: v_mov_b32_e32 v1, 0
1010 ; CI-NEXT: s_mov_b32 m0, -1
1011 ; CI-NEXT: ds_write_b8 v1, v0 offset:65
1012 ; CI-NEXT: v_mov_b32_e32 v0, 1
1013 ; CI-NEXT: ds_write_b8 v1, v0 offset:70
1014 ; CI-NEXT: v_mov_b32_e32 v0, 0xc8
1015 ; CI-NEXT: ds_write_b8 v1, v0 offset:69
1016 ; CI-NEXT: ds_write_b8 v1, v1 offset:68
1017 ; CI-NEXT: ds_write_b8 v1, v1 offset:67
1018 ; CI-NEXT: ds_write_b8 v1, v1 offset:66
1019 ; CI-NEXT: ds_write_b8 v1, v1 offset:72
1020 ; CI-NEXT: ds_write_b8 v1, v1 offset:71
1023 ; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1024 ; GFX9-ALIGNED: ; %bb.0: ; %entry
1025 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
1026 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
1027 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
1028 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
1029 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
1030 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
1031 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
1032 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
1033 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
1034 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
1035 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
1036 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
1037 ; GFX9-ALIGNED-NEXT: s_endpgm
1039 ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1040 ; GFX9-UNALIGNED: ; %bb.0: ; %entry
1041 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
1042 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
1043 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
1044 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
1045 ; GFX9-UNALIGNED-NEXT: s_endpgm
1047 store <2 x i32> <i32 123, i32 456>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
; AMDGPU workgroup-id and workitem-id intrinsics; every declaration carries
; attribute group #1.
1051 declare i32 @llvm.amdgcn.workgroup.id.x() #1
1052 declare i32 @llvm.amdgcn.workgroup.id.y() #1
1053 declare i32 @llvm.amdgcn.workitem.id.x() #1
1054 declare i32 @llvm.amdgcn.workitem.id.y() #1
; Attribute groups. #0 is attached to kernel definitions and #1 to the
; intrinsic declarations and their call sites.
; NOTE(review): no use of #2 is visible in this part of the file; confirm
; whether it is referenced elsewhere.
1056 attributes #0 = { nounwind }
1057 attributes #1 = { nounwind readnone speculatable }
1058 attributes #2 = { convergent nounwind }