1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
; Addrspace(3) arrays that the kernels below index with the workitem id and
; store into: 512 floats (align 4) and 512 doubles (align 8).
6 @lds = addrspace(3) global [512 x float] undef, align 4
7 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
; One float is loaded from %in[tid] and stored to @lds[tid] and @lds[tid + 8].
; The checks on both targets expect the two stores to be emitted as a single
; ds_write2_b32 that uses the same data register twice with offset1:8.
9 define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
10 ; CI-LABEL: simple_write2_one_val_f32:
12 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
13 ; CI-NEXT: s_mov_b32 s3, 0xf000
14 ; CI-NEXT: s_mov_b32 s2, 0
15 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16 ; CI-NEXT: v_mov_b32_e32 v1, 0
17 ; CI-NEXT: s_waitcnt lgkmcnt(0)
18 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19 ; CI-NEXT: s_mov_b32 m0, -1
20 ; CI-NEXT: s_waitcnt vmcnt(0)
21 ; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
24 ; GFX9-LABEL: simple_write2_one_val_f32:
26 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
27 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
28 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
30 ; GFX9-NEXT: s_waitcnt vmcnt(0)
31 ; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
33 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
35 %val = load float, float addrspace(1)* %in.gep, align 4
36 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
37 store float %val, float addrspace(3)* %arrayidx0, align 4
38 %add.x = add nsw i32 %x.i, 8
39 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
40 store float %val, float addrspace(3)* %arrayidx1, align 4
; Two adjacent floats (%in[tid] and %in[tid] + 1) are read with volatile loads
; and stored to @lds[tid] and @lds[tid + 8]. The checks expect a single
; ds_write2_b32 with two distinct data registers and offset1:8.
44 define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
45 ; CI-LABEL: simple_write2_two_val_f32:
47 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
48 ; CI-NEXT: s_mov_b32 s3, 0xf000
49 ; CI-NEXT: s_mov_b32 s2, 0
50 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
51 ; CI-NEXT: v_mov_b32_e32 v1, 0
52 ; CI-NEXT: s_waitcnt lgkmcnt(0)
53 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54 ; CI-NEXT: s_waitcnt vmcnt(0)
55 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56 ; CI-NEXT: s_waitcnt vmcnt(0)
57 ; CI-NEXT: s_mov_b32 m0, -1
58 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
61 ; GFX9-LABEL: simple_write2_two_val_f32:
63 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
64 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
65 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
67 ; GFX9-NEXT: s_waitcnt vmcnt(0)
68 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
69 ; GFX9-NEXT: s_waitcnt vmcnt(0)
70 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
72 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
74 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
75 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
76 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
77 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
78 store float %val0, float addrspace(3)* %arrayidx0, align 4
79 %add.x = add nsw i32 %x.i, 8
80 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
81 store float %val1, float addrspace(3)* %arrayidx1, align 4
; Same store pattern as the two-value case (@lds[tid] and @lds[tid + 8]), but
; the FIRST store to @lds is volatile. The checks expect the stores to stay as
; two separate ds_write_b32 instructions (the second with offset:32) rather
; than a ds_write2_b32.
85 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
86 ; CI-LABEL: simple_write2_two_val_f32_volatile_0:
88 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
89 ; CI-NEXT: s_mov_b32 s3, 0xf000
90 ; CI-NEXT: s_mov_b32 s2, 0
91 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
92 ; CI-NEXT: v_mov_b32_e32 v1, 0
93 ; CI-NEXT: s_waitcnt lgkmcnt(0)
94 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
95 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
96 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
97 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
98 ; CI-NEXT: s_waitcnt vmcnt(0)
99 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
100 ; CI-NEXT: s_waitcnt vmcnt(0)
101 ; CI-NEXT: s_mov_b32 m0, -1
102 ; CI-NEXT: ds_write_b32 v0, v2
103 ; CI-NEXT: ds_write_b32 v0, v1 offset:32
106 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
108 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
109 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
113 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
115 ; GFX9-NEXT: ds_write_b32 v0, v1
116 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
117 ; GFX9-NEXT: s_endpgm
118 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
119 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
120 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
121 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
122 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
123 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
124 store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
125 %add.x = add nsw i32 %x.i, 8
126 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
127 store float %val1, float addrspace(3)* %arrayidx1, align 4
; Counterpart of the volatile_0 case: here the SECOND store to @lds (the one
; at tid + 8) is volatile. The checks again expect two separate ds_write_b32
; instructions (the second with offset:32) and no ds_write2_b32.
131 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
132 ; CI-LABEL: simple_write2_two_val_f32_volatile_1:
134 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
135 ; CI-NEXT: s_mov_b32 s3, 0xf000
136 ; CI-NEXT: s_mov_b32 s2, 0
137 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
138 ; CI-NEXT: v_mov_b32_e32 v1, 0
139 ; CI-NEXT: s_waitcnt lgkmcnt(0)
140 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
141 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
142 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
143 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
144 ; CI-NEXT: s_waitcnt vmcnt(0)
145 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
146 ; CI-NEXT: s_waitcnt vmcnt(0)
147 ; CI-NEXT: s_mov_b32 m0, -1
148 ; CI-NEXT: ds_write_b32 v0, v2
149 ; CI-NEXT: ds_write_b32 v0, v1 offset:32
152 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
154 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
155 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
156 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
159 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
161 ; GFX9-NEXT: ds_write_b32 v0, v1
162 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
163 ; GFX9-NEXT: s_endpgm
164 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
166 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
167 %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
168 %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
169 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
170 store float %val0, float addrspace(3)* %arrayidx0, align 4
171 %add.x = add nsw i32 %x.i, 8
172 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
173 store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
177 ; 2 data subregisters from different super registers.
178 ; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
179 ; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
180 ; early legalization of the constant bus constraint on the v_lshl_add_u32,
181 ; and then SIFoldOperands folds in an unlucky order.
; Two volatile <2 x float> loads feed the stores: element 0 of the first load
; goes to @lds[tid] and element 1 of the second load goes to @lds[tid + 8].
; The checks expect one ds_write2_b32 (offset1:8) whose two data operands are
; single registers taken from the two different 64-bit load results.
182 define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
183 ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
185 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
186 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
187 ; CI-NEXT: s_mov_b32 s3, 0xf000
188 ; CI-NEXT: s_mov_b32 s2, 0
189 ; CI-NEXT: v_mov_b32_e32 v2, 0
190 ; CI-NEXT: s_waitcnt lgkmcnt(0)
191 ; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192 ; CI-NEXT: s_waitcnt vmcnt(0)
193 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194 ; CI-NEXT: s_waitcnt vmcnt(0)
195 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
196 ; CI-NEXT: s_mov_b32 m0, -1
197 ; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
200 ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
202 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
203 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
204 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
205 ; GFX9-NEXT: ; kill: killed $vgpr4
206 ; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc
209 ; GFX9-NEXT: s_waitcnt vmcnt(0)
210 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
212 ; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
213 ; GFX9-NEXT: s_endpgm
214 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
215 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
216 %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
217 %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
218 %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
219 %val0.0 = extractelement <2 x float> %val0, i32 0
220 %val1.1 = extractelement <2 x float> %val1, i32 1
221 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
222 store float %val0.0, float addrspace(3)* %arrayidx0, align 4
223 %add.x = add nsw i32 %x.i, 8
224 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
225 store float %val1.1, float addrspace(3)* %arrayidx1, align 4
; A single <2 x float> load supplies both values: element 0 is stored to
; @lds[tid] and element 1 to @lds[tid + 8]. The checks expect one
; ds_write2_b32 (offset1:8) whose data operands are the two registers of the
; loaded 64-bit pair.
229 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
230 ; CI-LABEL: simple_write2_two_val_subreg2_f32:
232 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
233 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
234 ; CI-NEXT: s_mov_b32 s3, 0xf000
235 ; CI-NEXT: s_mov_b32 s2, 0
236 ; CI-NEXT: v_mov_b32_e32 v2, 0
237 ; CI-NEXT: s_waitcnt lgkmcnt(0)
238 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
239 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
240 ; CI-NEXT: s_mov_b32 m0, -1
241 ; CI-NEXT: s_waitcnt vmcnt(0)
242 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
245 ; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
247 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
248 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
249 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
250 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
252 ; GFX9-NEXT: s_waitcnt vmcnt(0)
253 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
254 ; GFX9-NEXT: s_endpgm
255 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
256 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
257 %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
258 %val0 = extractelement <2 x float> %val, i32 0
259 %val1 = extractelement <2 x float> %val, i32 1
260 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
261 store float %val0, float addrspace(3)* %arrayidx0, align 4
262 %add.x = add nsw i32 %x.i, 8
263 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
264 store float %val1, float addrspace(3)* %arrayidx1, align 4
; A single <4 x float> load supplies both values: element 0 is stored to
; @lds[tid] and element 3 to @lds[tid + 8]. The checks expect one
; ds_write2_b32 (offset1:8) using the first and last register of the loaded
; four-register tuple.
268 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
269 ; CI-LABEL: simple_write2_two_val_subreg4_f32:
271 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
272 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
273 ; CI-NEXT: s_mov_b32 s3, 0xf000
274 ; CI-NEXT: s_mov_b32 s2, 0
275 ; CI-NEXT: v_mov_b32_e32 v2, 0
276 ; CI-NEXT: s_waitcnt lgkmcnt(0)
277 ; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
278 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
279 ; CI-NEXT: s_mov_b32 m0, -1
280 ; CI-NEXT: s_waitcnt vmcnt(0)
281 ; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
284 ; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
286 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
287 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
288 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
291 ; GFX9-NEXT: s_waitcnt vmcnt(0)
292 ; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
293 ; GFX9-NEXT: s_endpgm
294 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
295 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
296 %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
297 %val0 = extractelement <4 x float> %val, i32 0
298 %val1 = extractelement <4 x float> %val, i32 3
299 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
300 store float %val0, float addrspace(3)* %arrayidx0, align 4
301 %add.x = add nsw i32 %x.i, 8
302 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
303 store float %val1, float addrspace(3)* %arrayidx1, align 4
; Two values are stored to @lds[tid] and @lds[tid + 255]. The checks expect a
; single ds_write2_b32 with offset1:255. Per the function name this is the
; largest element distance that is still combined; the too_far test that
; follows uses 257 and expects separate writes.
307 define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
308 ; CI-LABEL: simple_write2_two_val_max_offset_f32:
310 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
311 ; CI-NEXT: s_mov_b32 s3, 0xf000
312 ; CI-NEXT: s_mov_b32 s2, 0
313 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
314 ; CI-NEXT: v_mov_b32_e32 v1, 0
315 ; CI-NEXT: s_waitcnt lgkmcnt(0)
316 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
317 ; CI-NEXT: s_waitcnt vmcnt(0)
318 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
319 ; CI-NEXT: s_waitcnt vmcnt(0)
320 ; CI-NEXT: s_mov_b32 m0, -1
321 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
324 ; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
326 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
327 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
328 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
331 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
332 ; GFX9-NEXT: s_waitcnt vmcnt(0)
333 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
334 ; GFX9-NEXT: s_endpgm
335 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
336 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
337 %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
338 %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
339 %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
340 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
341 store float %val0, float addrspace(3)* %arrayidx0, align 4
342 %add.x = add nsw i32 %x.i, 255
343 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
344 store float %val1, float addrspace(3)* %arrayidx1, align 4
; Two values are stored to @lds[tid] and @lds[tid + 257]. The checks expect
; the stores to remain two separate ds_write_b32 instructions, the second
; carrying the byte offset 1028 (257 * 4), instead of a ds_write2_b32.
348 define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
349 ; CI-LABEL: simple_write2_two_val_too_far_f32:
351 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
352 ; CI-NEXT: s_mov_b32 s3, 0xf000
353 ; CI-NEXT: s_mov_b32 s2, 0
354 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; CI-NEXT: v_mov_b32_e32 v1, 0
356 ; CI-NEXT: s_waitcnt lgkmcnt(0)
357 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
358 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
359 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
360 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
361 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
362 ; CI-NEXT: s_mov_b32 m0, -1
363 ; CI-NEXT: s_waitcnt vmcnt(1)
364 ; CI-NEXT: ds_write_b32 v0, v2
365 ; CI-NEXT: s_waitcnt vmcnt(0)
366 ; CI-NEXT: ds_write_b32 v0, v1 offset:1028
369 ; GFX9-LABEL: simple_write2_two_val_too_far_f32:
371 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
372 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
375 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
376 ; GFX9-NEXT: s_waitcnt vmcnt(1)
377 ; GFX9-NEXT: ds_write_b32 v0, v1
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
379 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028
380 ; GFX9-NEXT: s_endpgm
381 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
382 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
383 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
384 %val0 = load float, float addrspace(1)* %in0.gep, align 4
385 %val1 = load float, float addrspace(1)* %in1.gep, align 4
386 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
387 store float %val0, float addrspace(3)* %arrayidx0, align 4
388 %add.x = add nsw i32 %x.i, 257
389 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
390 store float %val1, float addrspace(3)* %arrayidx1, align 4
; Four stores of two loaded values to @lds at element indices tid + 0, tid + 8,
; tid + 11 and tid + 27 (alternating %val0 / %val1). The checks expect them to
; be paired into two ds_write2_b32 instructions: one with offset1:8 and one
; with offset0:11 offset1:27.
394 define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
395 ; CI-LABEL: simple_write2_two_val_f32_x2:
397 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
398 ; CI-NEXT: s_mov_b32 s3, 0xf000
399 ; CI-NEXT: s_mov_b32 s2, 0
400 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
401 ; CI-NEXT: v_mov_b32_e32 v1, 0
402 ; CI-NEXT: s_waitcnt lgkmcnt(0)
403 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
404 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
405 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
406 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
407 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
408 ; CI-NEXT: s_mov_b32 m0, -1
409 ; CI-NEXT: s_waitcnt vmcnt(0)
410 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
411 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
414 ; GFX9-LABEL: simple_write2_two_val_f32_x2:
416 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
417 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
418 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
419 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
420 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
422 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
423 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
424 ; GFX9-NEXT: s_endpgm
425 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
426 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
427 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
428 %val0 = load float, float addrspace(1)* %in0.gep, align 4
429 %val1 = load float, float addrspace(1)* %in1.gep, align 4
431 %idx.0 = add nsw i32 %tid.x, 0
432 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
433 store float %val0, float addrspace(3)* %arrayidx0, align 4
435 %idx.1 = add nsw i32 %tid.x, 8
436 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
437 store float %val1, float addrspace(3)* %arrayidx1, align 4
439 %idx.2 = add nsw i32 %tid.x, 11
440 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
441 store float %val0, float addrspace(3)* %arrayidx2, align 4
443 %idx.3 = add nsw i32 %tid.x, 27
444 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
445 store float %val1, float addrspace(3)* %arrayidx3, align 4
; Variant of the x2 test in which the first store is at element index tid + 3
; instead of tid + 0 (the others stay at tid + 8, tid + 11, tid + 27). The
; checks expect two ds_write2_b32 instructions where both offsets of the first
; are non-zero: offset0:3 offset1:8, then offset0:11 offset1:27.
450 define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
451 ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
453 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
454 ; CI-NEXT: s_mov_b32 s3, 0xf000
455 ; CI-NEXT: s_mov_b32 s2, 0
456 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
457 ; CI-NEXT: v_mov_b32_e32 v1, 0
458 ; CI-NEXT: s_waitcnt lgkmcnt(0)
459 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
460 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
461 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
462 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
463 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
464 ; CI-NEXT: s_mov_b32 m0, -1
465 ; CI-NEXT: s_waitcnt vmcnt(0)
466 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
467 ; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
470 ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
472 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
473 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
476 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
478 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
479 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
480 ; GFX9-NEXT: s_endpgm
481 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
482 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
483 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
484 %val0 = load float, float addrspace(1)* %in0.gep, align 4
485 %val1 = load float, float addrspace(1)* %in1.gep, align 4
487 %idx.0 = add nsw i32 %tid.x, 3
488 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
489 store float %val0, float addrspace(3)* %arrayidx0, align 4
491 %idx.1 = add nsw i32 %tid.x, 8
492 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
493 store float %val1, float addrspace(3)* %arrayidx1, align 4
495 %idx.2 = add nsw i32 %tid.x, 11
496 %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
497 store float %val0, float addrspace(3)* %arrayidx2, align 4
499 %idx.3 = add nsw i32 %tid.x, 27
500 %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
501 store float %val1, float addrspace(3)* %arrayidx3, align 4
; The store addresses come from a <2 x float addrspace(3)*> kernel argument
; through a vector getelementptr: %val0 is stored through lane 0 of the result
; and %val1 through lane 1 plus 8 more elements. The two addresses derive from
; different pointers, and the checks expect two separate ds_write_b32
; instructions with different address registers, each with offset:32.
506 define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
507 ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
509 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
510 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
511 ; CI-NEXT: s_mov_b32 s3, 0xf000
512 ; CI-NEXT: s_mov_b32 s2, 0
513 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
514 ; CI-NEXT: s_waitcnt lgkmcnt(0)
515 ; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
516 ; CI-NEXT: v_mov_b32_e32 v1, 0
517 ; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
518 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
519 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
520 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
521 ; CI-NEXT: v_mov_b32_e32 v1, s8
522 ; CI-NEXT: s_mov_b32 m0, -1
523 ; CI-NEXT: v_mov_b32_e32 v3, s9
524 ; CI-NEXT: s_waitcnt vmcnt(1)
525 ; CI-NEXT: ds_write_b32 v1, v2 offset:32
526 ; CI-NEXT: s_waitcnt vmcnt(0)
527 ; CI-NEXT: ds_write_b32 v3, v0 offset:32
530 ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
532 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
533 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
534 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
535 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
537 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
538 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
539 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
540 ; GFX9-NEXT: s_waitcnt vmcnt(1)
541 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32
542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
543 ; GFX9-NEXT: ds_write_b32 v3, v2 offset:32
544 ; GFX9-NEXT: s_endpgm
545 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
546 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
547 %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
548 %val0 = load float, float addrspace(1)* %in0.gep, align 4
549 %val1 = load float, float addrspace(1)* %in1.gep, align 4
; NOTE(review): both insertelement instructions below write lane 0, so the 8
; replaces %x.i in lane 0 and lane 1 of the index vector stays undef. Confirm
; whether lane 1 was intended for the second insert; the generated checks
; correspond to the IR exactly as written here.
551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
553 %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
554 %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
555 %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
557 ; Apply an additional offset after the vector that will be more obviously folded.
558 %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
559 store float %val0, float addrspace(3)* %gep.0, align 4
; NOTE(review): %add.x has no users in the visible body.
561 %add.x = add nsw i32 %x.i, 8
562 store float %val1, float addrspace(3)* %gep.1.offset, align 4
; 64-bit counterpart of the one-value f32 test: one double is loaded and
; stored with align 8 to @lds.f64[tid] and @lds.f64[tid + 8]. The checks expect
; a single ds_write2_b64 using the same 64-bit data register pair twice with
; offset1:8.
566 define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
567 ; CI-LABEL: simple_write2_one_val_f64:
569 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
570 ; CI-NEXT: s_mov_b32 s3, 0xf000
571 ; CI-NEXT: s_mov_b32 s2, 0
572 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
573 ; CI-NEXT: v_mov_b32_e32 v1, 0
574 ; CI-NEXT: s_waitcnt lgkmcnt(0)
575 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
576 ; CI-NEXT: s_mov_b32 m0, -1
577 ; CI-NEXT: s_waitcnt vmcnt(0)
578 ; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
581 ; GFX9-LABEL: simple_write2_one_val_f64:
583 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
584 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
587 ; GFX9-NEXT: s_waitcnt vmcnt(0)
588 ; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
589 ; GFX9-NEXT: s_endpgm
590 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
591 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
592 %val = load double, double addrspace(1)* %in.gep, align 8
593 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
594 store double %val, double addrspace(3)* %arrayidx0, align 8
595 %add.x = add nsw i32 %x.i, 8
596 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
597 store double %val, double addrspace(3)* %arrayidx1, align 8
; One double is stored to %lds[tid] and %lds[tid + 7], but the stores are only
; 4-byte aligned (align 4). The checks expect no ds_write2_b64; each 64-bit
; store is instead emitted as a ds_write2_b32 of its two dword halves:
; offset1:1 for the first and offset0:14 offset1:15 for the second.
601 define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
602 ; CI-LABEL: misaligned_simple_write2_one_val_f64:
604 ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
605 ; CI-NEXT: s_load_dword s0, s[0:1], 0xd
606 ; CI-NEXT: s_mov_b32 s7, 0xf000
607 ; CI-NEXT: s_mov_b32 s6, 0
608 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
609 ; CI-NEXT: v_mov_b32_e32 v1, 0
610 ; CI-NEXT: s_waitcnt lgkmcnt(0)
611 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
612 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
613 ; CI-NEXT: s_mov_b32 m0, -1
614 ; CI-NEXT: s_waitcnt vmcnt(0)
615 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
616 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
619 ; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
621 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
622 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
623 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
624 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
625 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
626 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v2
627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
628 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
629 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
630 ; GFX9-NEXT: s_endpgm
631 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
632 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
633 %val = load double, double addrspace(1)* %in.gep, align 8
634 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
635 store double %val, double addrspace(3)* %arrayidx0, align 4
636 %add.x = add nsw i32 %x.i, 7
637 %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
638 store double %val, double addrspace(3)* %arrayidx1, align 4
; One double is stored twice with align 1, at byte offsets 5 and 9 from
; %lds[tid] (so the two 8-byte stores overlap). Expected code differs per run
; line:
;  - CI and the GFX9 run with -unaligned-access-mode expect byte-sized stores
;    (ds_write_b8, plus ds_write_b8_d16_hi on GFX9);
;  - the GFX9 run with +unaligned-access-mode expects two ds_write2_b32
;    (offset1:1) whose address registers are the base plus 5 and plus 9.
642 define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
643 ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
645 ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
646 ; CI-NEXT: s_load_dword s0, s[0:1], 0xd
647 ; CI-NEXT: s_mov_b32 s7, 0xf000
648 ; CI-NEXT: s_mov_b32 s6, 0
649 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
650 ; CI-NEXT: v_mov_b32_e32 v1, 0
651 ; CI-NEXT: s_waitcnt lgkmcnt(0)
652 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
653 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
654 ; CI-NEXT: s_mov_b32 m0, -1
655 ; CI-NEXT: s_waitcnt vmcnt(0)
656 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
657 ; CI-NEXT: ds_write_b8 v0, v1 offset:5
658 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
659 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
660 ; CI-NEXT: ds_write_b8 v0, v2 offset:13
661 ; CI-NEXT: ds_write_b8 v0, v1 offset:9
662 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
663 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
664 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
665 ; CI-NEXT: ds_write_b8 v0, v3 offset:8
666 ; CI-NEXT: ds_write_b8 v0, v4 offset:7
667 ; CI-NEXT: ds_write_b8 v0, v5 offset:6
668 ; CI-NEXT: ds_write_b8 v0, v1 offset:16
669 ; CI-NEXT: ds_write_b8 v0, v6 offset:15
670 ; CI-NEXT: ds_write_b8 v0, v2 offset:14
671 ; CI-NEXT: ds_write_b8 v0, v3 offset:12
672 ; CI-NEXT: ds_write_b8 v0, v4 offset:11
673 ; CI-NEXT: ds_write_b8 v0, v5 offset:10
676 ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
677 ; GFX9-ALIGNED: ; %bb.0:
678 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
679 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34
680 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
681 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
683 ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
684 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
685 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
686 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
687 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5
688 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0
689 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15
690 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13
691 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11
692 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9
693 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1
694 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
695 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8
696 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6
697 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16
698 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14
699 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12
700 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10
701 ; GFX9-ALIGNED-NEXT: s_endpgm
703 ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
704 ; GFX9-UNALIGNED: ; %bb.0:
705 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
706 ; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34
707 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
708 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
710 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
711 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
712 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
713 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
714 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
715 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
716 ; GFX9-UNALIGNED-NEXT: s_endpgm
717 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
718 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
719 %val = load double, double addrspace(1)* %in.gep, align 8
720 %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
721 %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
722 %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
723 %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
724 store double %val, double addrspace(3)* %addr0, align 1
725 %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
726 %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
727 store double %val, double addrspace(3)* %addr1, align 1
; Stores two different f64 values to LDS: %val0 goes to element %x.i and %val1
; to element %x.i + 8 of @lds.f64, both with align 8. Every run line expects
; the pair to be emitted as one ds_write2_b64 with offset1:8. The %C argument
; is not used by the body.
731 define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
732 ; CI-LABEL: simple_write2_two_val_f64:
734 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
735 ; CI-NEXT: s_mov_b32 s3, 0xf000
736 ; CI-NEXT: s_mov_b32 s2, 0
737 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
738 ; CI-NEXT: v_mov_b32_e32 v1, 0
739 ; CI-NEXT: s_waitcnt lgkmcnt(0)
740 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
741 ; CI-NEXT: s_waitcnt vmcnt(0)
742 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
743 ; CI-NEXT: s_waitcnt vmcnt(0)
744 ; CI-NEXT: s_mov_b32 m0, -1
745 ; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
748 ; GFX9-LABEL: simple_write2_two_val_f64:
750 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
751 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
752 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
754 ; GFX9-NEXT: s_waitcnt vmcnt(0)
755 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
756 ; GFX9-NEXT: s_waitcnt vmcnt(0)
757 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
758 ; GFX9-NEXT: s_endpgm
759 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
760 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
761 %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
; Both source loads are volatile; the expected output keeps them as two
; separate dwordx2 loads, each followed by its own s_waitcnt vmcnt(0).
762 %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
763 %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
; First store: element %x.i of the LDS array.
764 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
765 store double %val0, double addrspace(3)* %arrayidx0, align 8
; Second store: eight elements further on, matching offset1:8 in the checks.
766 %add.x = add nsw i32 %x.i, 8
767 %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
768 store double %val1, double addrspace(3)* %arrayidx1, align 8
; Four-element i32 array in addrspace(3), 4-byte aligned; the target of the
; constant stores in the store_constant_*_offsets kernels.
772 @foo = addrspace(3) global [4 x i32] undef, align 4
; Stores the same constant (i32 123) to elements 0 and 1 of @foo, i.e. two
; adjacent dwords, each with align 4. Both the CI and the GFX9 checks expect
; the two stores to come out as a single ds_write_b64 of a register pair that
; holds 0x7b in both halves.
774 define amdgpu_kernel void @store_constant_adjacent_offsets() {
775 ; CI-LABEL: store_constant_adjacent_offsets:
777 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
778 ; CI-NEXT: v_mov_b32_e32 v1, v0
779 ; CI-NEXT: v_mov_b32_e32 v2, 0
780 ; CI-NEXT: s_mov_b32 m0, -1
781 ; CI-NEXT: ds_write_b64 v2, v[0:1]
784 ; GFX9-LABEL: store_constant_adjacent_offsets:
786 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
787 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
788 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
789 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
790 ; GFX9-NEXT: s_endpgm
791 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
792 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
; Stores the same constant (i32 123) to elements 0 and 2 of @foo, leaving
; element 1 untouched. Both the CI and the GFX9 checks expect one
; ds_write2_b32 with offset1:2 that writes the same value register twice.
796 define amdgpu_kernel void @store_constant_disjoint_offsets() {
797 ; CI-LABEL: store_constant_disjoint_offsets:
799 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
800 ; CI-NEXT: v_mov_b32_e32 v1, 0
801 ; CI-NEXT: s_mov_b32 m0, -1
802 ; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
805 ; GFX9-LABEL: store_constant_disjoint_offsets:
807 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
808 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
809 ; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
810 ; GFX9-NEXT: s_endpgm
811 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
812 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @bar_placeholder_never_used, i32 0, i32 2), align 4
; Four-element i64 array in addrspace(3) that is only 4-byte aligned, so its
; 64-bit elements are under-aligned.
816 @bar = addrspace(3) global [4 x i64] undef, align 4
; Stores i64 123 to elements 0 and 1 of @bar, each store marked align 4.
; Both the CI and the GFX9 checks expect the two adjacent 64-bit stores to be
; emitted as a single ds_write_b128 of v[0:3], where v0 and v2 hold 0x7b (the
; low halves) and v1 and v3 hold 0 (the high halves; v1 doubles as the
; address register).
818 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
819 ; CI-LABEL: store_misaligned64_constant_offsets:
821 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
822 ; CI-NEXT: v_mov_b32_e32 v1, 0
823 ; CI-NEXT: v_mov_b32_e32 v2, v0
824 ; CI-NEXT: v_mov_b32_e32 v3, v1
825 ; CI-NEXT: s_mov_b32 m0, -1
826 ; CI-NEXT: ds_write_b128 v1, v[0:3]
829 ; GFX9-LABEL: store_misaligned64_constant_offsets:
831 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
832 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
833 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
834 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
835 ; GFX9-NEXT: ds_write_b128 v1, v[0:3]
836 ; GFX9-NEXT: s_endpgm
837 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
838 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
; 4096-element i64 array in addrspace(3), only 4-byte aligned; large enough
; that element indices translate to byte offsets up to 32760.
842 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
; Stores i64 123 to elements 2048 and 4095 of @bar.large, each store marked
; align 4. Both the CI and the GFX9 checks expect two separate ds_write_b64
; instructions at byte offsets 16384 and 32760 (element index * 8) rather than
; one combined write.
; NOTE(review): presumably the stores stay separate because these offsets
; cannot be expressed by a single write2 encoding - confirm against the
; load/store optimizer's offset limits.
844 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
845 ; CI-LABEL: store_misaligned64_constant_large_offsets:
847 ; CI-NEXT: s_mov_b64 s[0:1], 0x7b
848 ; CI-NEXT: v_mov_b32_e32 v0, s0
849 ; CI-NEXT: v_mov_b32_e32 v2, 0
850 ; CI-NEXT: v_mov_b32_e32 v1, s1
851 ; CI-NEXT: s_mov_b32 m0, -1
852 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
853 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
856 ; GFX9-LABEL: store_misaligned64_constant_large_offsets:
858 ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
859 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
860 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
861 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
862 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
863 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
864 ; GFX9-NEXT: s_endpgm
865 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
866 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
; Two internal float arrays in addrspace(3), both 4-byte aligned: 264 elements
; for lA and 776 elements for lB. They are the store targets of
; @write2_sgemm_sequence.
870 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
871 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
; Stores one loaded float (%val) to ten LDS locations, all with align 4:
;   @sgemm.lA at indices x, x+1, x+16, x+17   (x = workgroup id x)
;   @sgemm.lB at indices y, y+1, y+32, y+33, y+64, y+65   (y = workitem id y)
; Both the CI and the GFX9 checks expect five ds_write2_b32 instructions, one
; per pair of neighbouring indices. The %C, %lda and %ldb arguments are not
; used by the body.
873 define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
874 ; CI-LABEL: write2_sgemm_sequence:
876 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
877 ; CI-NEXT: s_lshl_b32 s2, s2, 2
878 ; CI-NEXT: s_add_i32 s3, s2, 0xc20
879 ; CI-NEXT: v_mov_b32_e32 v0, s3
880 ; CI-NEXT: s_addk_i32 s2, 0xc60
881 ; CI-NEXT: s_waitcnt lgkmcnt(0)
882 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
883 ; CI-NEXT: s_mov_b32 m0, -1
884 ; CI-NEXT: s_waitcnt lgkmcnt(0)
885 ; CI-NEXT: v_mov_b32_e32 v2, s0
886 ; CI-NEXT: v_mov_b32_e32 v3, s0
887 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
888 ; CI-NEXT: v_mov_b32_e32 v0, s2
889 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
890 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
891 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
892 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
893 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
896 ; GFX9-LABEL: write2_sgemm_sequence:
898 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
899 ; GFX9-NEXT: s_lshl_b32 s2, s2, 2
900 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20
901 ; GFX9-NEXT: s_addk_i32 s2, 0xc60
902 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
903 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
904 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
905 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
906 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
907 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
908 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
909 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
910 ; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
911 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
912 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
913 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
914 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
915 ; GFX9-NEXT: s_endpgm
; %x.i is the workgroup id (not a workitem id); the expected output forms the
; lA addresses with scalar instructions (s_lshl_b32 / s_add_i32 / s_addk_i32).
916 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
917 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
; Single source value for all ten stores; the load carries no explicit align.
918 %val = load float, float addrspace(1)* %in
; lA pair 1: indices x and x+1.
919 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
920 store float %val, float addrspace(3)* %arrayidx44, align 4
921 %add47 = add nsw i32 %x.i, 1
922 %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
923 store float %val, float addrspace(3)* %arrayidx48, align 4
; lA pair 2: indices x+16 and x+17.
924 %add51 = add nsw i32 %x.i, 16
925 %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
926 store float %val, float addrspace(3)* %arrayidx52, align 4
927 %add55 = add nsw i32 %x.i, 17
928 %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
929 store float %val, float addrspace(3)* %arrayidx56, align 4
; lB pair 1: indices y and y+1.
930 %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
931 store float %val, float addrspace(3)* %arrayidx60, align 4
932 %add63 = add nsw i32 %y.i, 1
933 %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
934 store float %val, float addrspace(3)* %arrayidx64, align 4
; lB pair 2: indices y+32 and y+33 (offset0:32 offset1:33 in the checks).
935 %add67 = add nsw i32 %y.i, 32
936 %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
937 store float %val, float addrspace(3)* %arrayidx68, align 4
938 %add71 = add nsw i32 %y.i, 33
939 %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
940 store float %val, float addrspace(3)* %arrayidx72, align 4
; lB pair 3: indices y+64 and y+65 (offset0:64 offset1:65 in the checks).
941 %add75 = add nsw i32 %y.i, 64
942 %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
943 store float %val, float addrspace(3)* %arrayidx76, align 4
944 %add79 = add nsw i32 %y.i, 65
945 %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
946 store float %val, float addrspace(3)* %arrayidx80, align 4
; Loads one <4 x float> from %in and stores it to element %x.i of the LDS
; pointer %out, with both the load and the store marked align 4.
; Expected lowering of the 16-byte store differs per run line:
;   - CI and GFX9 with -unaligned-access-mode: two ds_write2_b32
;     (offset1:1, then offset0:2 offset1:3).
;   - GFX9 with +unaligned-access-mode: one ds_write2_b64 with offset1:1.
950 define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
951 ; CI-LABEL: simple_write2_v4f32_superreg_align4:
953 ; CI-NEXT: s_load_dword s4, s[0:1], 0x9
954 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
955 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
956 ; CI-NEXT: s_mov_b32 m0, -1
957 ; CI-NEXT: s_waitcnt lgkmcnt(0)
958 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
959 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
960 ; CI-NEXT: s_waitcnt lgkmcnt(0)
961 ; CI-NEXT: v_mov_b32_e32 v1, s0
962 ; CI-NEXT: v_mov_b32_e32 v2, s1
963 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
964 ; CI-NEXT: v_mov_b32_e32 v3, s2
965 ; CI-NEXT: v_mov_b32_e32 v1, s3
966 ; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
969 ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
970 ; GFX9-ALIGNED: ; %bb.0:
971 ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
972 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
973 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
974 ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
975 ; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
976 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
977 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
978 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
979 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
980 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
981 ; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
982 ; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
983 ; GFX9-ALIGNED-NEXT: s_endpgm
985 ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
986 ; GFX9-UNALIGNED: ; %bb.0:
987 ; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
988 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
989 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
990 ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
991 ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
992 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
993 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
994 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
995 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
996 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
997 ; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
998 ; GFX9-UNALIGNED-NEXT: s_endpgm
999 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; This getelementptr has no indices, so %in.gep is simply %in; the source
; address does not depend on the workitem id.
1000 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
1001 %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
; Destination is indexed per workitem; the align 4 on the 16-byte store is the
; case under test.
1002 %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
1003 store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
; Internal array of 100 <2 x i32> elements in addrspace(3) with only 1-byte
; alignment; the store target of @write2_v2i32_align1_odd_offset.
1007 @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
; Stores the constant vector <i32 123, i32 456> at byte offset 65 (an odd
; address) inside @v2i32_align1, with the store marked align 1.
; Expected lowering differs per run line:
;   - CI and GFX9 with -unaligned-access-mode: eight ds_write_b8 covering
;     offsets 65..72 (0x7b at 65; 0xc8 at 69 and 1 at 70 are the low bytes of
;     456; the remaining bytes are zero).
;   - GFX9 with +unaligned-access-mode: one ds_write2_b32 with offset1:1 at
;     base address 0x41 (= 65), writing 0x7b and 0x1c8.
1009 define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1010 ; CI-LABEL: write2_v2i32_align1_odd_offset:
1011 ; CI: ; %bb.0: ; %entry
1012 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
1013 ; CI-NEXT: v_mov_b32_e32 v1, 0
1014 ; CI-NEXT: s_mov_b32 m0, -1
1015 ; CI-NEXT: ds_write_b8 v1, v0 offset:65
1016 ; CI-NEXT: v_mov_b32_e32 v0, 1
1017 ; CI-NEXT: ds_write_b8 v1, v0 offset:70
1018 ; CI-NEXT: v_mov_b32_e32 v0, 0xc8
1019 ; CI-NEXT: ds_write_b8 v1, v0 offset:69
1020 ; CI-NEXT: ds_write_b8 v1, v1 offset:68
1021 ; CI-NEXT: ds_write_b8 v1, v1 offset:67
1022 ; CI-NEXT: ds_write_b8 v1, v1 offset:66
1023 ; CI-NEXT: ds_write_b8 v1, v1 offset:72
1024 ; CI-NEXT: ds_write_b8 v1, v1 offset:71
1027 ; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1028 ; GFX9-ALIGNED: ; %bb.0: ; %entry
1029 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
1030 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
1031 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
1032 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
1033 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
1034 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
1035 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
1036 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
1037 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
1038 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
1039 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
1040 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
1041 ; GFX9-ALIGNED-NEXT: s_endpgm
1043 ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1044 ; GFX9-UNALIGNED: ; %bb.0: ; %entry
1045 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
1046 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
1047 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
1048 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
1049 ; GFX9-UNALIGNED-NEXT: s_endpgm
1051 store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
; AMDGPU intrinsics returning the workgroup id and workitem id in the x and y
; dimensions; all are declared with attribute group #1.
1055 declare i32 @llvm.amdgcn.workgroup.id.x() #1
1056 declare i32 @llvm.amdgcn.workgroup.id.y() #1
1057 declare i32 @llvm.amdgcn.workitem.id.x() #1
1058 declare i32 @llvm.amdgcn.workitem.id.y() #1
; Attribute groups: #0 is attached to the kernels that take arguments, #1 to
; the intrinsic declarations and their call sites.
; NOTE(review): no use of #2 is visible in this part of the file - confirm
; whether it is still referenced elsewhere.
1060 attributes #0 = { nounwind }
1061 attributes #1 = { nounwind readnone speculatable }
1062 attributes #2 = { convergent nounwind }