1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG
5 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
; ds1align1: copy one i8 between two addrspace(3) pointers, both accesses
; marked align 1. All four llc configurations share the GCN checks and are
; expected to emit a single ds_read_u8 / ds_write_b8 pair.
7 define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
8 ; GCN-LABEL: ds1align1:
10 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
11 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
12 ; GCN-NEXT: v_mov_b32_e32 v0, s0
13 ; GCN-NEXT: ds_read_u8 v0, v0
14 ; GCN-NEXT: v_mov_b32_e32 v1, s1
15 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
16 ; GCN-NEXT: ds_write_b8 v1, v0
; Load and store the same value so both a DS read and a DS write get selected.
18 %val = load i8, ptr addrspace(3) %in, align 1
19 store i8 %val, ptr addrspace(3) %out, align 1
; ds2align1: copy one i16 between two addrspace(3) pointers with align 1.
; With -unaligned-access-mode the checks expect the access to be split into
; two byte-sized DS reads and writes (the GlobalISel variant recombines the
; bytes with v_lshl_or_b32 and splits them again before storing). With
; +unaligned-access-mode a single ds_read_u16 / ds_write_b16 is expected.
23 define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
24 ; ALIGNED-SDAG-LABEL: ds2align1:
25 ; ALIGNED-SDAG: ; %bb.0:
26 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
27 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
28 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
29 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
30 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1
31 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
32 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
33 ; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1
34 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
35 ; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1
36 ; ALIGNED-SDAG-NEXT: s_endpgm
38 ; ALIGNED-GISEL-LABEL: ds2align1:
39 ; ALIGNED-GISEL: ; %bb.0:
40 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
41 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
42 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
43 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
44 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
45 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
46 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
47 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
48 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
49 ; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0
50 ; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1
51 ; ALIGNED-GISEL-NEXT: s_endpgm
53 ; UNALIGNED-LABEL: ds2align1:
55 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
56 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
57 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
58 ; UNALIGNED-NEXT: ds_read_u16 v0, v0
59 ; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
60 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
61 ; UNALIGNED-NEXT: ds_write_b16 v1, v0
62 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
63 %val = load i16, ptr addrspace(3) %in, align 1
64 store i16 %val, ptr addrspace(3) %out, align 1
; ds2align2: copy one i16 between two addrspace(3) pointers with align 2.
; All four llc configurations share the GCN checks and are expected to emit
; a single ds_read_u16 / ds_write_b16 pair.
68 define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
69 ; GCN-LABEL: ds2align2:
71 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
72 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
73 ; GCN-NEXT: v_mov_b32_e32 v0, s0
74 ; GCN-NEXT: ds_read_u16 v0, v0
75 ; GCN-NEXT: v_mov_b32_e32 v1, s1
76 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
77 ; GCN-NEXT: ds_write_b16 v1, v0
; Load and store the same value so both a DS read and a DS write get selected.
79 %val = load i16, ptr addrspace(3) %in, align 2
80 store i16 %val, ptr addrspace(3) %out, align 2
; ds4align1: copy one i32 between two addrspace(3) pointers with align 1.
; With -unaligned-access-mode the checks expect four byte-sized DS reads and
; four byte-sized DS writes (the GlobalISel variant first rebuilds the dword
; with shifts and v_or3_b32, then extracts the bytes again). With
; +unaligned-access-mode a single ds_read_b32 / ds_write_b32 is expected.
84 define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
85 ; ALIGNED-SDAG-LABEL: ds4align1:
86 ; ALIGNED-SDAG: ; %bb.0:
87 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
88 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
89 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
90 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
91 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
92 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
93 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3
94 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
95 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
96 ; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1
97 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
98 ; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1
99 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
100 ; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2
101 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
102 ; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3
103 ; ALIGNED-SDAG-NEXT: s_endpgm
105 ; ALIGNED-GISEL-LABEL: ds4align1:
106 ; ALIGNED-GISEL: ; %bb.0:
107 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
108 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
109 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
110 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
111 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
112 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
113 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3
114 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2
115 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
116 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
117 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
118 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
119 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
120 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
121 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
122 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1
123 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
124 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0
125 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:1
126 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
127 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:2
128 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3
129 ; ALIGNED-GISEL-NEXT: s_endpgm
131 ; UNALIGNED-LABEL: ds4align1:
132 ; UNALIGNED: ; %bb.0:
133 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
134 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
135 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
136 ; UNALIGNED-NEXT: ds_read_b32 v0, v0
137 ; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
138 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
139 ; UNALIGNED-NEXT: ds_write_b32 v1, v0
140 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
141 %val = load i32, ptr addrspace(3) %in, align 1
142 store i32 %val, ptr addrspace(3) %out, align 1
; ds4align2: copy one i32 between two addrspace(3) pointers with align 2.
; With -unaligned-access-mode the checks expect two 16-bit DS reads and two
; 16-bit DS writes (the GlobalISel variant merges the halves with
; v_lshl_or_b32 and stores the upper half via ds_write_b16_d16_hi). With
; +unaligned-access-mode a single ds_read_b32 / ds_write_b32 is expected.
146 define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
147 ; ALIGNED-SDAG-LABEL: ds4align2:
148 ; ALIGNED-SDAG: ; %bb.0:
149 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
150 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
151 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
152 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0
153 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2
154 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
155 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
156 ; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1
157 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
158 ; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2
159 ; ALIGNED-SDAG-NEXT: s_endpgm
161 ; ALIGNED-GISEL-LABEL: ds4align2:
162 ; ALIGNED-GISEL: ; %bb.0:
163 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
164 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
165 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
166 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
167 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2
168 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
169 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
170 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
171 ; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
172 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2
173 ; ALIGNED-GISEL-NEXT: s_endpgm
175 ; UNALIGNED-LABEL: ds4align2:
176 ; UNALIGNED: ; %bb.0:
177 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
178 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
179 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
180 ; UNALIGNED-NEXT: ds_read_b32 v0, v0
181 ; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
182 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
183 ; UNALIGNED-NEXT: ds_write_b32 v1, v0
184 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
185 %val = load i32, ptr addrspace(3) %in, align 2
186 store i32 %val, ptr addrspace(3) %out, align 2
; ds4align4: copy one i32 between two addrspace(3) pointers with align 4.
; All four llc configurations share the GCN checks and are expected to emit
; a single ds_read_b32 / ds_write_b32 pair.
190 define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
191 ; GCN-LABEL: ds4align4:
193 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
194 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
195 ; GCN-NEXT: v_mov_b32_e32 v0, s0
196 ; GCN-NEXT: ds_read_b32 v0, v0
197 ; GCN-NEXT: v_mov_b32_e32 v1, s1
198 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
199 ; GCN-NEXT: ds_write_b32 v1, v0
; Load and store the same value so both a DS read and a DS write get selected.
201 %val = load i32, ptr addrspace(3) %in, align 4
202 store i32 %val, ptr addrspace(3) %out, align 4
; ds8align1: copy a <2 x i32> between two addrspace(3) pointers with align 1.
; With -unaligned-access-mode the checks expect eight byte-sized DS reads and
; eight byte-sized DS writes (the GlobalISel variant rebuilds each dword with
; shifts and v_or3_b32 before splitting it into bytes again). With
; +unaligned-access-mode a single ds_read_b64 / ds_write_b64 is expected.
206 define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
207 ; ALIGNED-SDAG-LABEL: ds8align1:
208 ; ALIGNED-SDAG: ; %bb.0:
209 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
210 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
211 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
212 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
213 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
214 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
215 ; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3
216 ; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4
217 ; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
218 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6
219 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7
220 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1
221 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
222 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4
223 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
224 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5
225 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1
226 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1
227 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2
228 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3
229 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
230 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6
231 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
232 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7
233 ; ALIGNED-SDAG-NEXT: s_endpgm
235 ; ALIGNED-GISEL-LABEL: ds8align1:
236 ; ALIGNED-GISEL: ; %bb.0:
237 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
238 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
239 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
240 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
241 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
242 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
243 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3
244 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4
245 ; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5
246 ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
247 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7
248 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
249 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
250 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
251 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
252 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
253 ; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
254 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
255 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
256 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
257 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
258 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
259 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2
260 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1
261 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
262 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1
263 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1
264 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8
265 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
266 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2
267 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v4 offset:3
268 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
269 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4
270 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5
271 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
272 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6
273 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7
274 ; ALIGNED-GISEL-NEXT: s_endpgm
276 ; UNALIGNED-LABEL: ds8align1:
277 ; UNALIGNED: ; %bb.0:
278 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
279 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
280 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
281 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
282 ; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
283 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
284 ; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
285 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
286 %val = load <2 x i32>, ptr addrspace(3) %in, align 1
287 store <2 x i32> %val, ptr addrspace(3) %out, align 1
; ds8align2: copy a <2 x i32> between two addrspace(3) pointers with align 2.
; With -unaligned-access-mode the checks expect four 16-bit DS reads and four
; 16-bit DS writes (the GlobalISel variant merges halves with v_lshl_or_b32
; and stores the upper halves via ds_write_b16_d16_hi). With
; +unaligned-access-mode a single ds_read_b64 / ds_write_b64 is expected.
291 define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
292 ; ALIGNED-SDAG-LABEL: ds8align2:
293 ; ALIGNED-SDAG: ; %bb.0:
294 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
295 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
296 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
297 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4
298 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
299 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
300 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6
301 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
302 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
303 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4
304 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
305 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2
306 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
307 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:2
308 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
309 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6
310 ; ALIGNED-SDAG-NEXT: s_endpgm
312 ; ALIGNED-GISEL-LABEL: ds8align2:
313 ; ALIGNED-GISEL: ; %bb.0:
314 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
315 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
316 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
317 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
318 ; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
319 ; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
320 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6
321 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
322 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
323 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
324 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
325 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
326 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
327 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
328 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4
329 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6
330 ; ALIGNED-GISEL-NEXT: s_endpgm
332 ; UNALIGNED-LABEL: ds8align2:
333 ; UNALIGNED: ; %bb.0:
334 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
335 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
336 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
337 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
338 ; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
339 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
340 ; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
341 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
342 %val = load <2 x i32>, ptr addrspace(3) %in, align 2
343 store <2 x i32> %val, ptr addrspace(3) %out, align 2
; ds8align4: copy a <2 x i32> between two addrspace(3) pointers with align 4.
; All four llc configurations share the GCN checks and are expected to emit
; ds_read2_b32 / ds_write2_b32 with offset1:1 rather than a 64-bit access.
347 define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
348 ; GCN-LABEL: ds8align4:
350 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
351 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
352 ; GCN-NEXT: v_mov_b32_e32 v0, s0
353 ; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
354 ; GCN-NEXT: v_mov_b32_e32 v2, s1
355 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
356 ; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; Load and store the same value so both a DS read and a DS write get selected.
358 %val = load <2 x i32>, ptr addrspace(3) %in, align 4
359 store <2 x i32> %val, ptr addrspace(3) %out, align 4
; ds8align8: copy a <2 x i32> between two addrspace(3) pointers with align 8.
; All four llc configurations share the GCN checks and are expected to emit
; a single ds_read_b64 / ds_write_b64 pair.
363 define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
364 ; GCN-LABEL: ds8align8:
366 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
367 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
368 ; GCN-NEXT: v_mov_b32_e32 v0, s0
369 ; GCN-NEXT: ds_read_b64 v[0:1], v0
370 ; GCN-NEXT: v_mov_b32_e32 v2, s1
371 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
372 ; GCN-NEXT: ds_write_b64 v2, v[0:1]
; Load and store the same value so both a DS read and a DS write get selected.
374 %val = load <2 x i32>, ptr addrspace(3) %in, align 8
375 store <2 x i32> %val, ptr addrspace(3) %out, align 8
; ds12align1: copy a <3 x i32> between two addrspace(3) pointers with align 1.
; With -unaligned-access-mode the checks expect twelve byte-sized DS reads
; and twelve byte-sized DS writes (the GlobalISel variant rebuilds each dword
; with shifts and v_or3_b32 before splitting it into bytes again). With
; +unaligned-access-mode a single ds_read_b96 / ds_write_b96 is expected.
379 define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
380 ; ALIGNED-SDAG-LABEL: ds12align1:
381 ; ALIGNED-SDAG: ; %bb.0:
382 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
383 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
384 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
385 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
386 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
387 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
388 ; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3
389 ; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4
390 ; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
391 ; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6
392 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7
393 ; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8
394 ; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9
395 ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10
396 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11
397 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1
398 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
399 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8
400 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
401 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9
402 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4
403 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5
404 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1
405 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1
406 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2
407 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3
408 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6
409 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7
410 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11)
411 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10
412 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11)
413 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11
414 ; ALIGNED-SDAG-NEXT: s_endpgm
416 ; ALIGNED-GISEL-LABEL: ds12align1:
417 ; ALIGNED-GISEL: ; %bb.0:
418 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
419 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
420 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
421 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
422 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
423 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
424 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3
425 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4
426 ; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5
427 ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
428 ; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7
429 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
430 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
431 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
432 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
433 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
434 ; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
435 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
436 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
437 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8
438 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9
439 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10
440 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11
441 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
442 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8
443 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
444 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
445 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3
446 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
447 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5
448 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
449 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
450 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3
451 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1
452 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
453 ; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2
454 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1
455 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1
456 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, 8
457 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
458 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2
459 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v5 offset:3
460 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2
461 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4
462 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5
463 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
464 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6
465 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7
466 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
467 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8
468 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9
469 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
470 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10
471 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11
472 ; ALIGNED-GISEL-NEXT: s_endpgm
474 ; UNALIGNED-LABEL: ds12align1:
475 ; UNALIGNED: ; %bb.0:
476 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
477 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
478 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
479 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
480 ; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
481 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
482 ; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
483 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
484 %val = load <3 x i32>, ptr addrspace(3) %in, align 1
485 store <3 x i32> %val, ptr addrspace(3) %out, align 1
; ds12align2: copy a <3 x i32> between two addrspace(3) pointers with align 2.
; With -unaligned-access-mode the checks expect six 16-bit DS reads and six
; 16-bit DS writes (the GlobalISel variant merges halves with v_lshl_or_b32
; and stores the upper halves via ds_write_b16_d16_hi). With
; +unaligned-access-mode a single ds_read_b96 / ds_write_b96 is expected.
489 define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
490 ; ALIGNED-SDAG-LABEL: ds12align2:
491 ; ALIGNED-SDAG: ; %bb.0:
492 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
493 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
494 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
495 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8
496 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
497 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
498 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
499 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
500 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1
501 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10
502 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
503 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8
504 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
505 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4
506 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2
507 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2
508 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
509 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6
510 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
511 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10
512 ; ALIGNED-SDAG-NEXT: s_endpgm
514 ; ALIGNED-GISEL-LABEL: ds12align2:
515 ; ALIGNED-GISEL: ; %bb.0:
516 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
517 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
518 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
519 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
520 ; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
521 ; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
522 ; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
523 ; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
524 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10
525 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1
526 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
527 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
528 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
529 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
530 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
531 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5
532 ; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1
533 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2
534 ; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4
535 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6
536 ; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8
537 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10
538 ; ALIGNED-GISEL-NEXT: s_endpgm
540 ; UNALIGNED-LABEL: ds12align2:
541 ; UNALIGNED: ; %bb.0:
542 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
543 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
544 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
545 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
546 ; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
547 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
548 ; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
549 ; UNALIGNED-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
550 %val = load <3 x i32>, ptr addrspace(3) %in, align 2
551 store <3 x i32> %val, ptr addrspace(3) %out, align 2
; ds12align4: copy a <3 x i32> between two addrspace(3) pointers with align 4.
; With -unaligned-access-mode both selectors are expected to split the access
; into ds_read2_b32 + ds_read_b32 (and the matching writes). With
; +unaligned-access-mode the selectors diverge in the checks: SelectionDAG
; keeps the same split, while GlobalISel emits one ds_read_b96 / ds_write_b96.
555 define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
556 ; ALIGNED-LABEL: ds12align4:
558 ; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
559 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
560 ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
561 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
562 ; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8
563 ; ALIGNED-NEXT: v_mov_b32_e32 v3, s1
564 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
565 ; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
566 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
567 ; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8
568 ; ALIGNED-NEXT: s_endpgm
570 ; UNALIGNED-SDAG-LABEL: ds12align4:
571 ; UNALIGNED-SDAG: ; %bb.0:
572 ; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
573 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
574 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
575 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
576 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
577 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
578 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
579 ; UNALIGNED-SDAG-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
580 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
581 ; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
582 ; UNALIGNED-SDAG-NEXT: s_endpgm
584 ; UNALIGNED-GISEL-LABEL: ds12align4:
585 ; UNALIGNED-GISEL: ; %bb.0:
586 ; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
587 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
588 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
589 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
590 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
591 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
592 ; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
593 ; UNALIGNED-GISEL-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
594 %val = load <3 x i32>, ptr addrspace(3) %in, align 4
595 store <3 x i32> %val, ptr addrspace(3) %out, align 4
; ds12align8: copy a <3 x i32> between two addrspace(3) pointers with align 8.
; Each of the four llc configurations has its own expected sequence here.
; With -unaligned-access-mode, SelectionDAG uses ds_read_b64 + ds_read_b32 and
; GlobalISel uses ds_read2_b32 + ds_read_b32. With +unaligned-access-mode,
; SelectionDAG uses ds_read_b32 + ds_read_b64 and GlobalISel uses a single
; ds_read_b96. The writes mirror the reads in every case.
599 define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
600 ; ALIGNED-SDAG-LABEL: ds12align8:
601 ; ALIGNED-SDAG: ; %bb.0:
602 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
603 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
604 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
605 ; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2
606 ; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
607 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
608 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
609 ; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1]
610 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
611 ; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
612 ; ALIGNED-SDAG-NEXT: s_endpgm
614 ; ALIGNED-GISEL-LABEL: ds12align8:
615 ; ALIGNED-GISEL: ; %bb.0:
616 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
617 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
618 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0
619 ; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
620 ; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8
621 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
622 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
623 ; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
624 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
625 ; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8
626 ; ALIGNED-GISEL-NEXT: s_endpgm
628 ; UNALIGNED-SDAG-LABEL: ds12align8:
629 ; UNALIGNED-SDAG: ; %bb.0:
630 ; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
631 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
632 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
633 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8
634 ; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0
635 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
636 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
637 ; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
638 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
639 ; UNALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1]
640 ; UNALIGNED-SDAG-NEXT: s_endpgm
642 ; UNALIGNED-GISEL-LABEL: ds12align8:
643 ; UNALIGNED-GISEL: ; %bb.0:
644 ; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
645 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
646 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
647 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
648 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
649 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
650 ; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
651 ; UNALIGNED-GISEL-NEXT: s_endpgm
; Load and store the same value so both a DS read and a DS write get selected.
652 %val = load <3 x i32>, ptr addrspace(3) %in, align 8
653 store <3 x i32> %val, ptr addrspace(3) %out, align 8
; ds12align16: copy a <3 x i32> between two addrspace(3) pointers with
; align 16. All four llc configurations share the GCN checks and are expected
; to emit a single ds_read_b96 / ds_write_b96 pair.
657 define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
658 ; GCN-LABEL: ds12align16:
660 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
661 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
662 ; GCN-NEXT: v_mov_b32_e32 v0, s0
663 ; GCN-NEXT: ds_read_b96 v[0:2], v0
664 ; GCN-NEXT: v_mov_b32_e32 v3, s1
665 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
666 ; GCN-NEXT: ds_write_b96 v3, v[0:2]
; Load and store the same value so both a DS read and a DS write get selected.
668 %val = load <3 x i32>, ptr addrspace(3) %in, align 16
669 store <3 x i32> %val, ptr addrspace(3) %out, align 16
673 define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
674 ; ALIGNED-SDAG-LABEL: ds16align1:
675 ; ALIGNED-SDAG: ; %bb.0:
676 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
677 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
678 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
679 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
680 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
681 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
682 ; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3
683 ; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4
684 ; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
685 ; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6
686 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7
687 ; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8
688 ; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9
689 ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10
690 ; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11
691 ; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12
692 ; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13
693 ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14
694 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15
695 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1
696 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
697 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12
698 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
699 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13
700 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1
701 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1
702 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4
703 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5
704 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8
705 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9
706 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2
707 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3
708 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6
709 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7
710 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10
711 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11
712 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14)
713 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14
714 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15
715 ; ALIGNED-SDAG-NEXT: s_endpgm
717 ; ALIGNED-GISEL-LABEL: ds16align1:
718 ; ALIGNED-GISEL: ; %bb.0:
719 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
720 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
721 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
722 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
723 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
724 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
725 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3
726 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4
727 ; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5
728 ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
729 ; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7
730 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
731 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
732 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
733 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
734 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
735 ; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
736 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
737 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
738 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
739 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8
740 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7
741 ; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
742 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8
743 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9
744 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10
745 ; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11
746 ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12
747 ; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13
748 ; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14
749 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15
750 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
751 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3
752 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
753 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6
754 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
755 ; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3
756 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
757 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7
758 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
759 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
760 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9
761 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4
762 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
763 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
764 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1
765 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1
766 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
767 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
768 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2
769 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v6 offset:3
770 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2
771 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4
772 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5
773 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
774 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6
775 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7
776 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3
777 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8
778 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9
779 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
780 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10
781 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11
782 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
783 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12
784 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13
785 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14
787 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15
788 ; ALIGNED-GISEL-NEXT: s_endpgm
790 ; UNALIGNED-LABEL: ds16align1:
791 ; UNALIGNED: ; %bb.0:
792 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
793 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
794 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
795 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
796 ; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
797 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
798 ; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
799 ; UNALIGNED-NEXT: s_endpgm
800 %val = load <4 x i32>, ptr addrspace(3) %in, align 1
801 store <4 x i32> %val, ptr addrspace(3) %out, align 1
; Test ds16align2: copy one <4 x i32> value (16 bytes) from %in to %out, both
; addrspace(3) pointers, with the load and the store marked align 2.
; What the recorded checks expect for each check prefix:
;   * ALIGNED-SDAG checks  - eight ds_read_u16 and eight ds_write_b16.
;   * ALIGNED-GISEL checks - eight ds_read_u16, halves merged pairwise with
;     v_lshl_or_b32, then stored with alternating ds_write_b16 and
;     ds_write_b16_d16_hi.
;   * UNALIGNED checks     - a single ds_read_b128 and a single ds_write_b128.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
805 define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
806 ; ALIGNED-SDAG-LABEL: ds16align2:
807 ; ALIGNED-SDAG: ; %bb.0:
808 ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
809 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
810 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
811 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12
812 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
813 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
814 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
815 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
816 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
817 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
818 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1
819 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
820 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
821 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12
822 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
823 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2
824 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
825 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4
826 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
827 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8
828 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2
829 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6
830 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
831 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10
832 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
833 ; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14
834 ; ALIGNED-SDAG-NEXT: s_endpgm
836 ; ALIGNED-GISEL-LABEL: ds16align2:
837 ; ALIGNED-GISEL: ; %bb.0:
838 ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
839 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
840 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
841 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
842 ; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
843 ; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
844 ; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
845 ; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
846 ; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10
847 ; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12
848 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14
849 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
850 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
851 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
852 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
853 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
854 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
855 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
856 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
857 ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7
858 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
859 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
860 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4
861 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6
862 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8
863 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10
864 ; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12
865 ; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14
866 ; ALIGNED-GISEL-NEXT: s_endpgm
868 ; UNALIGNED-LABEL: ds16align2:
869 ; UNALIGNED: ; %bb.0:
870 ; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
871 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
872 ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
873 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
874 ; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
875 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
876 ; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
877 ; UNALIGNED-NEXT: s_endpgm
; IR under test: the align 2 on both operations is the property being exercised.
878 %val = load <4 x i32>, ptr addrspace(3) %in, align 2
879 store <4 x i32> %val, ptr addrspace(3) %out, align 2
; Test ds16align4: copy one <4 x i32> value (16 bytes) from %in to %out, both
; addrspace(3) pointers, with the load and the store marked align 4.
; What the recorded checks expect for each check prefix:
;   * ALIGNED checks         - two ds_read2_b32 and two ds_write2_b32, the
;     dword pair at offsets 0/1 first, then the pair at offsets 2/3.
;   * UNALIGNED-SDAG checks  - the same two read2/write2 pairs, but with the
;     offsets 2/3 pair emitted before the offsets 0/1 pair.
;   * UNALIGNED-GISEL checks - one ds_read2_b64 and one ds_write2_b64.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
883 define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
884 ; ALIGNED-LABEL: ds16align4:
886 ; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
887 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
888 ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
889 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
890 ; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
891 ; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
892 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
893 ; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
894 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
895 ; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
896 ; ALIGNED-NEXT: s_endpgm
898 ; UNALIGNED-SDAG-LABEL: ds16align4:
899 ; UNALIGNED-SDAG: ; %bb.0:
900 ; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
901 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
902 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
903 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
904 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
905 ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
906 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
907 ; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
908 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
909 ; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1
910 ; UNALIGNED-SDAG-NEXT: s_endpgm
912 ; UNALIGNED-GISEL-LABEL: ds16align4:
913 ; UNALIGNED-GISEL: ; %bb.0:
914 ; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
915 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
916 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
917 ; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
918 ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
919 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
920 ; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
921 ; UNALIGNED-GISEL-NEXT: s_endpgm
; IR under test: the align 4 on both operations is the property being exercised.
922 %val = load <4 x i32>, ptr addrspace(3) %in, align 4
923 store <4 x i32> %val, ptr addrspace(3) %out, align 4
; Test ds16align8: copy one <4 x i32> value (16 bytes) from %in to %out, both
; addrspace(3) pointers, with the load and the store marked align 8.
; The checks use the shared GCN prefix, which the file header lists for every
; llc invocation, so all configurations are expected to produce the same code:
; one ds_read2_b64 followed by one ds_write2_b64.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
927 define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
928 ; GCN-LABEL: ds16align8:
930 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
931 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
932 ; GCN-NEXT: v_mov_b32_e32 v0, s0
933 ; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
934 ; GCN-NEXT: v_mov_b32_e32 v4, s1
935 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
936 ; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; IR under test: the align 8 on both operations is the property being exercised.
938 %val = load <4 x i32>, ptr addrspace(3) %in, align 8
939 store <4 x i32> %val, ptr addrspace(3) %out, align 8
; Test ds16align16: copy one <4 x i32> value (16 bytes) from %in to %out, both
; addrspace(3) pointers, with the load and the store marked align 16.
; The checks use the shared GCN prefix, which the file header lists for every
; llc invocation, so all configurations are expected to produce the same code:
; one ds_read_b128 followed by one ds_write_b128.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
943 define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
944 ; GCN-LABEL: ds16align16:
946 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
947 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
948 ; GCN-NEXT: v_mov_b32_e32 v0, s0
949 ; GCN-NEXT: ds_read_b128 v[0:3], v0
950 ; GCN-NEXT: v_mov_b32_e32 v4, s1
951 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
952 ; GCN-NEXT: ds_write_b128 v4, v[0:3]
; IR under test: the align 16 on both operations is the property being exercised.
954 %val = load <4 x i32>, ptr addrspace(3) %in, align 16
955 store <4 x i32> %val, ptr addrspace(3) %out, align 16