1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7 ; Unaligned DS access is available from GFX9 onwards.
8 ; LDS alignment enforcement is controlled by a configuration register:
9 ; SH_MEM_CONFIG.alignment_mode
; Test: load a <4 x i32> through an LDS (addrspace(3)) pointer that is only
; guaranteed to be 1-byte aligned.
; Expected code, per the autogenerated checks in this function:
;   - GFX9 and GFX11 select a single 128-bit DS load.
;   - GFX10 splits the access into two ds_read2_b32 instructions.
;   - GFX7 reads sixteen individual bytes with ds_read_u8 and rebuilds each
;     dword with shifts and ORs.
11 define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
12 ; GFX9-LABEL: load_lds_v4i32_align1:
14 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15 ; GFX9-NEXT: ds_read_b128 v[0:3], v0
16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX9-NEXT: s_setpc_b64 s[30:31]
19 ; GFX7-LABEL: load_lds_v4i32_align1:
21 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
23 ; GFX7-NEXT: ds_read_u8 v2, v0
24 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
25 ; GFX7-NEXT: s_mov_b32 m0, -1
26 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
27 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
28 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
29 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
30 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
31 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
32 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
33 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
34 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
35 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:8
36 ; GFX7-NEXT: ds_read_u8 v10, v0 offset:9
37 ; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
38 ; GFX7-NEXT: s_waitcnt lgkmcnt(7)
39 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
40 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
42 ; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
43 ; GFX7-NEXT: s_waitcnt lgkmcnt(5)
44 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
45 ; GFX7-NEXT: s_waitcnt lgkmcnt(3)
46 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
47 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
48 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
49 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
50 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:11
51 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:12
52 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:13
53 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:14
54 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
55 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
56 ; GFX7-NEXT: s_waitcnt lgkmcnt(6)
57 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v10
58 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
59 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
60 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v11
61 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
62 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v8
63 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
64 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
65 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
66 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
67 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
69 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
70 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
71 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
72 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
73 ; GFX7-NEXT: s_setpc_b64 s[30:31]
75 ; GFX10-LABEL: load_lds_v4i32_align1:
77 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
79 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
80 ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
81 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX10-NEXT: s_setpc_b64 s[30:31]
84 ; GFX11-LABEL: load_lds_v4i32_align1:
86 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GFX11-NEXT: ds_load_b128 v[0:3], v0
88 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
89 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: the only alignment guarantee on this 16-byte load is align 1.
90 %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
; Test: load a <3 x i32> through an LDS (addrspace(3)) pointer that is only
; guaranteed to be 1-byte aligned.
; Expected code, per the autogenerated checks in this function:
;   - GFX9 and GFX11 select a single 96-bit DS load.
;   - GFX10 uses one ds_read2_b32 plus one ds_read_b32 at offset 8.
;   - GFX7 reads twelve individual bytes with ds_read_u8 and rebuilds each
;     dword with shifts and ORs.
94 define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
95 ; GFX9-LABEL: load_lds_v3i32_align1:
97 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX9-NEXT: ds_read_b96 v[0:2], v0
99 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX9-NEXT: s_setpc_b64 s[30:31]
102 ; GFX7-LABEL: load_lds_v3i32_align1:
104 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
106 ; GFX7-NEXT: ds_read_u8 v2, v0
107 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
108 ; GFX7-NEXT: s_mov_b32 m0, -1
109 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
110 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
111 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
112 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
113 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
114 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
115 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:5
116 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:6
117 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:7
118 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:8
119 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:9
120 ; GFX7-NEXT: ds_read_u8 v10, v0 offset:10
121 ; GFX7-NEXT: s_waitcnt lgkmcnt(7)
122 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
123 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
124 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
125 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
126 ; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
127 ; GFX7-NEXT: s_waitcnt lgkmcnt(6)
128 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v5
129 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
130 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
131 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7
132 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
133 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
134 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
135 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
136 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9
137 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
138 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
139 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v10
140 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
141 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
142 ; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
143 ; GFX7-NEXT: v_mov_b32_e32 v0, v3
144 ; GFX7-NEXT: s_setpc_b64 s[30:31]
146 ; GFX10-LABEL: load_lds_v3i32_align1:
148 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
150 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
151 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
152 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX10-NEXT: s_setpc_b64 s[30:31]
155 ; GFX11-LABEL: load_lds_v3i32_align1:
157 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158 ; GFX11-NEXT: ds_load_b96 v[0:2], v0
159 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
160 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: the only alignment guarantee on this 12-byte load is align 1.
161 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
; Test: store a <4 x i32> through an LDS (addrspace(3)) pointer that is only
; guaranteed to be 1-byte aligned.
; Expected code, per the autogenerated checks in this function:
;   - GFX9 and GFX11 select a single 128-bit DS store.
;   - GFX10 splits the access into two ds_write2_b32 instructions.
;   - GFX7 extracts every byte (shifts / v_bfe_u32) and issues sixteen
;     ds_write_b8 stores at offsets 0 through 15.
165 define void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
166 ; GFX9-LABEL: store_lds_v4i32_align1:
168 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169 ; GFX9-NEXT: ds_write_b128 v0, v[1:4]
170 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX9-NEXT: s_setpc_b64 s[30:31]
173 ; GFX7-LABEL: store_lds_v4i32_align1:
175 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176 ; GFX7-NEXT: s_mov_b32 m0, -1
177 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
178 ; GFX7-NEXT: v_bfe_u32 v6, v1, 8, 8
179 ; GFX7-NEXT: ds_write_b8 v0, v1
180 ; GFX7-NEXT: ds_write_b8 v0, v6 offset:1
181 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
182 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:2
183 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
184 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
185 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8
186 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
187 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:5
188 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
189 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
190 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:7
191 ; GFX7-NEXT: v_bfe_u32 v2, v3, 8, 8
192 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
193 ; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
194 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:9
195 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3
196 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
197 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:11
198 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
199 ; GFX7-NEXT: v_bfe_u32 v2, v4, 8, 8
200 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:12
201 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:13
202 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4
203 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:14
204 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:15
205 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX7-NEXT: s_setpc_b64 s[30:31]
208 ; GFX10-LABEL: store_lds_v4i32_align1:
210 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
212 ; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX10-NEXT: s_setpc_b64 s[30:31]
216 ; GFX11-LABEL: store_lds_v4i32_align1:
218 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX11-NEXT: ds_store_b128 v0, v[1:4]
220 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
221 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: the only alignment guarantee on this 16-byte store is align 1.
222 store <4 x i32> %x, ptr addrspace(3) %out, align 1
; Test: store a <3 x i32> through an LDS (addrspace(3)) pointer that is only
; guaranteed to be 1-byte aligned.
; Expected code, per the autogenerated checks in this function:
;   - GFX9 and GFX11 select a single 96-bit DS store.
;   - GFX10 uses one ds_write2_b32 plus one ds_write_b32 at offset 8.
;   - GFX7 extracts every byte (shifts / v_bfe_u32) and issues twelve
;     ds_write_b8 stores at offsets 0 through 11.
226 define void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
227 ; GFX9-LABEL: store_lds_v3i32_align1:
229 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; GFX9-NEXT: ds_write_b96 v0, v[1:3]
231 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
232 ; GFX9-NEXT: s_setpc_b64 s[30:31]
234 ; GFX7-LABEL: store_lds_v3i32_align1:
236 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237 ; GFX7-NEXT: s_mov_b32 m0, -1
238 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
239 ; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
240 ; GFX7-NEXT: ds_write_b8 v0, v1
241 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:1
242 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
243 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:2
244 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
245 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
246 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
247 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
248 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:5
249 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
250 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
251 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:7
252 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
253 ; GFX7-NEXT: v_bfe_u32 v2, v3, 8, 8
254 ; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
255 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:9
256 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3
257 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
258 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:11
259 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
260 ; GFX7-NEXT: s_setpc_b64 s[30:31]
262 ; GFX10-LABEL: store_lds_v3i32_align1:
264 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
266 ; GFX10-NEXT: ds_write_b32 v0, v3 offset:8
267 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX10-NEXT: s_setpc_b64 s[30:31]
270 ; GFX11-LABEL: store_lds_v3i32_align1:
272 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273 ; GFX11-NEXT: ds_store_b96 v0, v[1:3]
274 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
275 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: the only alignment guarantee on this 12-byte store is align 1.
276 store <3 x i32> %x, ptr addrspace(3) %out, align 1
; Test: in an amdgpu_ps function, load a <8 x i32> from an addrspace(4)
; pointer passed inreg with only align 1, then store it to an addrspace(1)
; pointer (the store carries no explicit alignment).
; Expected code, per the autogenerated checks in this function: every target
; performs two 128-bit vector-memory loads (global_load_* or buffer_load_*)
; at offsets 0 and 16, followed by two matching 128-bit stores. Despite the
; "s_load" in the name, none of the checked outputs uses a scalar load.
; NOTE(review): the view ends at the store; the function terminator is not
; visible here.
280 define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg %ptr, ptr addrspace(1) inreg %out) {
281 ; GFX9-LABEL: test_s_load_constant_v8i32_align1:
283 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
284 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
285 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
286 ; GFX9-NEXT: s_waitcnt vmcnt(1)
287 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
288 ; GFX9-NEXT: s_waitcnt vmcnt(1)
289 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
290 ; GFX9-NEXT: s_endpgm
292 ; GFX7-LABEL: test_s_load_constant_v8i32_align1:
294 ; GFX7-NEXT: s_mov_b32 s4, s2
295 ; GFX7-NEXT: s_mov_b32 s5, s3
296 ; GFX7-NEXT: s_mov_b32 s2, -1
297 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
298 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
299 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
300 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
301 ; GFX7-NEXT: s_waitcnt vmcnt(1)
302 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
303 ; GFX7-NEXT: s_waitcnt vmcnt(1)
304 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
305 ; GFX7-NEXT: s_endpgm
307 ; GFX10-LABEL: test_s_load_constant_v8i32_align1:
309 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
310 ; GFX10-NEXT: s_clause 0x1
311 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
312 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
313 ; GFX10-NEXT: s_waitcnt vmcnt(1)
314 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
315 ; GFX10-NEXT: s_waitcnt vmcnt(0)
316 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
317 ; GFX10-NEXT: s_endpgm
319 ; GFX11-LABEL: test_s_load_constant_v8i32_align1:
321 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
322 ; GFX11-NEXT: s_clause 0x1
323 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[0:1]
324 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:16
325 ; GFX11-NEXT: s_waitcnt vmcnt(1)
326 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
327 ; GFX11-NEXT: s_waitcnt vmcnt(0)
328 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
329 ; GFX11-NEXT: s_nop 0
330 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
331 ; GFX11-NEXT: s_endpgm
; IR under test: a 32-byte load with align 1, forwarded unchanged to the store.
332 %load = load <8 x i32>, ptr addrspace(4) %ptr, align 1
333 store <8 x i32> %load, ptr addrspace(1) %out