1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
8 ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; no explicit alignment on the store instruction. Every checked target is
; expected to emit a single 96-bit DS store (ds_write_b96, spelled
; ds_store_b96 in the gfx1100 block). Only the hawaii block expects
; "s_mov_b32 m0, -1" ahead of the DS access.
; The gfx900 and hawaii blocks also pin the terminating s_endpgm, matching the
; gfx1010/gfx1100 blocks of this function, so that stray instructions after the
; store cannot slip through unchecked.
; NOTE(review): the two s_endpgm check lines were added by hand - regenerate
; with utils/update_llc_test_checks.py to confirm they match llc output.
10 define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
11 ; GFX9-LABEL: store_lds_v3i32:
13 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
14 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
17 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
18 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
19 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
20 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
23 ; GFX7-LABEL: store_lds_v3i32:
25 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
26 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
27 ; GFX7-NEXT: s_mov_b32 m0, -1
28 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
30 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
31 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
32 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
33 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
; GFX7-NEXT: s_endpgm
36 ; GFX10-LABEL: store_lds_v3i32:
38 ; GFX10-NEXT: s_clause 0x1
39 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
40 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
41 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
43 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
44 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
45 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
46 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
47 ; GFX10-NEXT: s_endpgm
49 ; GFX11-LABEL: store_lds_v3i32:
51 ; GFX11-NEXT: s_clause 0x1
52 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
53 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
54 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
55 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
56 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
57 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
58 ; GFX11-NEXT: s_endpgm
59 store <3 x i32> %x, ptr addrspace(3) %out
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; "align 1" on the store instruction. The checks expect the 12 bytes to be
; written as twelve single-byte DS stores at offsets 0 through 11
; (ds_write_b8, spelled ds_store_b8 in the gfx1100 block).
; How the per-byte values are produced differs by target in the checks:
;  - hawaii: the value stored at offset 1/5/9 comes from
;    "s_bfe_u32 ..., 0x80008" and the one at offset 3/7/11 from a right shift
;    by 24 of the source dword;
;  - gfx900/gfx1010/gfx1100: the low half is masked with 0xffff (or the high
;    half obtained with a right shift by 16) and then shifted right by 8.
; Only the hawaii block expects "s_mov_b32 m0, -1" ahead of the DS accesses.
63 define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
64 ; GFX9-LABEL: store_lds_v3i32_align1:
66 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
67 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s4
70 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
71 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
72 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
73 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
74 ; GFX9-NEXT: ds_write_b8 v1, v0
75 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
76 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
77 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
78 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
79 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
80 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
81 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5
82 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
83 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
84 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
85 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16
86 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
87 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
88 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
89 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
90 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
91 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
92 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
93 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s6
94 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
95 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
96 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
97 ; GFX9-NEXT: s_lshr_b32 s0, s6, 16
98 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
99 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
100 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
101 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
102 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
103 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
104 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
105 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
106 ; GFX9-NEXT: s_endpgm
108 ; GFX7-LABEL: store_lds_v3i32_align1:
110 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
111 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
112 ; GFX7-NEXT: s_mov_b32 m0, -1
113 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
114 ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
115 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
116 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
117 ; GFX7-NEXT: s_lshr_b32 s1, s4, 16
118 ; GFX7-NEXT: ds_write_b8 v1, v0
119 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
120 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
121 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24
122 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
123 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
124 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
125 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
126 ; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
127 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
128 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16
129 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
130 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
131 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
132 ; GFX7-NEXT: s_lshr_b32 s1, s5, 24
133 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
134 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
135 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
136 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
137 ; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
138 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
139 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16
140 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
141 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
142 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
143 ; GFX7-NEXT: s_lshr_b32 s1, s6, 24
144 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
145 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
146 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
147 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
148 ; GFX7-NEXT: s_endpgm
150 ; GFX10-LABEL: store_lds_v3i32_align1:
152 ; GFX10-NEXT: s_clause 0x1
153 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
154 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX10-NEXT: s_lshr_b32 s0, s4, 16
157 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
158 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16
159 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5
160 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
161 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
162 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
163 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16
164 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6
165 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
166 ; GFX10-NEXT: s_lshr_b32 s6, s0, 8
167 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
168 ; GFX10-NEXT: s_lshr_b32 s0, s3, 8
169 ; GFX10-NEXT: s_lshr_b32 s3, s2, 8
170 ; GFX10-NEXT: s_lshr_b32 s1, s1, 8
171 ; GFX10-NEXT: v_mov_b32_e32 v5, s2
172 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8
173 ; GFX10-NEXT: v_mov_b32_e32 v9, s3
174 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
175 ; GFX10-NEXT: v_mov_b32_e32 v8, s0
176 ; GFX10-NEXT: v_mov_b32_e32 v10, s2
177 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8
178 ; GFX10-NEXT: v_mov_b32_e32 v7, s6
179 ; GFX10-NEXT: ds_write_b8 v1, v0
180 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
181 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
182 ; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
183 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
184 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
185 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
186 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
187 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
188 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
189 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
190 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
191 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
192 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:11
193 ; GFX10-NEXT: s_endpgm
195 ; GFX11-LABEL: store_lds_v3i32_align1:
197 ; GFX11-NEXT: s_clause 0x1
198 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
199 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
200 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4
202 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16
203 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
204 ; GFX11-NEXT: s_lshr_b32 s4, s6, 16
205 ; GFX11-NEXT: s_lshr_b32 s2, s2, 8
206 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16
207 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s5
208 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
209 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s6
210 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8
211 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2
212 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
213 ; GFX11-NEXT: s_lshr_b32 s1, s3, 8
214 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8
215 ; GFX11-NEXT: s_lshr_b32 s0, s5, 8
216 ; GFX11-NEXT: s_lshr_b32 s5, s4, 8
217 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1
218 ; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0
219 ; GFX11-NEXT: v_mov_b32_e32 v12, s5
220 ; GFX11-NEXT: ds_store_b8 v1, v0
221 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
222 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
223 ; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
224 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
225 ; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
226 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
227 ; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
228 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
229 ; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
230 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
231 ; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
232 ; GFX11-NEXT: s_endpgm
233 store <3 x i32> %x, ptr addrspace(3) %out, align 1
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; "align 2" on the store instruction. The checks expect six 16-bit DS stores
; at offsets 0, 2, 4, 6, 8 and 10 (ds_write_b16, spelled ds_store_b16 in the
; gfx1100 block). The value stored at each odd halfword offset (2, 6, 10) is
; produced by a right shift by 16 of the corresponding source dword.
; Only the hawaii block expects "s_mov_b32 m0, -1" ahead of the DS accesses.
237 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) {
238 ; GFX9-LABEL: store_lds_v3i32_align2:
240 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
241 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
242 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
244 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
245 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
246 ; GFX9-NEXT: ds_write_b16 v1, v0
247 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
248 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
249 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16
250 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
251 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
252 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
253 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
254 ; GFX9-NEXT: s_lshr_b32 s0, s6, 16
255 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
256 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
257 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
258 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
259 ; GFX9-NEXT: s_endpgm
261 ; GFX7-LABEL: store_lds_v3i32_align2:
263 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
264 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
265 ; GFX7-NEXT: s_mov_b32 m0, -1
266 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX7-NEXT: s_lshr_b32 s1, s4, 16
268 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
269 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
270 ; GFX7-NEXT: ds_write_b16 v1, v0
271 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
272 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
273 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16
274 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
275 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
276 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
277 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
278 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16
279 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
280 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
281 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
282 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
283 ; GFX7-NEXT: s_endpgm
285 ; GFX10-LABEL: store_lds_v3i32_align2:
287 ; GFX10-NEXT: s_clause 0x1
288 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
289 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
290 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
292 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
293 ; GFX10-NEXT: s_lshr_b32 s0, s4, 16
294 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
295 ; GFX10-NEXT: s_lshr_b32 s1, s5, 16
296 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
297 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16
298 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
299 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
300 ; GFX10-NEXT: v_mov_b32_e32 v6, s2
301 ; GFX10-NEXT: ds_write_b16 v1, v0
302 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4
303 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8
304 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:2
305 ; GFX10-NEXT: ds_write_b16 v1, v5 offset:6
306 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:10
307 ; GFX10-NEXT: s_endpgm
309 ; GFX11-LABEL: store_lds_v3i32_align2:
311 ; GFX11-NEXT: s_clause 0x1
312 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
313 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
314 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
315 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16
316 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
317 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16
318 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1
319 ; GFX11-NEXT: s_lshr_b32 s2, s6, 16
320 ; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s0
321 ; GFX11-NEXT: v_mov_b32_e32 v6, s2
322 ; GFX11-NEXT: ds_store_b16 v1, v0
323 ; GFX11-NEXT: ds_store_b16 v1, v3 offset:2
324 ; GFX11-NEXT: ds_store_b16 v1, v2 offset:4
325 ; GFX11-NEXT: ds_store_b16 v1, v5 offset:6
326 ; GFX11-NEXT: ds_store_b16 v1, v4 offset:8
327 ; GFX11-NEXT: ds_store_b16 v1, v6 offset:10
328 ; GFX11-NEXT: s_endpgm
329 store <3 x i32> %x, ptr addrspace(3) %out, align 2
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; "align 4" on the store instruction. The checks expect two DS instructions:
; a paired dword store with "offset1:1" for the first two elements
; (ds_write2_b32, spelled ds_store_2addr_b32 in the gfx1100 block) followed by
; a single dword store at offset 8 for the third element.
; Only the hawaii block expects "s_mov_b32 m0, -1" ahead of the DS accesses.
333 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) {
334 ; GFX9-LABEL: store_lds_v3i32_align4:
336 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
337 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
338 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
340 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
341 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
342 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
343 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
344 ; GFX9-NEXT: ds_write_b32 v1, v3 offset:8
345 ; GFX9-NEXT: s_endpgm
347 ; GFX7-LABEL: store_lds_v3i32_align4:
349 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
350 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
351 ; GFX7-NEXT: s_mov_b32 m0, -1
352 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
354 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
355 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
356 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
357 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
358 ; GFX7-NEXT: ds_write_b32 v1, v0 offset:8
359 ; GFX7-NEXT: s_endpgm
361 ; GFX10-LABEL: store_lds_v3i32_align4:
363 ; GFX10-NEXT: s_clause 0x1
364 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
365 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
366 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
367 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
368 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
369 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
370 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
371 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
372 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8
373 ; GFX10-NEXT: s_endpgm
375 ; GFX11-LABEL: store_lds_v3i32_align4:
377 ; GFX11-NEXT: s_clause 0x1
378 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
379 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
380 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
382 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
383 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
384 ; GFX11-NEXT: ds_store_b32 v1, v3 offset:8
385 ; GFX11-NEXT: s_endpgm
386 store <3 x i32> %x, ptr addrspace(3) %out, align 4
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; "align 8" on the store instruction. The checks expect a paired dword store
; with "offset1:1" for the first two elements (ds_write2_b32, spelled
; ds_store_2addr_b32 in the gfx1100 block) followed by a single dword store at
; offset 8; no 64-bit or 96-bit DS store is expected at this alignment.
; Only the hawaii block expects "s_mov_b32 m0, -1" ahead of the DS accesses.
390 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) {
391 ; GFX9-LABEL: store_lds_v3i32_align8:
393 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
394 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
395 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
397 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
398 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
399 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
400 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
401 ; GFX9-NEXT: ds_write_b32 v1, v3 offset:8
402 ; GFX9-NEXT: s_endpgm
404 ; GFX7-LABEL: store_lds_v3i32_align8:
406 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
407 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
408 ; GFX7-NEXT: s_mov_b32 m0, -1
409 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
411 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
412 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
413 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
414 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
415 ; GFX7-NEXT: ds_write_b32 v1, v0 offset:8
416 ; GFX7-NEXT: s_endpgm
418 ; GFX10-LABEL: store_lds_v3i32_align8:
420 ; GFX10-NEXT: s_clause 0x1
421 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
422 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
423 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
425 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
426 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
427 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
428 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
429 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8
430 ; GFX10-NEXT: s_endpgm
432 ; GFX11-LABEL: store_lds_v3i32_align8:
434 ; GFX11-NEXT: s_clause 0x1
435 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
436 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
437 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
439 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
440 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
441 ; GFX11-NEXT: ds_store_b32 v1, v3 offset:8
442 ; GFX11-NEXT: s_endpgm
443 store <3 x i32> %x, ptr addrspace(3) %out, align 8
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; "align 16" on the store instruction. Every checked target is expected to
; emit a single 96-bit DS store (ds_write_b96, spelled ds_store_b96 in the
; gfx1100 block) whose data operand is the register triple v[0:2] and whose
; address register is loaded from kernarg offset 0x0.
; Only the hawaii block expects "s_mov_b32 m0, -1" ahead of the DS access.
447 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) {
448 ; GFX9-LABEL: store_lds_v3i32_align16:
450 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
451 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
452 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
453 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
454 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
455 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
456 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
457 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
458 ; GFX9-NEXT: s_endpgm
460 ; GFX7-LABEL: store_lds_v3i32_align16:
462 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
463 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
464 ; GFX7-NEXT: s_mov_b32 m0, -1
465 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
466 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
467 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
468 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
469 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
470 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
471 ; GFX7-NEXT: s_endpgm
473 ; GFX10-LABEL: store_lds_v3i32_align16:
475 ; GFX10-NEXT: s_clause 0x1
476 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
477 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
478 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
480 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
481 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
482 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
483 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
484 ; GFX10-NEXT: s_endpgm
486 ; GFX11-LABEL: store_lds_v3i32_align16:
488 ; GFX11-NEXT: s_clause 0x1
489 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
490 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
491 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
493 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
494 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
495 ; GFX11-NEXT: s_endpgm
496 store <3 x i32> %x, ptr addrspace(3) %out, align 16