1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store instruction.
; Expected code per target (see the llc invocations at the top of the file):
;   - gfx900, hawaii, gfx1010 write the value with a single ds_write_b96;
;   - gfx1100 writes it with a single ds_store_b96;
;   - tahiti splits it into a ds_write_b32 at offset 8 plus a ds_write_b64.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
8 define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
9 ; GFX9-LABEL: store_lds_v3i32:
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
12 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
15 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
16 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
17 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
18 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
21 ; GFX7-LABEL: store_lds_v3i32:
23 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
24 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
25 ; GFX7-NEXT: s_mov_b32 m0, -1
26 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
27 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
29 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
30 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
31 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
34 ; GFX6-LABEL: store_lds_v3i32:
36 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
37 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
38 ; GFX6-NEXT: s_mov_b32 m0, -1
39 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
41 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
42 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
43 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
44 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
45 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
48 ; GFX10-LABEL: store_lds_v3i32:
50 ; GFX10-NEXT: s_clause 0x1
51 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
52 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
53 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
55 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
56 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
57 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
58 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
59 ; GFX10-NEXT: s_endpgm
61 ; GFX11-LABEL: store_lds_v3i32:
63 ; GFX11-NEXT: s_clause 0x1
64 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
65 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
66 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
68 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
69 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
70 ; GFX11-NEXT: s_endpgm
71 store <3 x i32> %x, ptr addrspace(3) %out
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with `align 1` on the store instruction.
; Expected code per target: the 12 bytes are written one at a time
; (byte offsets 0 through 11 from the address in %out).
;   - gfx900 and gfx1010 mix ds_write_b8 with ds_write_b8_d16_hi, and only
;     need s_lshr_b32 by 8 and 24 to extract the remaining bytes;
;   - gfx1100 does the same with ds_store_b8 / ds_store_b8_d16_hi;
;   - hawaii and tahiti use ds_write_b8 only, extracting the upper bytes with
;     s_lshr_b32 by 8, 16 and 24.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
75 define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
76 ; GFX9-LABEL: store_lds_v3i32_align1:
78 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
79 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
80 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
82 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
83 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
84 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
85 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
86 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4
87 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
88 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
89 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8
90 ; GFX9-NEXT: ds_write_b8 v0, v1
91 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
92 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
93 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24
94 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
95 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
96 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8
97 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
98 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
99 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24
100 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
101 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
102 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8
103 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
104 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
105 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24
106 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
107 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
108 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
109 ; GFX9-NEXT: s_endpgm
111 ; GFX7-LABEL: store_lds_v3i32_align1:
113 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
114 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
115 ; GFX7-NEXT: s_mov_b32 m0, -1
116 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
118 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
119 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
120 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
121 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
122 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
123 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8
124 ; GFX7-NEXT: ds_write_b8 v0, v1
125 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
126 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24
127 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
128 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
129 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
130 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
131 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
132 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8
133 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
134 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
135 ; GFX7-NEXT: s_lshr_b32 s2, s1, 24
136 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
138 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
139 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
140 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
141 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8
142 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
143 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
144 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24
145 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
146 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
147 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
148 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
149 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
150 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
151 ; GFX7-NEXT: s_endpgm
153 ; GFX6-LABEL: store_lds_v3i32_align1:
155 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
156 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
157 ; GFX6-NEXT: s_mov_b32 m0, -1
158 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
160 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
161 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
162 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
163 ; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
164 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
165 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8
166 ; GFX6-NEXT: ds_write_b8 v0, v1
167 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
168 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24
169 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
170 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
171 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
172 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
173 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
174 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8
175 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
176 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
177 ; GFX6-NEXT: s_lshr_b32 s2, s1, 24
178 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
179 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
180 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
181 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
182 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
183 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8
184 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
185 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
186 ; GFX6-NEXT: s_lshr_b32 s1, s0, 24
187 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
188 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
189 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
190 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
191 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
192 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
193 ; GFX6-NEXT: s_endpgm
195 ; GFX10-LABEL: store_lds_v3i32_align1:
197 ; GFX10-NEXT: s_clause 0x1
198 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
199 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
202 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
203 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
204 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
205 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8
206 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24
207 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8
208 ; GFX10-NEXT: s_lshr_b32 s3, s5, 24
209 ; GFX10-NEXT: s_lshr_b32 s5, s4, 8
210 ; GFX10-NEXT: s_lshr_b32 s4, s4, 24
211 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
212 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
213 ; GFX10-NEXT: v_mov_b32_e32 v6, s2
214 ; GFX10-NEXT: v_mov_b32_e32 v7, s3
215 ; GFX10-NEXT: v_mov_b32_e32 v8, s5
216 ; GFX10-NEXT: v_mov_b32_e32 v9, s4
217 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8
218 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
219 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
220 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
221 ; GFX10-NEXT: ds_write_b8 v0, v3
222 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2
223 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
224 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11
225 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:5
226 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:7
227 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:1
228 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:3
229 ; GFX10-NEXT: s_endpgm
231 ; GFX11-LABEL: store_lds_v3i32_align1:
233 ; GFX11-NEXT: s_clause 0x1
234 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
235 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
236 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
238 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
239 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8
240 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24
241 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8
242 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24
243 ; GFX11-NEXT: s_lshr_b32 s5, s0, 8
244 ; GFX11-NEXT: s_lshr_b32 s0, s0, 24
245 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2
246 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1
247 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0
248 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:8
249 ; GFX11-NEXT: ds_store_b8 v0, v3
250 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2
251 ; GFX11-NEXT: ds_store_b8 v0, v2 offset:4
252 ; GFX11-NEXT: ds_store_b8 v0, v4 offset:9
253 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10
254 ; GFX11-NEXT: ds_store_b8 v0, v5 offset:11
255 ; GFX11-NEXT: ds_store_b8 v0, v6 offset:5
256 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6
257 ; GFX11-NEXT: ds_store_b8 v0, v7 offset:7
258 ; GFX11-NEXT: ds_store_b8 v0, v8 offset:1
259 ; GFX11-NEXT: ds_store_b8 v0, v9 offset:3
260 ; GFX11-NEXT: s_endpgm
261 store <3 x i32> %x, ptr addrspace(3) %out, align 1
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with `align 2` on the store instruction.
; Expected code per target: the value is written as six 16-bit stores
; (byte offsets 0, 2, 4, 6, 8 and 10 from the address in %out).
;   - gfx900 and gfx1010 pair ds_write_b16 with ds_write_b16_d16_hi, so no
;     shifts are needed;
;   - gfx1100 does the same with ds_store_b16 / ds_store_b16_d16_hi;
;   - hawaii and tahiti use ds_write_b16 only, extracting each upper half with
;     s_lshr_b32 by 16.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
265 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) {
266 ; GFX9-LABEL: store_lds_v3i32_align2:
268 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
269 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
270 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
272 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
273 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
274 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
275 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
276 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
277 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
278 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
279 ; GFX9-NEXT: ds_write_b16 v0, v1
280 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
281 ; GFX9-NEXT: s_endpgm
283 ; GFX7-LABEL: store_lds_v3i32_align2:
285 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
286 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
287 ; GFX7-NEXT: s_mov_b32 m0, -1
288 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
290 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
291 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
292 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
293 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
294 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
295 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
296 ; GFX7-NEXT: ds_write_b16 v0, v1
297 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
298 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
299 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
300 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
301 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
302 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
303 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
304 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
305 ; GFX7-NEXT: s_endpgm
307 ; GFX6-LABEL: store_lds_v3i32_align2:
309 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
310 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
311 ; GFX6-NEXT: s_mov_b32 m0, -1
312 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
313 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
314 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
315 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
316 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
317 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
318 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
319 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
320 ; GFX6-NEXT: ds_write_b16 v0, v1
321 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
322 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
323 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
324 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
325 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
326 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
327 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
328 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
329 ; GFX6-NEXT: s_endpgm
331 ; GFX10-LABEL: store_lds_v3i32_align2:
333 ; GFX10-NEXT: s_clause 0x1
334 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
335 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
336 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
338 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
339 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
340 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
341 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8
342 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
343 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:4
344 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
345 ; GFX10-NEXT: ds_write_b16 v0, v3
346 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2
347 ; GFX10-NEXT: s_endpgm
349 ; GFX11-LABEL: store_lds_v3i32_align2:
351 ; GFX11-NEXT: s_clause 0x1
352 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
353 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
354 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
355 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
356 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
357 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10
358 ; GFX11-NEXT: ds_store_b16 v0, v2
359 ; GFX11-NEXT: ds_store_b16 v0, v3 offset:4
360 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:8
361 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6
362 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2
363 ; GFX11-NEXT: s_endpgm
364 store <3 x i32> %x, ptr addrspace(3) %out, align 2
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with `align 4` on the store instruction.
; Expected code per target: the first two elements go out in one two-address
; 32-bit store (offset1:1) and the third element in a separate 32-bit store at
; offset 8.
;   - gfx900, hawaii, tahiti and gfx1010 use ds_write2_b32 + ds_write_b32;
;   - gfx1100 uses ds_store_2addr_b32 + ds_store_b32.
; The relative order of the two stores differs between targets.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
368 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) {
369 ; GFX9-LABEL: store_lds_v3i32_align4:
371 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
372 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
375 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
376 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
377 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
378 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
379 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8
380 ; GFX9-NEXT: s_endpgm
382 ; GFX7-LABEL: store_lds_v3i32_align4:
384 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
385 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
386 ; GFX7-NEXT: s_mov_b32 m0, -1
387 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
389 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
390 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
391 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
392 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
393 ; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
394 ; GFX7-NEXT: s_endpgm
396 ; GFX6-LABEL: store_lds_v3i32_align4:
398 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
399 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
400 ; GFX6-NEXT: s_mov_b32 m0, -1
401 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
403 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
404 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
405 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8
406 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
407 ; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1
408 ; GFX6-NEXT: s_endpgm
410 ; GFX10-LABEL: store_lds_v3i32_align4:
412 ; GFX10-NEXT: s_clause 0x1
413 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
414 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
415 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
416 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
417 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
418 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
419 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
420 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:8
421 ; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
422 ; GFX10-NEXT: s_endpgm
424 ; GFX11-LABEL: store_lds_v3i32_align4:
426 ; GFX11-NEXT: s_clause 0x1
427 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
428 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
429 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
430 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
431 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
432 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
433 ; GFX11-NEXT: ds_store_b32 v0, v3 offset:8
434 ; GFX11-NEXT: s_endpgm
435 store <3 x i32> %x, ptr addrspace(3) %out, align 4
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with `align 8` on the store instruction.
; Expected code per target: every target emits a 32-bit store of the third
; element at offset 8 followed by one 64-bit store of the first two elements.
;   - gfx900, hawaii, tahiti and gfx1010 use ds_write_b32 + ds_write_b64;
;   - gfx1100 uses ds_store_b32 + ds_store_b64.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
439 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) {
440 ; GFX9-LABEL: store_lds_v3i32_align8:
442 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
443 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
444 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
445 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
446 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
447 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
448 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
449 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
450 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
451 ; GFX9-NEXT: s_endpgm
453 ; GFX7-LABEL: store_lds_v3i32_align8:
455 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
456 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
457 ; GFX7-NEXT: s_mov_b32 m0, -1
458 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
460 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
461 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
462 ; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
463 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
464 ; GFX7-NEXT: ds_write_b64 v2, v[0:1]
465 ; GFX7-NEXT: s_endpgm
467 ; GFX6-LABEL: store_lds_v3i32_align8:
469 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
470 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
471 ; GFX6-NEXT: s_mov_b32 m0, -1
472 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
474 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
475 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
476 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
477 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
478 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
479 ; GFX6-NEXT: s_endpgm
481 ; GFX10-LABEL: store_lds_v3i32_align8:
483 ; GFX10-NEXT: s_clause 0x1
484 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
485 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
486 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
488 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
489 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
490 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
491 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8
492 ; GFX10-NEXT: ds_write_b64 v2, v[0:1]
493 ; GFX10-NEXT: s_endpgm
495 ; GFX11-LABEL: store_lds_v3i32_align8:
497 ; GFX11-NEXT: s_clause 0x1
498 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
499 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
500 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
501 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2
502 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
503 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:8
504 ; GFX11-NEXT: ds_store_b64 v2, v[0:1]
505 ; GFX11-NEXT: s_endpgm
506 store <3 x i32> %x, ptr addrspace(3) %out, align 8
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with `align 16` on the store instruction.
; Expected code per target:
;   - gfx900, hawaii, gfx1010 write the value with a single ds_write_b96;
;   - gfx1100 writes it with a single ds_store_b96;
;   - tahiti splits it into a ds_write_b32 at offset 8 plus a ds_write_b64.
; The check lines below are autogenerated by utils/update_llc_test_checks.py.
510 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) {
511 ; GFX9-LABEL: store_lds_v3i32_align16:
513 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
514 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
515 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
516 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
517 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
518 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
519 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
520 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
521 ; GFX9-NEXT: s_endpgm
523 ; GFX7-LABEL: store_lds_v3i32_align16:
525 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
526 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
527 ; GFX7-NEXT: s_mov_b32 m0, -1
528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
530 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
531 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
532 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
533 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
534 ; GFX7-NEXT: s_endpgm
536 ; GFX6-LABEL: store_lds_v3i32_align16:
538 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
539 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
540 ; GFX6-NEXT: s_mov_b32 m0, -1
541 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
542 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
543 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
544 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
545 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
546 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
547 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
548 ; GFX6-NEXT: s_endpgm
550 ; GFX10-LABEL: store_lds_v3i32_align16:
552 ; GFX10-NEXT: s_clause 0x1
553 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
554 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
555 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
556 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
557 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
558 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
559 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
560 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
561 ; GFX10-NEXT: s_endpgm
563 ; GFX11-LABEL: store_lds_v3i32_align16:
565 ; GFX11-NEXT: s_clause 0x1
566 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
567 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
568 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
570 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
571 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
572 ; GFX11-NEXT: s_endpgm
573 store <3 x i32> %x, ptr addrspace(3) %out, align 16