1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store.
; Expected selection per check prefix - GFX9, GFX7 and GFX10 use a single
; ds_write_b96, GFX11 uses ds_store_b96, and GFX6 splits the store into a
; ds_write_b32 at offset 8 plus a ds_write_b64.
; NOTE(review): presumably the missing "align" falls back to the type's default
; alignment; the expected stores match the align16 variant - confirm against
; the LLVM LangRef.
8 define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
9 ; GFX9-LABEL: store_lds_v3i32:
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
12 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
15 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
16 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
17 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
18 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
21 ; GFX7-LABEL: store_lds_v3i32:
23 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
24 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
25 ; GFX7-NEXT: s_mov_b32 m0, -1
26 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
27 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
29 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
30 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
31 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
34 ; GFX6-LABEL: store_lds_v3i32:
36 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
37 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
38 ; GFX6-NEXT: s_mov_b32 m0, -1
39 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
41 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
42 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
43 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
44 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
45 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
48 ; GFX10-LABEL: store_lds_v3i32:
50 ; GFX10-NEXT: s_clause 0x1
51 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
52 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
53 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
55 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
56 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
57 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
58 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
59 ; GFX10-NEXT: s_endpgm
61 ; GFX11-LABEL: store_lds_v3i32:
63 ; GFX11-NEXT: s_clause 0x1
64 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
65 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
66 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
68 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
69 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
70 ; GFX11-NEXT: s_endpgm
71 store <3 x i32> %x, <3 x i32> addrspace(3)* %out
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 1" on the store.
; Every check prefix expects the 12 bytes to be written with twelve separate
; byte stores covering offsets 0 through 11.
; GFX9 and GFX10 pair ds_write_b8 with ds_write_b8_d16_hi and shift only by 8
; and 24; GFX11 does the same with ds_store_b8 / ds_store_b8_d16_hi.
; GFX7 and GFX6 use only ds_write_b8 and extract the upper bytes with
; s_lshr_b32 by 8, 16 and 24.
75 define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
76 ; GFX9-LABEL: store_lds_v3i32_align1:
78 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
79 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
80 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
82 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
83 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
84 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
85 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
86 ; GFX9-NEXT: ds_write_b8 v0, v2
87 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2
88 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
89 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8
90 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4
91 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
92 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
93 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24
94 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
95 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
96 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8
97 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
98 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
99 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24
100 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
101 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
102 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8
103 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
104 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
105 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24
106 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
107 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
108 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
109 ; GFX9-NEXT: s_endpgm
111 ; GFX7-LABEL: store_lds_v3i32_align1:
113 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
114 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
115 ; GFX7-NEXT: s_mov_b32 m0, -1
116 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
118 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
119 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
120 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
121 ; GFX7-NEXT: ds_write_b8 v0, v2
122 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
123 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8
124 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:4
125 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
126 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24
127 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
128 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
129 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
130 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
131 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
132 ; GFX7-NEXT: s_lshr_b32 s2, s0, 8
133 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
134 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
135 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24
136 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
138 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
139 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
140 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
141 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8
142 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
143 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
144 ; GFX7-NEXT: s_lshr_b32 s0, s1, 24
145 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
146 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
147 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16
148 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
149 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
150 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
151 ; GFX7-NEXT: s_endpgm
153 ; GFX6-LABEL: store_lds_v3i32_align1:
155 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
156 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
157 ; GFX6-NEXT: s_mov_b32 m0, -1
158 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
160 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
161 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
162 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
163 ; GFX6-NEXT: ds_write_b8 v0, v2
164 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
165 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8
166 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:4
167 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
168 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24
169 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
170 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
171 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
172 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
173 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
174 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
175 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
176 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
177 ; GFX6-NEXT: s_lshr_b32 s2, s0, 24
178 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
179 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
180 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
181 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
182 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
183 ; GFX6-NEXT: s_lshr_b32 s0, s1, 8
184 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
185 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
186 ; GFX6-NEXT: s_lshr_b32 s0, s1, 24
187 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
188 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
189 ; GFX6-NEXT: s_lshr_b32 s0, s1, 16
190 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
191 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
192 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
193 ; GFX6-NEXT: s_endpgm
195 ; GFX10-LABEL: store_lds_v3i32_align1:
197 ; GFX10-NEXT: s_clause 0x1
198 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
199 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
202 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
203 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
204 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
205 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8
206 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24
207 ; GFX10-NEXT: s_lshr_b32 s2, s4, 8
208 ; GFX10-NEXT: s_lshr_b32 s3, s4, 24
209 ; GFX10-NEXT: s_lshr_b32 s4, s5, 8
210 ; GFX10-NEXT: s_lshr_b32 s5, s5, 24
211 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
212 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
213 ; GFX10-NEXT: v_mov_b32_e32 v6, s2
214 ; GFX10-NEXT: v_mov_b32_e32 v7, s3
215 ; GFX10-NEXT: v_mov_b32_e32 v8, s4
216 ; GFX10-NEXT: v_mov_b32_e32 v9, s5
217 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8
218 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
219 ; GFX10-NEXT: ds_write_b8 v0, v2
220 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2
221 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:4
222 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
223 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
224 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11
225 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:1
226 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:3
227 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:5
228 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:7
229 ; GFX10-NEXT: s_endpgm
231 ; GFX11-LABEL: store_lds_v3i32_align1:
233 ; GFX11-NEXT: s_clause 0x1
234 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
235 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
236 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
238 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8
239 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
240 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24
241 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
242 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2
243 ; GFX11-NEXT: s_lshr_b32 s4, s0, 8
244 ; GFX11-NEXT: s_lshr_b32 s0, s0, 24
245 ; GFX11-NEXT: s_lshr_b32 s5, s1, 8
246 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24
247 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0
248 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1
249 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:8
250 ; GFX11-NEXT: ds_store_b8 v0, v2
251 ; GFX11-NEXT: ds_store_b8 v0, v4 offset:9
252 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10
253 ; GFX11-NEXT: ds_store_b8 v0, v5 offset:11
254 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:2
255 ; GFX11-NEXT: ds_store_b8 v0, v6 offset:1
256 ; GFX11-NEXT: ds_store_b8 v0, v3 offset:4
257 ; GFX11-NEXT: ds_store_b8 v0, v7 offset:3
258 ; GFX11-NEXT: ds_store_b8 v0, v8 offset:5
259 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6
260 ; GFX11-NEXT: ds_store_b8 v0, v9 offset:7
261 ; GFX11-NEXT: s_endpgm
262 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 2" on the store.
; Every check prefix expects six 16-bit stores at offsets 0, 2, 4, 6, 8 and 10.
; GFX9 and GFX10 pair ds_write_b16 with ds_write_b16_d16_hi; GFX11 uses
; ds_store_b16 / ds_store_b16_d16_hi. None of these three needs a shift.
; GFX7 and GFX6 use only ds_write_b16 and obtain each upper half with
; s_lshr_b32 by 16.
266 define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
267 ; GFX9-LABEL: store_lds_v3i32_align2:
269 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
270 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
271 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
272 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
273 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
274 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
275 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
276 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
277 ; GFX9-NEXT: ds_write_b16 v0, v2
278 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2
279 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
280 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
281 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6
282 ; GFX9-NEXT: s_endpgm
284 ; GFX7-LABEL: store_lds_v3i32_align2:
286 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
287 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
288 ; GFX7-NEXT: s_mov_b32 m0, -1
289 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
291 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
292 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
293 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
294 ; GFX7-NEXT: ds_write_b16 v0, v2
295 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
296 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
297 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:4
298 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
299 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
300 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
301 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
302 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16
303 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
304 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
305 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
306 ; GFX7-NEXT: s_endpgm
308 ; GFX6-LABEL: store_lds_v3i32_align2:
310 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
311 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
312 ; GFX6-NEXT: s_mov_b32 m0, -1
313 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
315 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
316 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
317 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
318 ; GFX6-NEXT: ds_write_b16 v0, v2
319 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
320 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
321 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:4
322 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
323 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
324 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
325 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
326 ; GFX6-NEXT: s_lshr_b32 s0, s1, 16
327 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
328 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
329 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
330 ; GFX6-NEXT: s_endpgm
332 ; GFX10-LABEL: store_lds_v3i32_align2:
334 ; GFX10-NEXT: s_clause 0x1
335 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
336 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
337 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
338 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
339 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
340 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
341 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
342 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8
343 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
344 ; GFX10-NEXT: ds_write_b16 v0, v2
345 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2
346 ; GFX10-NEXT: ds_write_b16 v0, v3 offset:4
347 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
348 ; GFX10-NEXT: s_endpgm
350 ; GFX11-LABEL: store_lds_v3i32_align2:
352 ; GFX11-NEXT: s_clause 0x1
353 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
354 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
355 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
357 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
358 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10
359 ; GFX11-NEXT: ds_store_b16 v0, v2
360 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2
361 ; GFX11-NEXT: ds_store_b16 v0, v3 offset:4
362 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:8
363 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6
364 ; GFX11-NEXT: s_endpgm
365 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 4" on the store.
; Every check prefix expects two instructions: a paired 32-bit store with
; offset1:1 for the first two elements and a single 32-bit store at offset 8
; for the third.
; GFX9, GFX7, GFX6 and GFX10 use ds_write2_b32 / ds_write_b32; GFX11 uses
; ds_store_2addr_b32 / ds_store_b32. The relative order of the two stores
; differs between targets.
369 define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
370 ; GFX9-LABEL: store_lds_v3i32_align4:
372 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
373 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
374 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
375 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
376 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
377 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
378 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
379 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
380 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8
381 ; GFX9-NEXT: s_endpgm
383 ; GFX7-LABEL: store_lds_v3i32_align4:
385 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
386 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
387 ; GFX7-NEXT: s_mov_b32 m0, -1
388 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
390 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
391 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
392 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
393 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
394 ; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
395 ; GFX7-NEXT: s_endpgm
397 ; GFX6-LABEL: store_lds_v3i32_align4:
399 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
400 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
401 ; GFX6-NEXT: s_mov_b32 m0, -1
402 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
403 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
404 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
405 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
406 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8
407 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
408 ; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1
409 ; GFX6-NEXT: s_endpgm
411 ; GFX10-LABEL: store_lds_v3i32_align4:
413 ; GFX10-NEXT: s_clause 0x1
414 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
415 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
416 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
417 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
418 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
419 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
420 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
421 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:8
422 ; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
423 ; GFX10-NEXT: s_endpgm
425 ; GFX11-LABEL: store_lds_v3i32_align4:
427 ; GFX11-NEXT: s_clause 0x1
428 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
429 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
430 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
432 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
433 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
434 ; GFX11-NEXT: ds_store_b32 v0, v3 offset:8
435 ; GFX11-NEXT: s_endpgm
436 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 8" on the store.
; Every check prefix expects the same split: a 32-bit store of the third
; element at offset 8 followed by one 64-bit store of the first two elements.
; GFX9, GFX7, GFX6 and GFX10 use ds_write_b32 / ds_write_b64; GFX11 uses
; ds_store_b32 / ds_store_b64.
440 define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
441 ; GFX9-LABEL: store_lds_v3i32_align8:
443 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
444 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
445 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
446 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
447 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
448 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
449 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
450 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
451 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
452 ; GFX9-NEXT: s_endpgm
454 ; GFX7-LABEL: store_lds_v3i32_align8:
456 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
457 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
458 ; GFX7-NEXT: s_mov_b32 m0, -1
459 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
461 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
462 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
463 ; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
464 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
465 ; GFX7-NEXT: ds_write_b64 v2, v[0:1]
466 ; GFX7-NEXT: s_endpgm
468 ; GFX6-LABEL: store_lds_v3i32_align8:
470 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
471 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
472 ; GFX6-NEXT: s_mov_b32 m0, -1
473 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
474 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
475 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
476 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
477 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
478 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
479 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
480 ; GFX6-NEXT: s_endpgm
482 ; GFX10-LABEL: store_lds_v3i32_align8:
484 ; GFX10-NEXT: s_clause 0x1
485 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
486 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
487 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
489 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
490 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
491 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
492 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8
493 ; GFX10-NEXT: ds_write_b64 v2, v[0:1]
494 ; GFX10-NEXT: s_endpgm
496 ; GFX11-LABEL: store_lds_v3i32_align8:
498 ; GFX11-NEXT: s_clause 0x1
499 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
500 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10
501 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2
503 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
504 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:8
505 ; GFX11-NEXT: ds_store_b64 v2, v[0:1]
506 ; GFX11-NEXT: s_endpgm
507 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 16" on the store.
; GFX9, GFX7 and GFX10 expect a single ds_write_b96 and GFX11 a single
; ds_store_b96.
; GFX6 is the only prefix that still splits the store, into a ds_write_b32 at
; offset 8 plus a ds_write_b64.
511 define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
512 ; GFX9-LABEL: store_lds_v3i32_align16:
514 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
515 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
516 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
517 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
518 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
519 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
520 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
521 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
522 ; GFX9-NEXT: s_endpgm
524 ; GFX7-LABEL: store_lds_v3i32_align16:
526 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
527 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
528 ; GFX7-NEXT: s_mov_b32 m0, -1
529 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
531 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
532 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
533 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
534 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
535 ; GFX7-NEXT: s_endpgm
537 ; GFX6-LABEL: store_lds_v3i32_align16:
539 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0
540 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4
541 ; GFX6-NEXT: s_mov_b32 m0, -1
542 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
544 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
545 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
546 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
547 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
548 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
549 ; GFX6-NEXT: s_endpgm
551 ; GFX10-LABEL: store_lds_v3i32_align16:
553 ; GFX10-NEXT: s_clause 0x1
554 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
555 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
556 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
558 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
559 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
560 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
561 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
562 ; GFX10-NEXT: s_endpgm
564 ; GFX11-LABEL: store_lds_v3i32_align16:
566 ; GFX11-NEXT: s_clause 0x1
567 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
568 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
569 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
570 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
571 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
572 ; GFX11-NEXT: ds_store_b96 v3, v[0:2]
573 ; GFX11-NEXT: s_endpgm
574 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16