1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
8 ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; Baseline case: store a <4 x i32> kernel argument through an addrspace(3)
; pointer with no explicit alignment on the store instruction.
; Every configuration checked below (gfx900, hawaii, gfx1010, gfx1100) expects
; the value to be written with a single 128-bit DS instruction
; (ds_write_b128, spelled ds_store_b128 on gfx1100).
; The hawaii output additionally sets m0 to -1 before the DS write.
10 define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
11 ; GFX9-LABEL: store_lds_v4i32:
13 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
14 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
17 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
18 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
19 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
20 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
21 ; GFX9-NEXT: ds_write_b128 v4, v[0:3]
24 ; GFX7-LABEL: store_lds_v4i32:
26 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
27 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
28 ; GFX7-NEXT: s_mov_b32 m0, -1
29 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
30 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
31 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
32 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
33 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
34 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
35 ; GFX7-NEXT: ds_write_b128 v4, v[0:3]
38 ; GFX10-LABEL: store_lds_v4i32:
40 ; GFX10-NEXT: s_clause 0x1
41 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
42 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
43 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
44 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
45 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
46 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
47 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
48 ; GFX10-NEXT: v_mov_b32_e32 v4, s2
49 ; GFX10-NEXT: ds_write_b128 v4, v[0:3]
50 ; GFX10-NEXT: s_endpgm
52 ; GFX11-LABEL: store_lds_v4i32:
54 ; GFX11-NEXT: s_clause 0x1
55 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
56 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
57 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
58 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
59 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
60 ; GFX11-NEXT: v_mov_b32_e32 v4, s0
61 ; GFX11-NEXT: ds_store_b128 v4, v[0:3]
62 ; GFX11-NEXT: s_endpgm
63 store <4 x i32> %x, ptr addrspace(3) %out
; Byte-aligned case: the same <4 x i32> store, but marked "align 1".
; Every configuration expects the 16-byte value to be broken into sixteen
; single-byte DS writes (ds_write_b8 / ds_store_b8) covering offsets 0..15.
; The expected byte extraction differs per target:
;   - gfx900, gfx1010, gfx1100: byte 1 of each dword is (dword & 0xffff) >> 8,
;     byte 2 is dword >> 16, and byte 3 is that 16-bit shift result >> 8.
;   - hawaii: byte 1 comes from s_bfe_u32 with operand 0x80008, and byte 3
;     from a direct shift right by 24.
; The gfx1010 and gfx1100 outputs interleave the writes out of offset order;
; the set of offsets written is still 0..15.
67 define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
68 ; GFX9-LABEL: store_lds_v4i32_align1:
70 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
71 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s4
74 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
75 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
76 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
77 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
78 ; GFX9-NEXT: ds_write_b8 v1, v0
79 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
80 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
81 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
82 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
83 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
84 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
85 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5
86 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
87 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
88 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
89 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16
90 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
91 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
92 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
93 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
94 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
95 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
96 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
97 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s6
98 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
99 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
100 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
101 ; GFX9-NEXT: s_lshr_b32 s0, s6, 16
102 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
103 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
104 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
105 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
106 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
107 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
108 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
109 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s7
110 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
111 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
112 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
113 ; GFX9-NEXT: s_lshr_b32 s0, s7, 16
114 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
115 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
116 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
117 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8
118 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
119 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
120 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
121 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15
122 ; GFX9-NEXT: s_endpgm
124 ; GFX7-LABEL: store_lds_v4i32_align1:
126 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
127 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
128 ; GFX7-NEXT: s_mov_b32 m0, -1
129 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
130 ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
131 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
132 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
133 ; GFX7-NEXT: s_lshr_b32 s1, s4, 16
134 ; GFX7-NEXT: ds_write_b8 v1, v0
135 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
136 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
137 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24
138 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
139 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
140 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
141 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
142 ; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
143 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
144 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16
145 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
146 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
147 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
148 ; GFX7-NEXT: s_lshr_b32 s1, s5, 24
149 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
150 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
151 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
152 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
153 ; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
154 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
155 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16
156 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
157 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
158 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
159 ; GFX7-NEXT: s_lshr_b32 s1, s6, 24
160 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
161 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
162 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
163 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
164 ; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008
165 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
166 ; GFX7-NEXT: s_lshr_b32 s0, s7, 16
167 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12
168 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
169 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:13
170 ; GFX7-NEXT: s_lshr_b32 s1, s7, 24
171 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
172 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:14
173 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
174 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:15
175 ; GFX7-NEXT: s_endpgm
177 ; GFX10-LABEL: store_lds_v4i32_align1:
179 ; GFX10-NEXT: s_clause 0x1
180 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
181 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
182 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX10-NEXT: s_lshr_b32 s0, s4, 16
184 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
185 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
186 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
187 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16
188 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5
189 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
190 ; GFX10-NEXT: s_lshr_b32 s1, s1, 8
191 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
192 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16
193 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6
194 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
195 ; GFX10-NEXT: s_lshr_b32 s6, s0, 8
196 ; GFX10-NEXT: s_lshr_b32 s0, s3, 8
197 ; GFX10-NEXT: s_lshr_b32 s3, s2, 8
198 ; GFX10-NEXT: v_mov_b32_e32 v5, s2
199 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
200 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8
201 ; GFX10-NEXT: v_mov_b32_e32 v7, s6
202 ; GFX10-NEXT: v_mov_b32_e32 v8, s0
203 ; GFX10-NEXT: v_mov_b32_e32 v9, s3
204 ; GFX10-NEXT: ds_write_b8 v1, v0
205 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
206 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
207 ; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
208 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
209 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
210 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
211 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
212 ; GFX10-NEXT: v_mov_b32_e32 v10, s2
213 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8
214 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
215 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
216 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
217 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
218 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
219 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s7
220 ; GFX10-NEXT: s_lshr_b32 s1, s7, 16
221 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8
222 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
223 ; GFX10-NEXT: v_mov_b32_e32 v3, s0
224 ; GFX10-NEXT: s_lshr_b32 s0, s1, 8
225 ; GFX10-NEXT: v_mov_b32_e32 v4, s1
226 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
227 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11
228 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:12
229 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:13
230 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:14
231 ; GFX10-NEXT: ds_write_b8 v1, v5 offset:15
232 ; GFX10-NEXT: s_endpgm
234 ; GFX11-LABEL: store_lds_v4i32_align1:
236 ; GFX11-NEXT: s_clause 0x1
237 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
238 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
239 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4
241 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16
242 ; GFX11-NEXT: s_lshr_b32 s2, s2, 8
243 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
244 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16
245 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s5
246 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
247 ; GFX11-NEXT: s_lshr_b32 s4, s6, 16
248 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s6
249 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8
250 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
251 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
252 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
253 ; GFX11-NEXT: s_lshr_b32 s1, s3, 8
254 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8
255 ; GFX11-NEXT: s_lshr_b32 s0, s5, 8
256 ; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3
257 ; GFX11-NEXT: ds_store_b8 v1, v0
258 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
259 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
260 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
261 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
262 ; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
263 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
264 ; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
265 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
266 ; GFX11-NEXT: s_lshr_b32 s0, s4, 8
267 ; GFX11-NEXT: s_lshr_b32 s1, s7, 16
268 ; GFX11-NEXT: v_mov_b32_e32 v4, s0
269 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s7
270 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1
271 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8
272 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
273 ; GFX11-NEXT: v_mov_b32_e32 v6, s0
274 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8
275 ; GFX11-NEXT: v_mov_b32_e32 v8, s0
276 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
277 ; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
278 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:10
279 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
280 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
281 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
282 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:14
283 ; GFX11-NEXT: ds_store_b8 v1, v8 offset:15
284 ; GFX11-NEXT: s_endpgm
285 store <4 x i32> %x, ptr addrspace(3) %out, align 1
; 2-byte-aligned case: the same <4 x i32> store, but marked "align 2".
; Every configuration expects eight 16-bit DS writes (ds_write_b16 /
; ds_store_b16) at the even offsets 0, 2, ..., 14. The low half of each dword
; is written straight from the loaded SGPR value; the high half is first
; produced with s_lshr_b32 by 16.
; gfx900 and hawaii recycle v0 for every data value, whereas gfx1010 and
; gfx1100 materialize all eight values in distinct VGPRs before the writes.
289 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
290 ; GFX9-LABEL: store_lds_v4i32_align2:
292 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
293 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
294 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
295 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
296 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
297 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
298 ; GFX9-NEXT: ds_write_b16 v1, v0
299 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
300 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
301 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16
302 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
303 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
304 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
305 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
306 ; GFX9-NEXT: s_lshr_b32 s0, s6, 16
307 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
308 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
309 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
310 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
311 ; GFX9-NEXT: s_lshr_b32 s0, s7, 16
312 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
313 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12
314 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
315 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14
316 ; GFX9-NEXT: s_endpgm
318 ; GFX7-LABEL: store_lds_v4i32_align2:
320 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
321 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
322 ; GFX7-NEXT: s_mov_b32 m0, -1
323 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
324 ; GFX7-NEXT: s_lshr_b32 s1, s4, 16
325 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
326 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
327 ; GFX7-NEXT: ds_write_b16 v1, v0
328 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
329 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
330 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16
331 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
332 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
333 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
334 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
335 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16
336 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
337 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
338 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
339 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
340 ; GFX7-NEXT: s_lshr_b32 s0, s7, 16
341 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
342 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:12
343 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
344 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:14
345 ; GFX7-NEXT: s_endpgm
347 ; GFX10-LABEL: store_lds_v4i32_align2:
349 ; GFX10-NEXT: s_clause 0x1
350 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
351 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
352 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
354 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
355 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
356 ; GFX10-NEXT: s_lshr_b32 s0, s4, 16
357 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
358 ; GFX10-NEXT: s_lshr_b32 s1, s5, 16
359 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16
360 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16
361 ; GFX10-NEXT: v_mov_b32_e32 v4, s7
362 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
363 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
364 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
365 ; GFX10-NEXT: v_mov_b32_e32 v8, s3
366 ; GFX10-NEXT: ds_write_b16 v1, v0
367 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4
368 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8
369 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:12
370 ; GFX10-NEXT: ds_write_b16 v1, v5 offset:2
371 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:6
372 ; GFX10-NEXT: ds_write_b16 v1, v7 offset:10
373 ; GFX10-NEXT: ds_write_b16 v1, v8 offset:14
374 ; GFX10-NEXT: s_endpgm
376 ; GFX11-LABEL: store_lds_v4i32_align2:
378 ; GFX11-NEXT: s_clause 0x1
379 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
380 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
381 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
382 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16
383 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
384 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16
385 ; GFX11-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s1
386 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
387 ; GFX11-NEXT: s_lshr_b32 s2, s6, 16
388 ; GFX11-NEXT: s_lshr_b32 s3, s7, 16
389 ; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s2
390 ; GFX11-NEXT: v_mov_b32_e32 v8, s3
391 ; GFX11-NEXT: ds_store_b16 v1, v0
392 ; GFX11-NEXT: ds_store_b16 v1, v5 offset:2
393 ; GFX11-NEXT: ds_store_b16 v1, v2 offset:4
394 ; GFX11-NEXT: ds_store_b16 v1, v6 offset:6
395 ; GFX11-NEXT: ds_store_b16 v1, v3 offset:8
396 ; GFX11-NEXT: ds_store_b16 v1, v7 offset:10
397 ; GFX11-NEXT: ds_store_b16 v1, v4 offset:12
398 ; GFX11-NEXT: ds_store_b16 v1, v8 offset:14
399 ; GFX11-NEXT: s_endpgm
400 store <4 x i32> %x, ptr addrspace(3) %out, align 2
; 4-byte-aligned case: the same <4 x i32> store, but marked "align 4".
; Every configuration expects two 2-address 32-bit DS writes (ds_write2_b32,
; spelled ds_store_2addr_b32 on gfx1100): the first covers dwords 0 and 1
; (offset1:1), the second covers dwords 2 and 3 (offset0:2 offset1:3).
404 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
405 ; GFX9-LABEL: store_lds_v4i32_align4:
407 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
408 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
409 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
411 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
412 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
413 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
414 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
415 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
416 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
417 ; GFX9-NEXT: s_endpgm
419 ; GFX7-LABEL: store_lds_v4i32_align4:
421 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
422 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
423 ; GFX7-NEXT: s_mov_b32 m0, -1
424 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
426 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
427 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
428 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
429 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
430 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
431 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3
432 ; GFX7-NEXT: s_endpgm
434 ; GFX10-LABEL: store_lds_v4i32_align4:
436 ; GFX10-NEXT: s_clause 0x1
437 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
438 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
439 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
441 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
442 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
443 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
444 ; GFX10-NEXT: v_mov_b32_e32 v4, s7
445 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
446 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
447 ; GFX10-NEXT: s_endpgm
449 ; GFX11-LABEL: store_lds_v4i32_align4:
451 ; GFX11-NEXT: s_clause 0x1
452 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
453 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
454 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
455 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
456 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
457 ; GFX11-NEXT: v_mov_b32_e32 v4, s7
458 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
459 ; GFX11-NEXT: ds_store_2addr_b32 v1, v3, v4 offset0:2 offset1:3
460 ; GFX11-NEXT: s_endpgm
461 store <4 x i32> %x, ptr addrspace(3) %out, align 4
; 8-byte-aligned case: the same <4 x i32> store, but marked "align 8".
; gfx900, hawaii and gfx1100 expect one 2-address 64-bit DS write
; (ds_write2_b64 / ds_store_2addr_b64 with offset1:1) covering all 16 bytes.
; gfx1010 instead expects two 2-address 32-bit writes, the same sequence that
; is recorded for the "align 4" variant of this test.
; NOTE(review): the gfx1010 difference is what the autogenerated assertions
; record; presumably it mirrors the code generator's output when the checks
; were produced. Confirm it is intended before relying on it or regenerating.
465 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
466 ; GFX9-LABEL: store_lds_v4i32_align8:
468 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
469 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
470 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
471 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
472 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
473 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
474 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
475 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
476 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
477 ; GFX9-NEXT: s_endpgm
479 ; GFX7-LABEL: store_lds_v4i32_align8:
481 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
482 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
483 ; GFX7-NEXT: s_mov_b32 m0, -1
484 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
486 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
487 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
488 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
489 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
490 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
491 ; GFX7-NEXT: s_endpgm
493 ; GFX10-LABEL: store_lds_v4i32_align8:
495 ; GFX10-NEXT: s_clause 0x1
496 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
497 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
498 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
499 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
500 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
501 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
502 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
503 ; GFX10-NEXT: v_mov_b32_e32 v4, s7
504 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
505 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
506 ; GFX10-NEXT: s_endpgm
508 ; GFX11-LABEL: store_lds_v4i32_align8:
510 ; GFX11-NEXT: s_clause 0x1
511 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
512 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
513 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
515 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
516 ; GFX11-NEXT: v_mov_b32_e32 v4, s0
517 ; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
518 ; GFX11-NEXT: s_endpgm
519 store <4 x i32> %x, ptr addrspace(3) %out, align 8
; 16-byte-aligned case: the same <4 x i32> store, but marked "align 16".
; Every configuration expects a single 128-bit DS write (ds_write_b128,
; spelled ds_store_b128 on gfx1100), i.e. the same DS instruction that is
; expected for @store_lds_v4i32, where the store carries no explicit alignment.
523 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
524 ; GFX9-LABEL: store_lds_v4i32_align16:
526 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
527 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
528 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
530 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
531 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
532 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
533 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
534 ; GFX9-NEXT: ds_write_b128 v4, v[0:3]
535 ; GFX9-NEXT: s_endpgm
537 ; GFX7-LABEL: store_lds_v4i32_align16:
539 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
540 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
541 ; GFX7-NEXT: s_mov_b32 m0, -1
542 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
544 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
545 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
546 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
547 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
548 ; GFX7-NEXT: ds_write_b128 v4, v[0:3]
549 ; GFX7-NEXT: s_endpgm
551 ; GFX10-LABEL: store_lds_v4i32_align16:
553 ; GFX10-NEXT: s_clause 0x1
554 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
555 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
556 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
558 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
559 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
560 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
561 ; GFX10-NEXT: v_mov_b32_e32 v4, s2
562 ; GFX10-NEXT: ds_write_b128 v4, v[0:3]
563 ; GFX10-NEXT: s_endpgm
565 ; GFX11-LABEL: store_lds_v4i32_align16:
567 ; GFX11-NEXT: s_clause 0x1
568 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
569 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
570 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
571 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
572 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
573 ; GFX11-NEXT: v_mov_b32_e32 v4, s0
574 ; GFX11-NEXT: ds_store_b128 v4, v[0:3]
575 ; GFX11-NEXT: s_endpgm
576 store <4 x i32> %x, ptr addrspace(3) %out, align 16