1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; store_lds_v4i32 -- stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with no explicit alignment on the store.
; Expected code per run (prefix-to-mcpu mapping is in the RUN lines at the top)
;   gfx900, hawaii, gfx1010 - a single ds_write_b128
;   gfx1100                 - a single ds_store_b128
;   tahiti                  - one ds_write2_b64 with offset1:1
; The hawaii and tahiti sequences also write -1 to m0 before the DS store.
; NOTE(review) addrspace(3) is presumably LDS, as the function name suggests -- confirm.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
8 define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
9 ; GFX9-LABEL: store_lds_v4i32:
11 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
12 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
15 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
16 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
17 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
18 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
19 ; GFX9-NEXT: ds_write_b128 v4, v[0:3]
22 ; GFX7-LABEL: store_lds_v4i32:
24 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
25 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
26 ; GFX7-NEXT: s_mov_b32 m0, -1
27 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
28 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
29 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
30 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
31 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
32 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
33 ; GFX7-NEXT: ds_write_b128 v4, v[0:3]
36 ; GFX6-LABEL: store_lds_v4i32:
38 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4
39 ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0
40 ; GFX6-NEXT: s_mov_b32 m0, -1
41 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
43 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
44 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
45 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
46 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
47 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
50 ; GFX10-LABEL: store_lds_v4i32:
52 ; GFX10-NEXT: s_clause 0x1
53 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
54 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
57 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
58 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
59 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
60 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
61 ; GFX10-NEXT: ds_write_b128 v4, v[0:3]
62 ; GFX10-NEXT: s_endpgm
64 ; GFX11-LABEL: store_lds_v4i32:
66 ; GFX11-NEXT: s_clause 0x1
67 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
68 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
69 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
71 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
72 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
73 ; GFX11-NEXT: ds_store_b128 v4, v[0:3]
74 ; GFX11-NEXT: s_endpgm
75 store <4 x i32> %x, ptr addrspace(3) %out
; store_lds_v4i32_align1 -- stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with the store marked "align 1".
; Every run is expected to emit sixteen single-byte DS stores covering byte
; offsets 0 through 15.
;   gfx900, gfx1010, gfx1100 - the third byte of each dword (offsets 2, 6, 10,
;                              14) is written with the b8_d16_hi store form,
;                              taking its data from the same VGPR as the
;                              dword's first byte
;   hawaii, tahiti           - those same offsets are produced by s_lshr_b32
;                              by 16 followed by a plain ds_write_b8
; The remaining bytes are extracted with s_lshr_b32 by 8 and by 24 on all runs.
; The hawaii and tahiti sequences also write -1 to m0 before the first DS store.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
79 define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
80 ; GFX9-LABEL: store_lds_v4i32_align1:
82 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
83 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
84 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
86 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
87 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
88 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
89 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
90 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:8
91 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
92 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
93 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4
94 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
95 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
96 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8
97 ; GFX9-NEXT: ds_write_b8 v0, v1
98 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
99 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
100 ; GFX9-NEXT: s_lshr_b32 s0, s7, 24
101 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13
102 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
103 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8
104 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15
105 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
106 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24
107 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
108 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
109 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8
110 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
111 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
112 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24
113 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
114 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
115 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8
116 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
117 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
118 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24
119 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
120 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
121 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
122 ; GFX9-NEXT: s_endpgm
124 ; GFX7-LABEL: store_lds_v4i32_align1:
126 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
127 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
128 ; GFX7-NEXT: s_mov_b32 m0, -1
129 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
130 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
131 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
132 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
133 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:12
134 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
135 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
136 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:4
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
138 ; GFX7-NEXT: s_lshr_b32 s4, s3, 8
139 ; GFX7-NEXT: ds_write_b8 v0, v1
140 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
141 ; GFX7-NEXT: s_lshr_b32 s4, s3, 24
142 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
143 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
144 ; GFX7-NEXT: s_lshr_b32 s3, s3, 16
145 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:15
146 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
147 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8
148 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:14
149 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
150 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24
151 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
152 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
153 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
154 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
155 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
156 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8
157 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
158 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
159 ; GFX7-NEXT: s_lshr_b32 s2, s1, 24
160 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
161 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
162 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
163 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
164 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
165 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8
166 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
167 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
168 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24
169 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
170 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
171 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
172 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
173 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
174 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
175 ; GFX7-NEXT: s_endpgm
177 ; GFX6-LABEL: store_lds_v4i32_align1:
179 ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0
180 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
181 ; GFX6-NEXT: s_mov_b32 m0, -1
182 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
184 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
185 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
186 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:12
187 ; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
188 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
189 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:4
190 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
191 ; GFX6-NEXT: s_lshr_b32 s4, s3, 8
192 ; GFX6-NEXT: ds_write_b8 v0, v1
193 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
194 ; GFX6-NEXT: s_lshr_b32 s4, s3, 24
195 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:13
196 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
197 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
198 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:15
199 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
200 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8
201 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:14
202 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
203 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24
204 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
205 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
206 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
207 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
208 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
209 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8
210 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
211 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
212 ; GFX6-NEXT: s_lshr_b32 s2, s1, 24
213 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
214 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
215 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
216 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
217 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
218 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8
219 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
220 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
221 ; GFX6-NEXT: s_lshr_b32 s1, s0, 24
222 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
223 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
224 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
225 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
226 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
227 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
228 ; GFX6-NEXT: s_endpgm
230 ; GFX10-LABEL: store_lds_v4i32_align1:
232 ; GFX10-NEXT: s_clause 0x1
233 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
234 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
235 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
236 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
237 ; GFX10-NEXT: v_mov_b32_e32 v1, s7
238 ; GFX10-NEXT: s_lshr_b32 s3, s6, 24
239 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
240 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8
241 ; GFX10-NEXT: s_lshr_b32 s2, s6, 8
242 ; GFX10-NEXT: s_lshr_b32 s6, s5, 8
243 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
244 ; GFX10-NEXT: s_lshr_b32 s1, s7, 24
245 ; GFX10-NEXT: s_lshr_b32 s5, s5, 24
246 ; GFX10-NEXT: v_mov_b32_e32 v8, s3
247 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
248 ; GFX10-NEXT: v_mov_b32_e32 v9, s6
249 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8
250 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
251 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
252 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
253 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:12
254 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
255 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:8
256 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
257 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:4
258 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
259 ; GFX10-NEXT: ds_write_b8 v0, v4
260 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2
261 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:13
262 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:15
263 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:9
264 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
265 ; GFX10-NEXT: s_lshr_b32 s1, s4, 24
266 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
267 ; GFX10-NEXT: v_mov_b32_e32 v3, s1
268 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:11
269 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:5
270 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
271 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:1
272 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:3
273 ; GFX10-NEXT: s_endpgm
275 ; GFX11-LABEL: store_lds_v4i32_align1:
277 ; GFX11-NEXT: s_clause 0x1
278 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
279 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
280 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3
282 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8
283 ; GFX11-NEXT: s_lshr_b32 s3, s3, 24
284 ; GFX11-NEXT: s_lshr_b32 s5, s2, 8
285 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1
286 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24
287 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8
288 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5
289 ; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6
290 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4
291 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24
292 ; GFX11-NEXT: s_lshr_b32 s7, s0, 8
293 ; GFX11-NEXT: s_lshr_b32 s0, s0, 24
294 ; GFX11-NEXT: ds_store_b8 v0, v2 offset:8
295 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:10
296 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:12
297 ; GFX11-NEXT: ds_store_b8 v0, v4
298 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:2
299 ; GFX11-NEXT: ds_store_b8 v0, v3 offset:4
300 ; GFX11-NEXT: ds_store_b8 v0, v5 offset:13
301 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:14
302 ; GFX11-NEXT: ds_store_b8 v0, v6 offset:15
303 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7
304 ; GFX11-NEXT: v_mov_b32_e32 v11, s0
305 ; GFX11-NEXT: ds_store_b8 v0, v7 offset:9
306 ; GFX11-NEXT: ds_store_b8 v0, v8 offset:11
307 ; GFX11-NEXT: ds_store_b8 v0, v9 offset:5
308 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6
309 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:7
310 ; GFX11-NEXT: ds_store_b8 v0, v10 offset:1
311 ; GFX11-NEXT: ds_store_b8 v0, v11 offset:3
312 ; GFX11-NEXT: s_endpgm
313 store <4 x i32> %x, ptr addrspace(3) %out, align 1
; store_lds_v4i32_align2 -- stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with the store marked "align 2".
; Every run is expected to emit eight 16-bit DS stores at even byte offsets
; 0 through 14.
;   gfx900, gfx1010, gfx1100 - the upper half of each dword (offsets 2, 6, 10,
;                              14) is written with the b16_d16_hi store form,
;                              so no shift instructions appear
;   hawaii, tahiti           - the upper halves are produced by s_lshr_b32 by
;                              16 followed by a plain ds_write_b16
; The hawaii and tahiti sequences also write -1 to m0 before the first DS store.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
317 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
318 ; GFX9-LABEL: store_lds_v4i32_align2:
320 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
321 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
324 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
325 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
326 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12
327 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
328 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
329 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
330 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
331 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
332 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6
333 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
334 ; GFX9-NEXT: ds_write_b16 v0, v1
335 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
336 ; GFX9-NEXT: s_endpgm
338 ; GFX7-LABEL: store_lds_v4i32_align2:
340 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
341 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
342 ; GFX7-NEXT: s_mov_b32 m0, -1
343 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
344 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
345 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
346 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
347 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:12
348 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
349 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
350 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:4
351 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
352 ; GFX7-NEXT: s_lshr_b32 s3, s3, 16
353 ; GFX7-NEXT: ds_write_b16 v0, v1
354 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
355 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
356 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:14
357 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
358 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
359 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
360 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
361 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
362 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
363 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
364 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
365 ; GFX7-NEXT: s_endpgm
367 ; GFX6-LABEL: store_lds_v4i32_align2:
369 ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0
370 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
371 ; GFX6-NEXT: s_mov_b32 m0, -1
372 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
373 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
374 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
375 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
376 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:12
377 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
378 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
379 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:4
380 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
381 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
382 ; GFX6-NEXT: ds_write_b16 v0, v1
383 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
384 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
385 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:14
386 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
387 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
388 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
389 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
390 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
391 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
392 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
393 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
394 ; GFX6-NEXT: s_endpgm
396 ; GFX10-LABEL: store_lds_v4i32_align2:
398 ; GFX10-NEXT: s_clause 0x1
399 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
400 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
401 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
403 ; GFX10-NEXT: v_mov_b32_e32 v1, s7
404 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
405 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
406 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
407 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:12
408 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
409 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:8
410 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
411 ; GFX10-NEXT: ds_write_b16 v0, v3 offset:4
412 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
413 ; GFX10-NEXT: ds_write_b16 v0, v4
414 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2
415 ; GFX10-NEXT: s_endpgm
417 ; GFX11-LABEL: store_lds_v4i32_align2:
419 ; GFX11-NEXT: s_clause 0x1
420 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
421 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
422 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3
424 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
425 ; GFX11-NEXT: v_mov_b32_e32 v4, s2
426 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:14
427 ; GFX11-NEXT: ds_store_b16 v0, v2
428 ; GFX11-NEXT: ds_store_b16 v0, v3 offset:4
429 ; GFX11-NEXT: ds_store_b16 v0, v4 offset:8
430 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:12
431 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:10
432 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6
433 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2
434 ; GFX11-NEXT: s_endpgm
435 store <4 x i32> %x, ptr addrspace(3) %out, align 2
; store_lds_v4i32_align4 -- stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with the store marked "align 4".
; Every run is expected to emit two paired 32-bit DS stores, one with
; offset1:1 and one with offset0:2 offset1:3. The instruction is spelled
; ds_write2_b32 on gfx900, hawaii, tahiti and gfx1010, and ds_store_2addr_b32
; on gfx1100.
; The order of the two paired stores differs between runs; the tahiti and
; gfx1010 sequences emit the offset0:2 offset1:3 pair first.
; The hawaii and tahiti sequences also write -1 to m0 before the first DS store.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
439 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
440 ; GFX9-LABEL: store_lds_v4i32_align4:
442 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
443 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
444 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
445 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
446 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
447 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
448 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
449 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
450 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
451 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
452 ; GFX9-NEXT: s_endpgm
454 ; GFX7-LABEL: store_lds_v4i32_align4:
456 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
457 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
458 ; GFX7-NEXT: s_mov_b32 m0, -1
459 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
461 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
462 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
463 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
464 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
465 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
466 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
467 ; GFX7-NEXT: s_endpgm
469 ; GFX6-LABEL: store_lds_v4i32_align4:
471 ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0
472 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
473 ; GFX6-NEXT: s_mov_b32 m0, -1
474 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
476 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
477 ; GFX6-NEXT: v_mov_b32_e32 v2, s3
478 ; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
479 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
480 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
481 ; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
482 ; GFX6-NEXT: s_endpgm
484 ; GFX10-LABEL: store_lds_v4i32_align4:
486 ; GFX10-NEXT: s_clause 0x1
487 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
488 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
489 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
490 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
491 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
492 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
493 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
494 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
495 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
496 ; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
497 ; GFX10-NEXT: s_endpgm
499 ; GFX11-LABEL: store_lds_v4i32_align4:
501 ; GFX11-NEXT: s_clause 0x1
502 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
503 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
504 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
505 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
506 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
507 ; GFX11-NEXT: v_mov_b32_e32 v4, s3
508 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
509 ; GFX11-NEXT: ds_store_2addr_b32 v0, v3, v4 offset0:2 offset1:3
510 ; GFX11-NEXT: s_endpgm
511 store <4 x i32> %x, ptr addrspace(3) %out, align 4
; store_lds_v4i32_align8 -- stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with the store marked "align 8".
; Every run is expected to emit exactly one paired 64-bit DS store with
; offset1:1, taking v[0:1] and v[2:3] as its two data operands. The
; instruction is spelled ds_write2_b64 on gfx900, hawaii, tahiti and gfx1010,
; and ds_store_2addr_b64 on gfx1100.
; The hawaii and tahiti sequences also write -1 to m0 before the DS store.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
515 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
516 ; GFX9-LABEL: store_lds_v4i32_align8:
518 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
519 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
520 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
522 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
523 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
524 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
525 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
526 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
527 ; GFX9-NEXT: s_endpgm
529 ; GFX7-LABEL: store_lds_v4i32_align8:
531 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
532 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
533 ; GFX7-NEXT: s_mov_b32 m0, -1
534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
536 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
537 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
538 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
539 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
540 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
541 ; GFX7-NEXT: s_endpgm
543 ; GFX6-LABEL: store_lds_v4i32_align8:
545 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4
546 ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0
547 ; GFX6-NEXT: s_mov_b32 m0, -1
548 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
549 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
550 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
551 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
552 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
553 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
554 ; GFX6-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
555 ; GFX6-NEXT: s_endpgm
557 ; GFX10-LABEL: store_lds_v4i32_align8:
559 ; GFX10-NEXT: s_clause 0x1
560 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
561 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
562 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
564 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
565 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
566 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
567 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
568 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
569 ; GFX10-NEXT: s_endpgm
571 ; GFX11-LABEL: store_lds_v4i32_align8:
573 ; GFX11-NEXT: s_clause 0x1
574 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
575 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
576 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
578 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
579 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
580 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
581 ; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
582 ; GFX11-NEXT: s_endpgm
583 store <4 x i32> %x, ptr addrspace(3) %out, align 8
; store_lds_v4i32_align16 -- stores the <4 x i32> kernel argument %x through
; the addrspace(3) pointer %out, with the store marked "align 16".
; Expected code per run (prefix-to-mcpu mapping is in the RUN lines at the top)
;   gfx900, hawaii, gfx1010 - a single ds_write_b128
;   gfx1100                 - a single ds_store_b128
;   tahiti                  - one ds_write2_b64 with offset1:1
; The hawaii and tahiti sequences also write -1 to m0 before the DS store.
; The assertions below are autogenerated (see the file header), so regenerate
; them with the update script instead of editing them by hand.
587 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
588 ; GFX9-LABEL: store_lds_v4i32_align16:
590 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
591 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
592 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
593 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
594 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
595 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
596 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
597 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
598 ; GFX9-NEXT: ds_write_b128 v4, v[0:3]
599 ; GFX9-NEXT: s_endpgm
601 ; GFX7-LABEL: store_lds_v4i32_align16:
603 ; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
604 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
605 ; GFX7-NEXT: s_mov_b32 m0, -1
606 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
608 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
609 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
610 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
611 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
612 ; GFX7-NEXT: ds_write_b128 v4, v[0:3]
613 ; GFX7-NEXT: s_endpgm
615 ; GFX6-LABEL: store_lds_v4i32_align16:
617 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4
618 ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0
619 ; GFX6-NEXT: s_mov_b32 m0, -1
620 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
622 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
623 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
624 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
625 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
626 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
627 ; GFX6-NEXT: s_endpgm
629 ; GFX10-LABEL: store_lds_v4i32_align16:
631 ; GFX10-NEXT: s_clause 0x1
632 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
633 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10
634 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
635 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
636 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
637 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
638 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
639 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
640 ; GFX10-NEXT: ds_write_b128 v4, v[0:3]
641 ; GFX10-NEXT: s_endpgm
643 ; GFX11-LABEL: store_lds_v4i32_align16:
645 ; GFX11-NEXT: s_clause 0x1
646 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0
647 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
648 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
650 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
651 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
652 ; GFX11-NEXT: ds_store_b128 v4, v[0:3]
653 ; GFX11-NEXT: s_endpgm
654 store <4 x i32> %x, ptr addrspace(3) %out, align 16