1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; Stores a <3 x i32> kernel argument through an addrspace(3) pointer with no
; explicit alignment on the store. The expected code for every run line is
; the same as for @store_lds_v3i32_align16: a single ds_write_b96 on gfx900,
; hawaii and gfx1010, and a ds_write_b32 at offset 8 plus a ds_write_b64 on
; tahiti.
7 define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
8 ; GFX9-LABEL: store_lds_v3i32:
10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
14 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
15 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
16 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
17 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
20 ; GFX7-LABEL: store_lds_v3i32:
22 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
23 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
24 ; GFX7-NEXT: s_mov_b32 m0, -1
25 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
26 ; GFX7-NEXT: v_mov_b32_e32 v3, s4
27 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
29 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
30 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
33 ; GFX6-LABEL: store_lds_v3i32:
35 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
36 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
37 ; GFX6-NEXT: s_mov_b32 m0, -1
38 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
40 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
41 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
42 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
43 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
44 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
47 ; GFX10-LABEL: store_lds_v3i32:
49 ; GFX10-NEXT: s_clause 0x1
50 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
51 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
52 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
54 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
55 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
56 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
57 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
58 ; GFX10-NEXT: s_endpgm
59 store <3 x i32> %x, <3 x i32> addrspace(3)* %out
; Same <3 x i32> addrspace(3) store, but with "align 1". Every run line
; expects the value to be written as twelve single-byte stores (offsets 0-11).
; The gfx900 and gfx1010 runs write the bytes at offsets 2, 6 and 10 with
; ds_write_b8_d16_hi and only expect s_lshr_b32 by 8 and 24 for the odd bytes.
; The hawaii and tahiti runs expect an s_lshr_b32 by 8, 16 or 24 in front of
; every non-zero byte lane, followed by a plain ds_write_b8.
63 define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
64 ; GFX9-LABEL: store_lds_v3i32_align1:
66 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
67 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
70 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
71 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
72 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
73 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
74 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4
75 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
76 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
77 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8
78 ; GFX9-NEXT: ds_write_b8 v0, v1
79 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
80 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
81 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24
82 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
83 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
84 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8
85 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
86 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
87 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24
88 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
89 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
90 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8
91 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
92 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
93 ; GFX9-NEXT: s_lshr_b32 s0, s4, 24
94 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
95 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
96 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
99 ; GFX7-LABEL: store_lds_v3i32_align1:
101 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
102 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
103 ; GFX7-NEXT: s_mov_b32 m0, -1
104 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
105 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
106 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
107 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
108 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
109 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
110 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
111 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8
112 ; GFX7-NEXT: ds_write_b8 v0, v1
113 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
114 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24
115 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
116 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
117 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
118 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
119 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
120 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8
121 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
122 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
123 ; GFX7-NEXT: s_lshr_b32 s2, s1, 24
124 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
125 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
126 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
127 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
128 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
129 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8
130 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
131 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
132 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24
133 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
134 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
135 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
136 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
138 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
139 ; GFX7-NEXT: s_endpgm
141 ; GFX6-LABEL: store_lds_v3i32_align1:
143 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
144 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
145 ; GFX6-NEXT: s_mov_b32 m0, -1
146 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
148 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
149 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
150 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
151 ; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
152 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
153 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8
154 ; GFX6-NEXT: ds_write_b8 v0, v1
155 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
156 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24
157 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
158 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
159 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
160 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
161 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
162 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8
163 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
164 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
165 ; GFX6-NEXT: s_lshr_b32 s2, s1, 24
166 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
167 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
168 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
169 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
170 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
171 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8
172 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
173 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
174 ; GFX6-NEXT: s_lshr_b32 s1, s0, 24
175 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
176 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
177 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
178 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
179 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
180 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
181 ; GFX6-NEXT: s_endpgm
183 ; GFX10-LABEL: store_lds_v3i32_align1:
185 ; GFX10-NEXT: s_clause 0x1
186 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
187 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
188 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
189 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
190 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
191 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
192 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8
193 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24
194 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8
195 ; GFX10-NEXT: s_lshr_b32 s3, s5, 24
196 ; GFX10-NEXT: s_lshr_b32 s5, s4, 8
197 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
198 ; GFX10-NEXT: s_lshr_b32 s4, s4, 24
199 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
200 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
201 ; GFX10-NEXT: v_mov_b32_e32 v6, s2
202 ; GFX10-NEXT: v_mov_b32_e32 v7, s3
203 ; GFX10-NEXT: v_mov_b32_e32 v8, s5
204 ; GFX10-NEXT: v_mov_b32_e32 v9, s4
205 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8
206 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
207 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
208 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
209 ; GFX10-NEXT: ds_write_b8 v0, v3
210 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2
211 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
212 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11
213 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:5
214 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:7
215 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:1
216 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:3
217 ; GFX10-NEXT: s_endpgm
218 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
; Same <3 x i32> addrspace(3) store, but with "align 2". Every run line
; expects six 16-bit stores (offsets 0, 2, 4, 6, 8, 10). The gfx900 and
; gfx1010 runs use ds_write_b16_d16_hi for the upper halves (offsets 2, 6,
; 10) and expect no scalar shifts; the hawaii and tahiti runs expect an
; s_lshr_b32 by 16 followed by a plain ds_write_b16 for those halves.
222 define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
223 ; GFX9-LABEL: store_lds_v3i32_align2:
225 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
226 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
227 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
228 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
229 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
230 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
231 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
232 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
233 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
234 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
235 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
236 ; GFX9-NEXT: ds_write_b16 v0, v1
237 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
238 ; GFX9-NEXT: s_endpgm
240 ; GFX7-LABEL: store_lds_v3i32_align2:
242 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
243 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
244 ; GFX7-NEXT: s_mov_b32 m0, -1
245 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
246 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
247 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
248 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
249 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
250 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
251 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
252 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16
253 ; GFX7-NEXT: ds_write_b16 v0, v1
254 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
255 ; GFX7-NEXT: s_lshr_b32 s1, s1, 16
256 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
257 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
258 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
259 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
260 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
261 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
262 ; GFX7-NEXT: s_endpgm
264 ; GFX6-LABEL: store_lds_v3i32_align2:
266 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
267 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
268 ; GFX6-NEXT: s_mov_b32 m0, -1
269 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
271 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
272 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
273 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
274 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
275 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
276 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16
277 ; GFX6-NEXT: ds_write_b16 v0, v1
278 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
279 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
280 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
281 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
282 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
283 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
284 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
285 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
286 ; GFX6-NEXT: s_endpgm
288 ; GFX10-LABEL: store_lds_v3i32_align2:
290 ; GFX10-NEXT: s_clause 0x1
291 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
292 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
293 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
295 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
296 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
297 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
298 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8
299 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
300 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:4
301 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
302 ; GFX10-NEXT: ds_write_b16 v0, v3
303 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2
304 ; GFX10-NEXT: s_endpgm
305 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
; Same <3 x i32> addrspace(3) store, but with "align 4". All four run lines
; expect the same shape: one ds_write2_b32 with offset1:1 for the first two
; elements and one ds_write_b32 at offset 8 for the third. Only register
; assignment and instruction order differ between the runs.
309 define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
310 ; GFX9-LABEL: store_lds_v3i32_align4:
312 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
313 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
314 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
315 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
316 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
317 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
318 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
319 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
320 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8
321 ; GFX9-NEXT: s_endpgm
323 ; GFX7-LABEL: store_lds_v3i32_align4:
325 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
326 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
327 ; GFX7-NEXT: s_mov_b32 m0, -1
328 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
330 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
331 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
332 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
333 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
334 ; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
335 ; GFX7-NEXT: s_endpgm
337 ; GFX6-LABEL: store_lds_v3i32_align4:
339 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
340 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
341 ; GFX6-NEXT: s_mov_b32 m0, -1
342 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
343 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
344 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
345 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
346 ; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1
347 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
348 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8
349 ; GFX6-NEXT: s_endpgm
351 ; GFX10-LABEL: store_lds_v3i32_align4:
353 ; GFX10-NEXT: s_clause 0x1
354 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
355 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
356 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
357 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
358 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
359 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
360 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
361 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
362 ; GFX10-NEXT: ds_write_b32 v0, v3 offset:8
363 ; GFX10-NEXT: s_endpgm
364 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
; Same <3 x i32> addrspace(3) store, but with "align 8". All four run lines
; expect a ds_write_b32 at offset 8 for the third element and a ds_write_b64
; at offset 0 for the first two; none of them expects a single 96-bit write.
368 define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
369 ; GFX9-LABEL: store_lds_v3i32_align8:
371 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
372 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
375 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
376 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
377 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
378 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
379 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
380 ; GFX9-NEXT: s_endpgm
382 ; GFX7-LABEL: store_lds_v3i32_align8:
384 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
385 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
386 ; GFX7-NEXT: s_mov_b32 m0, -1
387 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
389 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
390 ; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
391 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
392 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
393 ; GFX7-NEXT: ds_write_b64 v2, v[0:1]
394 ; GFX7-NEXT: s_endpgm
396 ; GFX6-LABEL: store_lds_v3i32_align8:
398 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
399 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
400 ; GFX6-NEXT: s_mov_b32 m0, -1
401 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
403 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
404 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
405 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
406 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
407 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
408 ; GFX6-NEXT: s_endpgm
410 ; GFX10-LABEL: store_lds_v3i32_align8:
412 ; GFX10-NEXT: s_clause 0x1
413 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
414 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
415 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
416 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
417 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
418 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
419 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
420 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8
421 ; GFX10-NEXT: ds_write_b64 v2, v[0:1]
422 ; GFX10-NEXT: s_endpgm
423 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
; Same <3 x i32> addrspace(3) store, but with "align 16". The gfx900, hawaii
; and gfx1010 runs expect the whole value to go out in a single
; ds_write_b96. The tahiti run instead expects the same split as the align 8
; case: a ds_write_b32 at offset 8 followed by a ds_write_b64.
427 define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
428 ; GFX9-LABEL: store_lds_v3i32_align16:
430 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
431 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
432 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
434 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
435 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
436 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
437 ; GFX9-NEXT: ds_write_b96 v3, v[0:2]
438 ; GFX9-NEXT: s_endpgm
440 ; GFX7-LABEL: store_lds_v3i32_align16:
442 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
443 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
444 ; GFX7-NEXT: s_mov_b32 m0, -1
445 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
446 ; GFX7-NEXT: v_mov_b32_e32 v3, s4
447 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
448 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
449 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
450 ; GFX7-NEXT: ds_write_b96 v3, v[0:2]
451 ; GFX7-NEXT: s_endpgm
453 ; GFX6-LABEL: store_lds_v3i32_align16:
455 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
456 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
457 ; GFX6-NEXT: s_mov_b32 m0, -1
458 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
460 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
461 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
462 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
463 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
464 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
465 ; GFX6-NEXT: s_endpgm
467 ; GFX10-LABEL: store_lds_v3i32_align16:
469 ; GFX10-NEXT: s_clause 0x1
470 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
471 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24
472 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
474 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
475 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
476 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
477 ; GFX10-NEXT: ds_write_b96 v3, v[0:2]
478 ; GFX10-NEXT: s_endpgm
479 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16