1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; Loads a <3 x i32> through a ptr addrspace(3) pointer with no explicit
; alignment on the load instruction.
; Expected lowering per RUN line: a single 96-bit DS read (ds_read_b96, spelled
; ds_load_b96 for gfx1100) on gfx900, hawaii, gfx1010 and gfx1100; on tahiti the
; load is done as ds_read_b64 at the base address plus ds_read_b32 at base + 8.
; NOTE(review): the expected code is identical to the explicit `align 16`
; variant in this file, so the implied alignment is presumably 16 - confirm
; against the target datalayout.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8 define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
9 ; GFX9-LABEL: load_lds_v3i32:
11 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX9-NEXT: ds_read_b96 v[0:2], v0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: s_setpc_b64 s[30:31]
16 ; GFX7-LABEL: load_lds_v3i32:
18 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX7-NEXT: s_mov_b32 m0, -1
20 ; GFX7-NEXT: ds_read_b96 v[0:2], v0
21 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
22 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24 ; GFX6-LABEL: load_lds_v3i32:
26 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
28 ; GFX6-NEXT: s_mov_b32 m0, -1
29 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
30 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
31 ; GFX6-NEXT: ds_read_b32 v2, v2
32 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX6-NEXT: s_setpc_b64 s[30:31]
35 ; GFX10-LABEL: load_lds_v3i32:
37 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38 ; GFX10-NEXT: ds_read_b96 v[0:2], v0
39 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX10-NEXT: s_setpc_b64 s[30:31]
42 ; GFX11-LABEL: load_lds_v3i32:
44 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45 ; GFX11-NEXT: ds_load_b96 v[0:2], v0
46 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
47 ; GFX11-NEXT: s_setpc_b64 s[30:31]
48 %load = load <3 x i32>, ptr addrspace(3) %ptr
; Loads a <3 x i32> through a ptr addrspace(3) pointer with `align 1`.
; Expected lowering per RUN line: twelve single-byte DS reads (ds_read_u8,
; spelled ds_load_u8 for gfx1100) covering byte offsets 0..11, reassembled into
; the three result dwords v0..v2 with shift/or sequences:
;   - gfx900, gfx1010, gfx1100 use the fused v_lshl_or_b32;
;   - hawaii and tahiti use separate v_lshlrev_b32 + v_or_b32 pairs.
; tahiti additionally computes every byte address with v_add_i32 instead of
; using the `offset:` field on the DS instruction.
; The s_waitcnt lgkmcnt(N) values between the combines are part of the
; autogenerated expectations; regenerate with utils/update_llc_test_checks.py
; rather than editing them by hand.
52 define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
53 ; GFX9-LABEL: load_lds_v3i32_align1:
55 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX9-NEXT: ds_read_u8 v1, v0
57 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
58 ; GFX9-NEXT: ds_read_u8 v3, v0 offset:2
59 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:3
60 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:4
61 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:5
62 ; GFX9-NEXT: ds_read_u8 v7, v0 offset:6
63 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:7
64 ; GFX9-NEXT: ds_read_u8 v9, v0 offset:8
65 ; GFX9-NEXT: ds_read_u8 v10, v0 offset:9
66 ; GFX9-NEXT: ds_read_u8 v11, v0 offset:10
67 ; GFX9-NEXT: ds_read_u8 v12, v0 offset:11
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(10)
69 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1
70 ; GFX9-NEXT: s_waitcnt lgkmcnt(8)
71 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3
72 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
73 ; GFX9-NEXT: s_waitcnt lgkmcnt(6)
74 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
75 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
76 ; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7
77 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
78 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
79 ; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9
80 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11
82 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
83 ; GFX9-NEXT: s_setpc_b64 s[30:31]
85 ; GFX7-LABEL: load_lds_v3i32_align1:
87 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88 ; GFX7-NEXT: s_mov_b32 m0, -1
89 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:6
90 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:4
91 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
92 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:1
93 ; GFX7-NEXT: ds_read_u8 v5, v0
94 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
95 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
96 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
97 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
98 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
99 ; GFX7-NEXT: s_waitcnt lgkmcnt(3)
100 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
101 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
102 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
103 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
104 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
105 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
106 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
107 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v7
108 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:11
109 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
110 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:9
111 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
112 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
113 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
114 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8
115 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1
116 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
117 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
118 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
119 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v7
120 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
122 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
123 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
124 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
125 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
126 ; GFX7-NEXT: v_mov_b32_e32 v0, v3
127 ; GFX7-NEXT: s_setpc_b64 s[30:31]
129 ; GFX6-LABEL: load_lds_v3i32_align1:
131 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 5, v0
133 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 4, v0
134 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 7, v0
135 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 6, v0
136 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 9, v0
137 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0
138 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0
139 ; GFX6-NEXT: s_mov_b32 m0, -1
140 ; GFX6-NEXT: ds_read_u8 v1, v1
141 ; GFX6-NEXT: ds_read_u8 v2, v2
142 ; GFX6-NEXT: ds_read_u8 v3, v3
143 ; GFX6-NEXT: ds_read_u8 v4, v4
144 ; GFX6-NEXT: ds_read_u8 v5, v5
145 ; GFX6-NEXT: ds_read_u8 v6, v6
146 ; GFX6-NEXT: ds_read_u8 v7, v7
147 ; GFX6-NEXT: ds_read_u8 v8, v0
148 ; GFX6-NEXT: s_waitcnt lgkmcnt(7)
149 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
150 ; GFX6-NEXT: s_waitcnt lgkmcnt(6)
151 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
152 ; GFX6-NEXT: s_waitcnt lgkmcnt(5)
153 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
154 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
155 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
156 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0
157 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
158 ; GFX6-NEXT: ds_read_u8 v4, v4
159 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
160 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
161 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
162 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
163 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
164 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 3, v0
165 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0
166 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
167 ; GFX6-NEXT: ds_read_u8 v5, v5
168 ; GFX6-NEXT: ds_read_u8 v6, v6
169 ; GFX6-NEXT: ds_read_u8 v0, v0
170 ; GFX6-NEXT: s_waitcnt lgkmcnt(5)
171 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7
172 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
173 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
174 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
175 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
176 ; GFX6-NEXT: s_waitcnt lgkmcnt(2)
177 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v5
178 ; GFX6-NEXT: s_waitcnt lgkmcnt(1)
179 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6
180 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
182 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
183 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8
184 ; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
185 ; GFX6-NEXT: s_setpc_b64 s[30:31]
187 ; GFX10-LABEL: load_lds_v3i32_align1:
189 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; GFX10-NEXT: ds_read_u8 v1, v0
191 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
192 ; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
193 ; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
194 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
195 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
196 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
197 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
198 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
199 ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
200 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
201 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:11
202 ; GFX10-NEXT: s_waitcnt lgkmcnt(10)
203 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
204 ; GFX10-NEXT: s_waitcnt lgkmcnt(8)
205 ; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3
206 ; GFX10-NEXT: s_waitcnt lgkmcnt(6)
207 ; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5
208 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
209 ; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7
210 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
211 ; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9
212 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v11
214 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
215 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
216 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
217 ; GFX10-NEXT: s_setpc_b64 s[30:31]
219 ; GFX11-LABEL: load_lds_v3i32_align1:
221 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222 ; GFX11-NEXT: ds_load_u8 v1, v0
223 ; GFX11-NEXT: ds_load_u8 v2, v0 offset:1
224 ; GFX11-NEXT: ds_load_u8 v3, v0 offset:2
225 ; GFX11-NEXT: ds_load_u8 v4, v0 offset:3
226 ; GFX11-NEXT: ds_load_u8 v5, v0 offset:4
227 ; GFX11-NEXT: ds_load_u8 v6, v0 offset:5
228 ; GFX11-NEXT: ds_load_u8 v7, v0 offset:6
229 ; GFX11-NEXT: ds_load_u8 v8, v0 offset:7
230 ; GFX11-NEXT: ds_load_u8 v9, v0 offset:8
231 ; GFX11-NEXT: ds_load_u8 v10, v0 offset:9
232 ; GFX11-NEXT: ds_load_u8 v11, v0 offset:10
233 ; GFX11-NEXT: ds_load_u8 v0, v0 offset:11
234 ; GFX11-NEXT: s_waitcnt lgkmcnt(10)
235 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1
236 ; GFX11-NEXT: s_waitcnt lgkmcnt(8)
237 ; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3
238 ; GFX11-NEXT: s_waitcnt lgkmcnt(6)
239 ; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5
240 ; GFX11-NEXT: s_waitcnt lgkmcnt(4)
241 ; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7
242 ; GFX11-NEXT: s_waitcnt lgkmcnt(2)
243 ; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9
244 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
245 ; GFX11-NEXT: v_lshl_or_b32 v6, v0, 8, v11
246 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
247 ; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
248 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
249 ; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
250 ; GFX11-NEXT: s_setpc_b64 s[30:31]
251 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
; Loads a <3 x i32> through a ptr addrspace(3) pointer with `align 2`.
; Expected lowering per RUN line: six 16-bit DS reads (ds_read_u16, spelled
; ds_load_u16 for gfx1100) at byte offsets 0, 2, 4, 6, 8 and 10, with each pair
; merged into one result dword by a 16-bit shift and or:
;   - gfx900, gfx1010, gfx1100 use the fused v_lshl_or_b32;
;   - hawaii and tahiti use separate v_lshlrev_b32 + v_or_b32 pairs.
; tahiti computes the non-zero addresses with v_add_i32 instead of using the
; `offset:` field on the DS instruction.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
255 define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
256 ; GFX9-LABEL: load_lds_v3i32_align2:
258 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259 ; GFX9-NEXT: ds_read_u16 v1, v0
260 ; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
261 ; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
262 ; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
263 ; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
264 ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
265 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
266 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
267 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
268 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
269 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
271 ; GFX9-NEXT: s_setpc_b64 s[30:31]
273 ; GFX7-LABEL: load_lds_v3i32_align2:
275 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX7-NEXT: s_mov_b32 m0, -1
277 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
278 ; GFX7-NEXT: ds_read_u16 v1, v0 offset:4
279 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:2
280 ; GFX7-NEXT: ds_read_u16 v4, v0
281 ; GFX7-NEXT: ds_read_u16 v5, v0 offset:6
282 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
283 ; GFX7-NEXT: s_waitcnt lgkmcnt(3)
284 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
285 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
286 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
287 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
288 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
289 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
290 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
292 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
293 ; GFX7-NEXT: s_setpc_b64 s[30:31]
295 ; GFX6-LABEL: load_lds_v3i32_align2:
297 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 6, v0
299 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 4, v0
300 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 10, v0
301 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 8, v0
302 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0
303 ; GFX6-NEXT: s_mov_b32 m0, -1
304 ; GFX6-NEXT: ds_read_u16 v1, v1
305 ; GFX6-NEXT: ds_read_u16 v2, v2
306 ; GFX6-NEXT: ds_read_u16 v3, v3
307 ; GFX6-NEXT: ds_read_u16 v4, v4
308 ; GFX6-NEXT: ds_read_u16 v5, v5
309 ; GFX6-NEXT: ds_read_u16 v0, v0
310 ; GFX6-NEXT: s_waitcnt lgkmcnt(5)
311 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
312 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
313 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
314 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
315 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
316 ; GFX6-NEXT: s_waitcnt lgkmcnt(1)
317 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
318 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
319 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
321 ; GFX6-NEXT: s_setpc_b64 s[30:31]
323 ; GFX10-LABEL: load_lds_v3i32_align2:
325 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326 ; GFX10-NEXT: ds_read_u16 v1, v0
327 ; GFX10-NEXT: ds_read_u16 v2, v0 offset:2
328 ; GFX10-NEXT: ds_read_u16 v3, v0 offset:4
329 ; GFX10-NEXT: ds_read_u16 v4, v0 offset:6
330 ; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
331 ; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
332 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
333 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
334 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
335 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
336 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
338 ; GFX10-NEXT: s_setpc_b64 s[30:31]
340 ; GFX11-LABEL: load_lds_v3i32_align2:
342 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
343 ; GFX11-NEXT: ds_load_u16 v1, v0
344 ; GFX11-NEXT: ds_load_u16 v2, v0 offset:2
345 ; GFX11-NEXT: ds_load_u16 v3, v0 offset:4
346 ; GFX11-NEXT: ds_load_u16 v4, v0 offset:6
347 ; GFX11-NEXT: ds_load_u16 v5, v0 offset:8
348 ; GFX11-NEXT: ds_load_u16 v6, v0 offset:10
349 ; GFX11-NEXT: s_waitcnt lgkmcnt(4)
350 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
351 ; GFX11-NEXT: s_waitcnt lgkmcnt(2)
352 ; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
353 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
354 ; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
355 ; GFX11-NEXT: s_setpc_b64 s[30:31]
356 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 2
; Loads a <3 x i32> through a ptr addrspace(3) pointer with `align 4`.
; Expected lowering per RUN line:
;   - gfx900, hawaii, gfx1010: ds_read2_b32 with offset1:1 into v[0:1], plus
;     ds_read_b32 with offset:8 into v2 (the base pointer is first copied from
;     v0 to v2 because v0 is overwritten by the first read);
;   - gfx1100: the same shape with the ds_load_2addr_b32 / ds_load_b32
;     mnemonics;
;   - tahiti: three separate ds_read_b32, with the +4 and +8 addresses computed
;     by v_add_i32 instead of the `offset:` field.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
360 define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
361 ; GFX9-LABEL: load_lds_v3i32_align4:
363 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
365 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
366 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
367 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
368 ; GFX9-NEXT: s_setpc_b64 s[30:31]
370 ; GFX7-LABEL: load_lds_v3i32_align4:
372 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
374 ; GFX7-NEXT: s_mov_b32 m0, -1
375 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
376 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
377 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
378 ; GFX7-NEXT: s_setpc_b64 s[30:31]
380 ; GFX6-LABEL: load_lds_v3i32_align4:
382 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v0
384 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0
385 ; GFX6-NEXT: s_mov_b32 m0, -1
386 ; GFX6-NEXT: ds_read_b32 v2, v2
387 ; GFX6-NEXT: ds_read_b32 v0, v0
388 ; GFX6-NEXT: ds_read_b32 v1, v1
389 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
390 ; GFX6-NEXT: s_setpc_b64 s[30:31]
392 ; GFX10-LABEL: load_lds_v3i32_align4:
394 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
396 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
397 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
398 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX10-NEXT: s_setpc_b64 s[30:31]
401 ; GFX11-LABEL: load_lds_v3i32_align4:
403 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
404 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
405 ; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
406 ; GFX11-NEXT: ds_load_b32 v2, v2 offset:8
407 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX11-NEXT: s_setpc_b64 s[30:31]
409 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 4
; Loads a <3 x i32> through a ptr addrspace(3) pointer with `align 8`.
; Expected lowering per RUN line: one 64-bit DS read into v[0:1] plus one
; 32-bit DS read of the third element at base + 8 into v2 (ds_read_b64 /
; ds_read_b32, spelled ds_load_b64 / ds_load_b32 for gfx1100). The base pointer
; is copied from v0 to v2 first because v0 is overwritten by the 64-bit read.
; tahiti adds 8 to the copied pointer with v_add_i32 instead of using
; `offset:8` on the DS instruction.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
413 define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
414 ; GFX9-LABEL: load_lds_v3i32_align8:
416 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
418 ; GFX9-NEXT: ds_read_b64 v[0:1], v0
419 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
420 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX9-NEXT: s_setpc_b64 s[30:31]
423 ; GFX7-LABEL: load_lds_v3i32_align8:
425 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
427 ; GFX7-NEXT: s_mov_b32 m0, -1
428 ; GFX7-NEXT: ds_read_b64 v[0:1], v0
429 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
430 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX7-NEXT: s_setpc_b64 s[30:31]
433 ; GFX6-LABEL: load_lds_v3i32_align8:
435 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
437 ; GFX6-NEXT: s_mov_b32 m0, -1
438 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
439 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
440 ; GFX6-NEXT: ds_read_b32 v2, v2
441 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX6-NEXT: s_setpc_b64 s[30:31]
444 ; GFX10-LABEL: load_lds_v3i32_align8:
446 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
448 ; GFX10-NEXT: ds_read_b64 v[0:1], v0
449 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
450 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX10-NEXT: s_setpc_b64 s[30:31]
453 ; GFX11-LABEL: load_lds_v3i32_align8:
455 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
457 ; GFX11-NEXT: ds_load_b64 v[0:1], v0
458 ; GFX11-NEXT: ds_load_b32 v2, v2 offset:8
459 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX11-NEXT: s_setpc_b64 s[30:31]
461 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 8
; Loads a <3 x i32> through a ptr addrspace(3) pointer with `align 16`.
; Expected lowering per RUN line: a single 96-bit DS read into v[0:2]
; (ds_read_b96, spelled ds_load_b96 for gfx1100) on gfx900, hawaii, gfx1010 and
; gfx1100. tahiti instead expects ds_read_b64 into v[0:1] plus ds_read_b32 at
; base + 8 into v2, with the +8 address computed by v_add_i32.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
465 define <3 x i32> @load_lds_v3i32_align16(ptr addrspace(3) %ptr) {
466 ; GFX9-LABEL: load_lds_v3i32_align16:
468 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469 ; GFX9-NEXT: ds_read_b96 v[0:2], v0
470 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
471 ; GFX9-NEXT: s_setpc_b64 s[30:31]
473 ; GFX7-LABEL: load_lds_v3i32_align16:
475 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
476 ; GFX7-NEXT: s_mov_b32 m0, -1
477 ; GFX7-NEXT: ds_read_b96 v[0:2], v0
478 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX7-NEXT: s_setpc_b64 s[30:31]
481 ; GFX6-LABEL: load_lds_v3i32_align16:
483 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
485 ; GFX6-NEXT: s_mov_b32 m0, -1
486 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
487 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
488 ; GFX6-NEXT: ds_read_b32 v2, v2
489 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
490 ; GFX6-NEXT: s_setpc_b64 s[30:31]
492 ; GFX10-LABEL: load_lds_v3i32_align16:
494 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; GFX10-NEXT: ds_read_b96 v[0:2], v0
496 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
497 ; GFX10-NEXT: s_setpc_b64 s[30:31]
499 ; GFX11-LABEL: load_lds_v3i32_align16:
501 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502 ; GFX11-NEXT: ds_load_b96 v[0:2], v0
503 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
504 ; GFX11-NEXT: s_setpc_b64 s[30:31]
505 %load = load <3 x i32>, ptr addrspace(3) %ptr, align 16