1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with no explicit alignment on the load.
; Expected code according to the check lines below:
;   - gfx900, hawaii and gfx1010 issue a single ds_read_b128.
;   - tahiti issues two ds_read_b64; the second address is the base plus 8,
;     computed with v_add_i32.
;   - hawaii and tahiti additionally write -1 to m0 before the reads.
7 define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
8 ; GFX9-LABEL: load_lds_v4i32:
10 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX9-NEXT: ds_read_b128 v[0:3], v0
12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX9-NEXT: s_setpc_b64 s[30:31]
15 ; GFX7-LABEL: load_lds_v4i32:
17 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18 ; GFX7-NEXT: s_mov_b32 m0, -1
19 ; GFX7-NEXT: ds_read_b128 v[0:3], v0
20 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX7-NEXT: s_setpc_b64 s[30:31]
23 ; GFX6-LABEL: load_lds_v4i32:
25 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
27 ; GFX6-NEXT: s_mov_b32 m0, -1
28 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
29 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
30 ; GFX6-NEXT: ds_read_b64 v[2:3], v2
31 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
32 ; GFX6-NEXT: s_setpc_b64 s[30:31]
34 ; GFX10-LABEL: load_lds_v4i32:
36 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
38 ; GFX10-NEXT: ds_read_b128 v[0:3], v0
39 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; No align operand on this load, so it takes the default (ABI) alignment of the
; type. The expected output matches the explicit align 16 variant in this file.
41 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with the load marked align 1.
; Expected code according to the check lines below is a byte-wise load that is
; reassembled into four dwords in v0-v3:
;   - gfx900 and gfx1010 issue sixteen ds_read_u8 at immediate offsets 0-15 and
;     merge the bytes with v_lshl_or_b32 (shift by 8, then by 16).
;   - hawaii issues sixteen ds_read_u8 with immediate offsets and merges the
;     bytes with v_lshlrev_b32 / v_or_b32 pairs.
;   - tahiti issues sixteen ds_read_u8 without immediate offsets; every address
;     other than the base is produced by a separate v_add_i32.
45 define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
46 ; GFX9-LABEL: load_lds_v4i32_align1:
48 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX9-NEXT: ds_read_u8 v1, v0
50 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
51 ; GFX9-NEXT: ds_read_u8 v3, v0 offset:2
52 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:3
53 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:4
54 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:5
55 ; GFX9-NEXT: ds_read_u8 v7, v0 offset:6
56 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:7
57 ; GFX9-NEXT: ds_read_u8 v9, v0 offset:8
58 ; GFX9-NEXT: ds_read_u8 v10, v0 offset:9
59 ; GFX9-NEXT: ds_read_u8 v11, v0 offset:10
60 ; GFX9-NEXT: ds_read_u8 v12, v0 offset:11
61 ; GFX9-NEXT: ds_read_u8 v13, v0 offset:12
62 ; GFX9-NEXT: ds_read_u8 v14, v0 offset:13
63 ; GFX9-NEXT: ds_read_u8 v15, v0 offset:14
64 ; GFX9-NEXT: ds_read_u8 v16, v0 offset:15
65 ; GFX9-NEXT: s_waitcnt lgkmcnt(14)
66 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1
67 ; GFX9-NEXT: s_waitcnt lgkmcnt(12)
68 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3
69 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
70 ; GFX9-NEXT: s_waitcnt lgkmcnt(10)
71 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(8)
73 ; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7
74 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
75 ; GFX9-NEXT: s_waitcnt lgkmcnt(6)
76 ; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9
77 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
78 ; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11
79 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
80 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
81 ; GFX9-NEXT: v_lshl_or_b32 v3, v14, 8, v13
82 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
83 ; GFX9-NEXT: v_lshl_or_b32 v4, v16, 8, v15
84 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
85 ; GFX9-NEXT: s_setpc_b64 s[30:31]
87 ; GFX7-LABEL: load_lds_v4i32_align1:
89 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX7-NEXT: s_mov_b32 m0, -1
91 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:6
92 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:4
93 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
94 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:1
95 ; GFX7-NEXT: ds_read_u8 v5, v0
96 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
97 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
98 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
99 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
100 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
101 ; GFX7-NEXT: s_waitcnt lgkmcnt(3)
102 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
103 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
104 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
105 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
106 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
107 ; GFX7-NEXT: v_or_b32_e32 v4, v3, v4
108 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
109 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v7
110 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
111 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
112 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
113 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:15
114 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:14
115 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:13
116 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:12
117 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:11
118 ; GFX7-NEXT: ds_read_u8 v10, v0 offset:10
119 ; GFX7-NEXT: ds_read_u8 v11, v0 offset:9
120 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
121 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
122 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
123 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
124 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
125 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v11
126 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
128 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9
129 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v10
130 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
131 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5
132 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
133 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v7
134 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v6
135 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v8
136 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
137 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v0
138 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
139 ; GFX7-NEXT: s_setpc_b64 s[30:31]
141 ; GFX6-LABEL: load_lds_v4i32_align1:
143 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 5, v0
145 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 4, v0
146 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 7, v0
147 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 6, v0
148 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 9, v0
149 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0
150 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0
151 ; GFX6-NEXT: s_mov_b32 m0, -1
152 ; GFX6-NEXT: ds_read_u8 v1, v1
153 ; GFX6-NEXT: ds_read_u8 v2, v2
154 ; GFX6-NEXT: ds_read_u8 v3, v3
155 ; GFX6-NEXT: ds_read_u8 v4, v4
156 ; GFX6-NEXT: ds_read_u8 v5, v5
157 ; GFX6-NEXT: ds_read_u8 v6, v6
158 ; GFX6-NEXT: ds_read_u8 v7, v7
159 ; GFX6-NEXT: ds_read_u8 v8, v0
160 ; GFX6-NEXT: s_waitcnt lgkmcnt(7)
161 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
162 ; GFX6-NEXT: s_waitcnt lgkmcnt(6)
163 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
164 ; GFX6-NEXT: s_waitcnt lgkmcnt(5)
165 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
166 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
167 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
168 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
169 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
170 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
171 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
172 ; GFX6-NEXT: s_waitcnt lgkmcnt(2)
173 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
174 ; GFX6-NEXT: s_waitcnt lgkmcnt(1)
175 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7
176 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0
177 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 13, v0
178 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0
179 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 15, v0
180 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 14, v0
181 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, 3, v0
182 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, 2, v0
183 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
184 ; GFX6-NEXT: ds_read_u8 v4, v4
185 ; GFX6-NEXT: ds_read_u8 v5, v5
186 ; GFX6-NEXT: ds_read_u8 v6, v6
187 ; GFX6-NEXT: ds_read_u8 v7, v7
188 ; GFX6-NEXT: ds_read_u8 v9, v9
189 ; GFX6-NEXT: ds_read_u8 v10, v10
190 ; GFX6-NEXT: ds_read_u8 v11, v11
191 ; GFX6-NEXT: ds_read_u8 v0, v0
192 ; GFX6-NEXT: s_waitcnt lgkmcnt(7)
193 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
194 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
195 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
196 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 8, v7
197 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
198 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v5
199 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
200 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v9
201 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6
202 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
203 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
204 ; GFX6-NEXT: s_waitcnt lgkmcnt(2)
205 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 8, v10
206 ; GFX6-NEXT: s_waitcnt lgkmcnt(1)
207 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v11
208 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
210 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
211 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8
212 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
213 ; GFX6-NEXT: s_setpc_b64 s[30:31]
215 ; GFX10-LABEL: load_lds_v4i32_align1:
217 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
219 ; GFX10-NEXT: ds_read_u8 v1, v0
220 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
221 ; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
222 ; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
223 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
224 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
225 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
226 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
227 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
228 ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
229 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
230 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:11
231 ; GFX10-NEXT: ds_read_u8 v13, v0 offset:12
232 ; GFX10-NEXT: ds_read_u8 v14, v0 offset:13
233 ; GFX10-NEXT: ds_read_u8 v15, v0 offset:14
234 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
235 ; GFX10-NEXT: s_waitcnt lgkmcnt(14)
236 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
237 ; GFX10-NEXT: s_waitcnt lgkmcnt(12)
238 ; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3
239 ; GFX10-NEXT: s_waitcnt lgkmcnt(10)
240 ; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5
241 ; GFX10-NEXT: s_waitcnt lgkmcnt(8)
242 ; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7
243 ; GFX10-NEXT: s_waitcnt lgkmcnt(6)
244 ; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9
245 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
246 ; GFX10-NEXT: v_lshl_or_b32 v6, v12, 8, v11
247 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
248 ; GFX10-NEXT: v_lshl_or_b32 v7, v14, 8, v13
249 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v15
251 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
252 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
253 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
254 ; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7
255 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Byte alignment is the only guarantee given to the backend for this load.
256 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with the load marked align 2.
; Expected code according to the check lines below is a halfword-wise load that
; is reassembled into four dwords in v0-v3:
;   - gfx900 and gfx1010 issue eight ds_read_u16 at immediate offsets 0-14 and
;     merge each pair with v_lshl_or_b32 (shift by 16).
;   - hawaii issues eight ds_read_u16 with immediate offsets and merges each
;     pair with v_lshlrev_b32 / v_or_b32.
;   - tahiti issues eight ds_read_u16 without immediate offsets; every address
;     other than the base is produced by a separate v_add_i32.
260 define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
261 ; GFX9-LABEL: load_lds_v4i32_align2:
263 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264 ; GFX9-NEXT: ds_read_u16 v1, v0
265 ; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
266 ; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
267 ; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
268 ; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
269 ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
270 ; GFX9-NEXT: ds_read_u16 v7, v0 offset:12
271 ; GFX9-NEXT: ds_read_u16 v8, v0 offset:14
272 ; GFX9-NEXT: s_waitcnt lgkmcnt(6)
273 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
274 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
275 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
276 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
277 ; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
278 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
279 ; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v7
280 ; GFX9-NEXT: s_setpc_b64 s[30:31]
282 ; GFX7-LABEL: load_lds_v4i32_align2:
284 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285 ; GFX7-NEXT: s_mov_b32 m0, -1
286 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:12
287 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
288 ; GFX7-NEXT: ds_read_u16 v1, v0 offset:4
289 ; GFX7-NEXT: ds_read_u16 v4, v0 offset:2
290 ; GFX7-NEXT: ds_read_u16 v5, v0
291 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:6
292 ; GFX7-NEXT: ds_read_u16 v7, v0 offset:10
293 ; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
294 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
295 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
296 ; GFX7-NEXT: s_waitcnt lgkmcnt(3)
297 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
298 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
299 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
300 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1
301 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
302 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
303 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
304 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
306 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
307 ; GFX7-NEXT: s_setpc_b64 s[30:31]
309 ; GFX6-LABEL: load_lds_v4i32_align2:
311 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 6, v0
313 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 4, v0
314 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 10, v0
315 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 8, v0
316 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 14, v0
317 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0
318 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 2, v0
319 ; GFX6-NEXT: s_mov_b32 m0, -1
320 ; GFX6-NEXT: ds_read_u16 v1, v1
321 ; GFX6-NEXT: ds_read_u16 v2, v2
322 ; GFX6-NEXT: ds_read_u16 v3, v3
323 ; GFX6-NEXT: ds_read_u16 v4, v4
324 ; GFX6-NEXT: ds_read_u16 v5, v5
325 ; GFX6-NEXT: ds_read_u16 v6, v6
326 ; GFX6-NEXT: ds_read_u16 v7, v7
327 ; GFX6-NEXT: ds_read_u16 v0, v0
328 ; GFX6-NEXT: s_waitcnt lgkmcnt(7)
329 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
330 ; GFX6-NEXT: s_waitcnt lgkmcnt(6)
331 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
332 ; GFX6-NEXT: s_waitcnt lgkmcnt(5)
333 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
334 ; GFX6-NEXT: s_waitcnt lgkmcnt(4)
335 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
336 ; GFX6-NEXT: s_waitcnt lgkmcnt(3)
337 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
338 ; GFX6-NEXT: s_waitcnt lgkmcnt(1)
339 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
340 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6
341 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
343 ; GFX6-NEXT: s_setpc_b64 s[30:31]
345 ; GFX10-LABEL: load_lds_v4i32_align2:
347 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
349 ; GFX10-NEXT: ds_read_u16 v1, v0
350 ; GFX10-NEXT: ds_read_u16 v2, v0 offset:2
351 ; GFX10-NEXT: ds_read_u16 v3, v0 offset:4
352 ; GFX10-NEXT: ds_read_u16 v4, v0 offset:6
353 ; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
354 ; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
355 ; GFX10-NEXT: ds_read_u16 v7, v0 offset:12
356 ; GFX10-NEXT: ds_read_u16 v8, v0 offset:14
357 ; GFX10-NEXT: s_waitcnt lgkmcnt(6)
358 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
359 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
360 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
361 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
362 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
363 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7
365 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Two-byte alignment is the only guarantee given to the backend for this load.
366 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with the load marked align 4.
; Expected code according to the check lines below:
;   - gfx900, hawaii and gfx1010 issue two ds_read2_b32 (offset pairs 0/1 and
;     2/3); the base address is first copied to v2 for the second read.
;   - tahiti issues four ds_read_b32, with the addresses base+4, base+8 and
;     base+12 produced by v_add_i32.
370 define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
371 ; GFX9-LABEL: load_lds_v4i32_align4:
373 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
374 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
375 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
376 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
377 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
378 ; GFX9-NEXT: s_setpc_b64 s[30:31]
380 ; GFX7-LABEL: load_lds_v4i32_align4:
382 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
384 ; GFX7-NEXT: s_mov_b32 m0, -1
385 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
386 ; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
387 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX7-NEXT: s_setpc_b64 s[30:31]
390 ; GFX6-LABEL: load_lds_v4i32_align4:
392 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
393 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v0
394 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0
395 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 12, v0
396 ; GFX6-NEXT: s_mov_b32 m0, -1
397 ; GFX6-NEXT: ds_read_b32 v2, v2
398 ; GFX6-NEXT: ds_read_b32 v3, v3
399 ; GFX6-NEXT: ds_read_b32 v0, v0
400 ; GFX6-NEXT: ds_read_b32 v1, v1
401 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX6-NEXT: s_setpc_b64 s[30:31]
404 ; GFX10-LABEL: load_lds_v4i32_align4:
406 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
408 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
409 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
410 ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
411 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
412 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Dword alignment is the only guarantee given to the backend for this load.
413 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with the load marked align 8.
; Expected code according to the check lines below:
;   - gfx900, hawaii and gfx1010 issue a single ds_read2_b64 with offset1 set
;     to 1.
;   - tahiti issues two ds_read_b64; the second address is the base plus 8,
;     computed with v_add_i32.
417 define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
418 ; GFX9-LABEL: load_lds_v4i32_align8:
420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
425 ; GFX7-LABEL: load_lds_v4i32_align8:
427 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX7-NEXT: s_mov_b32 m0, -1
429 ; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
430 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX7-NEXT: s_setpc_b64 s[30:31]
433 ; GFX6-LABEL: load_lds_v4i32_align8:
435 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
437 ; GFX6-NEXT: s_mov_b32 m0, -1
438 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
439 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
440 ; GFX6-NEXT: ds_read_b64 v[2:3], v2
441 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX6-NEXT: s_setpc_b64 s[30:31]
444 ; GFX10-LABEL: load_lds_v4i32_align8:
446 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
448 ; GFX10-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
449 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
450 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Eight-byte alignment is the only guarantee given to the backend for this load.
451 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, going by the function
; name) with the load marked align 16.
; Expected code according to the check lines below:
;   - gfx900, hawaii and gfx1010 issue a single ds_read_b128.
;   - tahiti issues two ds_read_b64; the second address is the base plus 8,
;     computed with v_add_i32.
;   - hawaii and tahiti additionally write -1 to m0 before the reads.
455 define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
456 ; GFX9-LABEL: load_lds_v4i32_align16:
458 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
459 ; GFX9-NEXT: ds_read_b128 v[0:3], v0
460 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
461 ; GFX9-NEXT: s_setpc_b64 s[30:31]
463 ; GFX7-LABEL: load_lds_v4i32_align16:
465 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466 ; GFX7-NEXT: s_mov_b32 m0, -1
467 ; GFX7-NEXT: ds_read_b128 v[0:3], v0
468 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX7-NEXT: s_setpc_b64 s[30:31]
471 ; GFX6-LABEL: load_lds_v4i32_align16:
473 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
475 ; GFX6-NEXT: s_mov_b32 m0, -1
476 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v2
477 ; GFX6-NEXT: ds_read_b64 v[0:1], v0
478 ; GFX6-NEXT: ds_read_b64 v[2:3], v2
479 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX6-NEXT: s_setpc_b64 s[30:31]
482 ; GFX10-LABEL: load_lds_v4i32_align16:
484 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
486 ; GFX10-NEXT: ds_read_b128 v[0:3], v0
487 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX10-NEXT: s_setpc_b64 s[30:31]
489 %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16