1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; Load of <3 x i32> through an addrspace(3) pointer with no explicit
; alignment on the load instruction.
; Every enabled target is expected to select a single 96-bit ds_read_b96,
; the same read recorded for the explicit "align 16" variant in this file.
; The hawaii output additionally writes -1 to m0 ahead of the read, and the
; gfx1010 output has an extra s_waitcnt_vscnt at function entry.
9 define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
10 ; GFX9-LABEL: load_lds_v3i32:
12 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; GFX9-NEXT: ds_read_b96 v[0:2], v0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17 ; GFX7-LABEL: load_lds_v3i32:
19 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20 ; GFX7-NEXT: s_mov_b32 m0, -1
21 ; GFX7-NEXT: ds_read_b96 v[0:2], v0
22 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25 ; GFX10-LABEL: load_lds_v3i32:
27 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
29 ; GFX10-NEXT: ds_read_b96 v[0:2], v0
30 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
; Load of <3 x i32> through an addrspace(3) pointer marked "align 1".
; The expected output splits the access into twelve ds_read_u8 loads
; (offsets 0 through 11) and rebuilds the three result dwords with shifts
; and ors. The gfx900 and gfx1010 sequences use v_lshl_or_b32 / v_or3_b32;
; the hawaii sequence uses separate v_lshlrev_b32 and v_or_b32 instructions.
; NOTE(review): the hawaii checks list "s_mov_b32 m0, -1" after the first
; ds_read_u8 instructions, whereas e.g. @load_lds_v3i32 lists it before the
; DS read. Confirm this ordering against fresh llc output.
36 define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
37 ; GFX9-LABEL: load_lds_v3i32_align1:
39 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; GFX9-NEXT: ds_read_u8 v1, v0
41 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
42 ; GFX9-NEXT: ds_read_u8 v3, v0 offset:2
43 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:3
44 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:4
45 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:5
46 ; GFX9-NEXT: ds_read_u8 v7, v0 offset:6
47 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:7
48 ; GFX9-NEXT: s_waitcnt lgkmcnt(6)
49 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1
50 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
51 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v4
52 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
53 ; GFX9-NEXT: v_or3_b32 v3, v2, v3, v1
54 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
55 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
56 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:8
57 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:9
58 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:10
59 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:11
60 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
61 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v8
62 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
63 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
64 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 8, v2
65 ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
66 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
67 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
69 ; GFX9-NEXT: v_or3_b32 v1, v6, v7, v1
70 ; GFX9-NEXT: v_or3_b32 v2, v0, v4, v2
71 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
72 ; GFX9-NEXT: s_setpc_b64 s[30:31]
74 ; GFX7-LABEL: load_lds_v3i32_align1:
76 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
78 ; GFX7-NEXT: ds_read_u8 v2, v0
79 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
80 ; GFX7-NEXT: s_mov_b32 m0, -1
81 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
82 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
83 ; GFX7-NEXT: s_waitcnt lgkmcnt(1)
84 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
85 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
86 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
87 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:5
88 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:6
89 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:7
90 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:8
91 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:9
92 ; GFX7-NEXT: ds_read_u8 v10, v0 offset:10
93 ; GFX7-NEXT: s_waitcnt lgkmcnt(7)
94 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
95 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
96 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
97 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
98 ; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
99 ; GFX7-NEXT: s_waitcnt lgkmcnt(6)
100 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v5
101 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
102 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
103 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7
104 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
105 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
106 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
107 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
108 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9
109 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
110 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
111 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v10
112 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
113 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
114 ; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
115 ; GFX7-NEXT: v_mov_b32_e32 v0, v3
116 ; GFX7-NEXT: s_setpc_b64 s[30:31]
118 ; GFX10-LABEL: load_lds_v3i32_align1:
120 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
122 ; GFX10-NEXT: ds_read_u8 v1, v0
123 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1
124 ; GFX10-NEXT: ds_read_u8 v3, v0 offset:2
125 ; GFX10-NEXT: ds_read_u8 v4, v0 offset:3
126 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4
127 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
128 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
129 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
130 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
131 ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
132 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:11
133 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:10
134 ; GFX10-NEXT: s_waitcnt lgkmcnt(10)
135 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
136 ; GFX10-NEXT: s_waitcnt lgkmcnt(9)
137 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
138 ; GFX10-NEXT: s_waitcnt lgkmcnt(8)
139 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4
140 ; GFX10-NEXT: s_waitcnt lgkmcnt(6)
141 ; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(5)
143 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
144 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
145 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v8
146 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
147 ; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9
148 ; GFX10-NEXT: s_waitcnt lgkmcnt(1)
149 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11
150 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
152 ; GFX10-NEXT: v_or3_b32 v0, v2, v3, v1
153 ; GFX10-NEXT: v_or3_b32 v1, v5, v6, v4
154 ; GFX10-NEXT: v_or3_b32 v2, v8, v9, v7
155 ; GFX10-NEXT: s_setpc_b64 s[30:31]
156 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
; Load of <3 x i32> through an addrspace(3) pointer marked "align 2".
; The expected output uses six ds_read_u16 loads (offsets 0, 2, 4, 6, 8, 10)
; and merges each pair into one dword by shifting the upper half left by 16
; and or-ing in the lower half. The gfx900 and gfx1010 sequences do this with
; v_lshl_or_b32; the hawaii sequence uses v_lshlrev_b32 followed by v_or_b32.
; NOTE(review): the hawaii checks list "s_mov_b32 m0, -1" after the first
; ds_read_u16, whereas e.g. @load_lds_v3i32 lists it before the DS read.
; Confirm this ordering against fresh llc output.
160 define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
161 ; GFX9-LABEL: load_lds_v3i32_align2:
163 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164 ; GFX9-NEXT: ds_read_u16 v1, v0
165 ; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
166 ; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
167 ; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
168 ; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
169 ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
170 ; GFX9-NEXT: s_waitcnt lgkmcnt(4)
171 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
172 ; GFX9-NEXT: s_waitcnt lgkmcnt(2)
173 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
174 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
175 ; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
176 ; GFX9-NEXT: s_setpc_b64 s[30:31]
178 ; GFX7-LABEL: load_lds_v3i32_align2:
180 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181 ; GFX7-NEXT: ds_read_u16 v1, v0
182 ; GFX7-NEXT: s_mov_b32 m0, -1
183 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
184 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
185 ; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
186 ; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
187 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
188 ; GFX7-NEXT: s_waitcnt lgkmcnt(4)
189 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
190 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
191 ; GFX7-NEXT: s_waitcnt lgkmcnt(2)
192 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
193 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
195 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
196 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
197 ; GFX7-NEXT: s_setpc_b64 s[30:31]
199 ; GFX10-LABEL: load_lds_v3i32_align2:
201 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
203 ; GFX10-NEXT: ds_read_u16 v1, v0
204 ; GFX10-NEXT: ds_read_u16 v2, v0 offset:2
205 ; GFX10-NEXT: ds_read_u16 v3, v0 offset:4
206 ; GFX10-NEXT: ds_read_u16 v4, v0 offset:6
207 ; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
208 ; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
209 ; GFX10-NEXT: s_waitcnt lgkmcnt(4)
210 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
211 ; GFX10-NEXT: s_waitcnt lgkmcnt(2)
212 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
215 ; GFX10-NEXT: s_setpc_b64 s[30:31]
216 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
; Load of <3 x i32> through an addrspace(3) pointer marked "align 4".
; The expected output on every enabled target is two DS reads: a
; ds_read2_b32 (offset1:1) into v[0:1] and a ds_read_b32 at offset 8 into v2.
; The address in v0 is first copied to v2 because the ds_read2_b32 result
; overwrites v0; the second read then uses v2 as its address.
220 define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
221 ; GFX9-LABEL: load_lds_v3i32_align4:
223 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
225 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
226 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
227 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
228 ; GFX9-NEXT: s_setpc_b64 s[30:31]
230 ; GFX7-LABEL: load_lds_v3i32_align4:
232 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
234 ; GFX7-NEXT: s_mov_b32 m0, -1
235 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
236 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
237 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
238 ; GFX7-NEXT: s_setpc_b64 s[30:31]
240 ; GFX10-LABEL: load_lds_v3i32_align4:
242 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
244 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
245 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
246 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
247 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX10-NEXT: s_setpc_b64 s[30:31]
249 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
; Load of <3 x i32> through an addrspace(3) pointer marked "align 8".
; The recorded output matches the "align 4" case in this file: the address is
; copied to v2, then a ds_read2_b32 (offset1:1) fills v[0:1] and a ds_read_b32
; at offset 8 fills v2. No single 96-bit read is expected at this alignment.
253 define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
254 ; GFX9-LABEL: load_lds_v3i32_align8:
256 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
258 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
259 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX9-NEXT: s_setpc_b64 s[30:31]
263 ; GFX7-LABEL: load_lds_v3i32_align8:
265 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
267 ; GFX7-NEXT: s_mov_b32 m0, -1
268 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
269 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
270 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX7-NEXT: s_setpc_b64 s[30:31]
273 ; GFX10-LABEL: load_lds_v3i32_align8:
275 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
277 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
278 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
279 ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
280 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX10-NEXT: s_setpc_b64 s[30:31]
282 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
; Load of <3 x i32> through an addrspace(3) pointer marked "align 16".
; Every enabled target is expected to select a single 96-bit ds_read_b96
; into v[0:2]. The hawaii output writes -1 to m0 before the read, and the
; gfx1010 output has an extra s_waitcnt_vscnt at function entry.
286 define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
287 ; GFX9-LABEL: load_lds_v3i32_align16:
289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290 ; GFX9-NEXT: ds_read_b96 v[0:2], v0
291 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX9-NEXT: s_setpc_b64 s[30:31]
294 ; GFX7-LABEL: load_lds_v3i32_align16:
296 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297 ; GFX7-NEXT: s_mov_b32 m0, -1
298 ; GFX7-NEXT: ds_read_b96 v[0:2], v0
299 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
300 ; GFX7-NEXT: s_setpc_b64 s[30:31]
302 ; GFX10-LABEL: load_lds_v3i32_align16:
304 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
306 ; GFX10-NEXT: ds_read_b96 v[0:2], v0
307 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX10-NEXT: s_setpc_b64 s[30:31]
309 %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16