1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
3 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
4 ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
5 ;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11
; Three v4f32 raw buffer loads from the same resource with zero offsets and a
; final i32 operand of 0, 1 and 2. The expected assembly shows a plain load, a
; load with the glc modifier and a load with the slc modifier, in that order;
; the gfx1010 and gfx1100 runs additionally expect a leading s_clause 0x2.
7 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
8 ; PREGFX10-LABEL: buffer_load:
9 ; PREGFX10: ; %bb.0: ; %main_body
10 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
11 ; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
12 ; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
13 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
14 ; PREGFX10-NEXT: ; return to shader part epilog
16 ; GFX10-LABEL: buffer_load:
17 ; GFX10: ; %bb.0: ; %main_body
18 ; GFX10-NEXT: s_clause 0x2
19 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
20 ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
21 ; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
22 ; GFX10-NEXT: s_waitcnt vmcnt(0)
23 ; GFX10-NEXT: ; return to shader part epilog
25 ; GFX11-LABEL: buffer_load:
26 ; GFX11: ; %bb.0: ; %main_body
27 ; GFX11-NEXT: s_clause 0x2
28 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
29 ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc
30 ; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 slc
31 ; GFX11-NEXT: s_waitcnt vmcnt(0)
32 ; GFX11-NEXT: ; return to shader part epilog
34 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
35 %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
36 %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
37 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
38 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
39 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
40 ret {<4 x float>, <4 x float>, <4 x float>} %r2
; Same shape as @buffer_load, but the final i32 operand of the three loads is
; 4, 5 and 6. The gfx1010 and gfx1100 expectations append the dlc modifier to
; every load (alone, with glc, with slc); the verde/tonga expectations show no
; dlc modifier at all.
43 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) {
44 ; PREGFX10-LABEL: buffer_load_dlc:
45 ; PREGFX10: ; %bb.0: ; %main_body
46 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
47 ; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
48 ; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
49 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
50 ; PREGFX10-NEXT: ; return to shader part epilog
52 ; GFX10-LABEL: buffer_load_dlc:
53 ; GFX10: ; %bb.0: ; %main_body
54 ; GFX10-NEXT: s_clause 0x2
55 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 dlc
56 ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
57 ; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc dlc
58 ; GFX10-NEXT: s_waitcnt vmcnt(0)
59 ; GFX10-NEXT: ; return to shader part epilog
61 ; GFX11-LABEL: buffer_load_dlc:
62 ; GFX11: ; %bb.0: ; %main_body
63 ; GFX11-NEXT: s_clause 0x2
64 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 dlc
65 ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
66 ; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 slc dlc
67 ; GFX11-NEXT: s_waitcnt vmcnt(0)
68 ; GFX11-NEXT: ; return to shader part epilog
70 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4)
71 %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 5)
72 %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 6)
73 %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
74 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
75 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
76 ret {<4 x float>, <4 x float>, <4 x float>} %r2
; A v4f32 load whose first offset operand is the constant 40. All three
; expectations encode it as an immediate (offset:40) with no address VGPR
; ("off").
79 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
80 ; PREGFX10-LABEL: buffer_load_immoffs:
81 ; PREGFX10: ; %bb.0: ; %main_body
82 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
83 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
84 ; PREGFX10-NEXT: ; return to shader part epilog
86 ; GFX10-LABEL: buffer_load_immoffs:
87 ; GFX10: ; %bb.0: ; %main_body
88 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
89 ; GFX10-NEXT: s_waitcnt vmcnt(0)
90 ; GFX10-NEXT: ; return to shader part epilog
92 ; GFX11-LABEL: buffer_load_immoffs:
93 ; GFX11: ; %bb.0: ; %main_body
94 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:40
95 ; GFX11-NEXT: s_waitcnt vmcnt(0)
96 ; GFX11-NEXT: ; return to shader part epilog
98 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
; A v4f32 load with constant offset operands 4 and 8188. The expectations keep
; the 4 as the immediate (offset:4) and materialise 8188 (0x1ffc) into s4 with
; s_movk_i32, using s4 as the scalar offset of the load.
102 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
103 ; PREGFX10-LABEL: buffer_load_immoffs_large:
104 ; PREGFX10: ; %bb.0: ; %main_body
105 ; PREGFX10-NEXT: s_movk_i32 s4, 0x1ffc
106 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s4 offset:4
107 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
108 ; PREGFX10-NEXT: ; return to shader part epilog
110 ; GFX10-LABEL: buffer_load_immoffs_large:
111 ; GFX10: ; %bb.0: ; %main_body
112 ; GFX10-NEXT: s_movk_i32 s4, 0x1ffc
113 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s4 offset:4
114 ; GFX10-NEXT: s_waitcnt vmcnt(0)
115 ; GFX10-NEXT: ; return to shader part epilog
117 ; GFX11-LABEL: buffer_load_immoffs_large:
118 ; GFX11: ; %bb.0: ; %main_body
119 ; GFX11-NEXT: s_movk_i32 s4, 0x1ffc
120 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], s4 offset:4
121 ; GFX11-NEXT: s_waitcnt vmcnt(0)
122 ; GFX11-NEXT: ; return to shader part epilog
124 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
125 ret <4 x float> %data
; A v4f32 load whose first offset operand is the non-constant i32 argument %1.
; The expectations pass it in v0 with the offen modifier and no immediate
; offset.
128 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
129 ; PREGFX10-LABEL: buffer_load_ofs:
130 ; PREGFX10: ; %bb.0: ; %main_body
131 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
132 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
133 ; PREGFX10-NEXT: ; return to shader part epilog
135 ; GFX10-LABEL: buffer_load_ofs:
136 ; GFX10: ; %bb.0: ; %main_body
137 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
138 ; GFX10-NEXT: s_waitcnt vmcnt(0)
139 ; GFX10-NEXT: ; return to shader part epilog
141 ; GFX11-LABEL: buffer_load_ofs:
142 ; GFX11: ; %bb.0: ; %main_body
143 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen
144 ; GFX11-NEXT: s_waitcnt vmcnt(0)
145 ; GFX11-NEXT: ; return to shader part epilog
147 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
148 ret <4 x float> %data
; The offset is the i32 argument %1 plus the constant 60. The expectations show
; the add folded into the load: v0 is used with offen and the 60 becomes the
; immediate (offset:60); no separate add instruction is expected.
151 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
152 ; PREGFX10-LABEL: buffer_load_ofs_imm:
153 ; PREGFX10: ; %bb.0: ; %main_body
154 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
155 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
156 ; PREGFX10-NEXT: ; return to shader part epilog
158 ; GFX10-LABEL: buffer_load_ofs_imm:
159 ; GFX10: ; %bb.0: ; %main_body
160 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
161 ; GFX10-NEXT: s_waitcnt vmcnt(0)
162 ; GFX10-NEXT: ; return to shader part epilog
164 ; GFX11-LABEL: buffer_load_ofs_imm:
165 ; GFX11: ; %bb.0: ; %main_body
166 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
167 ; GFX11-NEXT: s_waitcnt vmcnt(0)
168 ; GFX11-NEXT: ; return to shader part epilog
170 %ofs = add i32 %1, 60
171 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
172 ret <4 x float> %data
; Constant offset 4092. The expectations encode the whole value as the
; immediate (offset:4092) and use no address VGPR ("off").
175 define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
176 ; PREGFX10-LABEL: buffer_load_voffset_large_12bit:
177 ; PREGFX10: ; %bb.0: ; %main_body
178 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4092
179 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
180 ; PREGFX10-NEXT: ; return to shader part epilog
182 ; GFX10-LABEL: buffer_load_voffset_large_12bit:
183 ; GFX10: ; %bb.0: ; %main_body
184 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4092
185 ; GFX10-NEXT: s_waitcnt vmcnt(0)
186 ; GFX10-NEXT: ; return to shader part epilog
188 ; GFX11-LABEL: buffer_load_voffset_large_12bit:
189 ; GFX11: ; %bb.0: ; %main_body
190 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4092
191 ; GFX11-NEXT: s_waitcnt vmcnt(0)
192 ; GFX11-NEXT: ; return to shader part epilog
194 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 0)
195 ret <4 x float> %data
; Constant offset 8188 (0x1000 + 4092). The expectations split it: 0x1000 is
; moved into v0 and used with offen, and the remaining 4092 is the immediate
; (offset:4092).
198 define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
199 ; PREGFX10-LABEL: buffer_load_voffset_large_13bit:
200 ; PREGFX10: ; %bb.0: ; %main_body
201 ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x1000
202 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
203 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
204 ; PREGFX10-NEXT: ; return to shader part epilog
206 ; GFX10-LABEL: buffer_load_voffset_large_13bit:
207 ; GFX10: ; %bb.0: ; %main_body
208 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1000
209 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
210 ; GFX10-NEXT: s_waitcnt vmcnt(0)
211 ; GFX10-NEXT: ; return to shader part epilog
213 ; GFX11-LABEL: buffer_load_voffset_large_13bit:
214 ; GFX11: ; %bb.0: ; %main_body
215 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
216 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
217 ; GFX11-NEXT: s_waitcnt vmcnt(0)
218 ; GFX11-NEXT: ; return to shader part epilog
220 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 0)
221 ret <4 x float> %data
; Constant offset 65532 (0xf000 + 4092). The expectations move 0xf000 into v0
; for offen addressing and keep 4092 as the immediate (offset:4092).
224 define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
225 ; PREGFX10-LABEL: buffer_load_voffset_large_16bit:
226 ; PREGFX10: ; %bb.0: ; %main_body
227 ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xf000
228 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
229 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
230 ; PREGFX10-NEXT: ; return to shader part epilog
232 ; GFX10-LABEL: buffer_load_voffset_large_16bit:
233 ; GFX10: ; %bb.0: ; %main_body
234 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xf000
235 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
236 ; GFX10-NEXT: s_waitcnt vmcnt(0)
237 ; GFX10-NEXT: ; return to shader part epilog
239 ; GFX11-LABEL: buffer_load_voffset_large_16bit:
240 ; GFX11: ; %bb.0: ; %main_body
241 ; GFX11-NEXT: v_mov_b32_e32 v0, 0xf000
242 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
243 ; GFX11-NEXT: s_waitcnt vmcnt(0)
244 ; GFX11-NEXT: ; return to shader part epilog
246 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 0)
247 ret <4 x float> %data
; Constant offset 8388604 (0x7ff000 + 4092). The expectations move 0x7ff000
; into v0 for offen addressing and keep 4092 as the immediate (offset:4092).
250 define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
251 ; PREGFX10-LABEL: buffer_load_voffset_large_23bit:
252 ; PREGFX10: ; %bb.0: ; %main_body
253 ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0x7ff000
254 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
255 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
256 ; PREGFX10-NEXT: ; return to shader part epilog
258 ; GFX10-LABEL: buffer_load_voffset_large_23bit:
259 ; GFX10: ; %bb.0: ; %main_body
260 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7ff000
261 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
262 ; GFX10-NEXT: s_waitcnt vmcnt(0)
263 ; GFX10-NEXT: ; return to shader part epilog
265 ; GFX11-LABEL: buffer_load_voffset_large_23bit:
266 ; GFX11: ; %bb.0: ; %main_body
267 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7ff000
268 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
269 ; GFX11-NEXT: s_waitcnt vmcnt(0)
270 ; GFX11-NEXT: ; return to shader part epilog
272 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 0)
273 ret <4 x float> %data
; Constant offset 16777212 (0xfff000 + 4092). The expectations move 0xfff000
; into v0 for offen addressing and keep 4092 as the immediate (offset:4092).
276 define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
277 ; PREGFX10-LABEL: buffer_load_voffset_large_24bit:
278 ; PREGFX10: ; %bb.0: ; %main_body
279 ; PREGFX10-NEXT: v_mov_b32_e32 v0, 0xfff000
280 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
281 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
282 ; PREGFX10-NEXT: ; return to shader part epilog
284 ; GFX10-LABEL: buffer_load_voffset_large_24bit:
285 ; GFX10: ; %bb.0: ; %main_body
286 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xfff000
287 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
288 ; GFX10-NEXT: s_waitcnt vmcnt(0)
289 ; GFX10-NEXT: ; return to shader part epilog
291 ; GFX11-LABEL: buffer_load_voffset_large_24bit:
292 ; GFX11: ; %bb.0: ; %main_body
293 ; GFX11-NEXT: v_mov_b32_e32 v0, 0xfff000
294 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
295 ; GFX11-NEXT: s_waitcnt vmcnt(0)
296 ; GFX11-NEXT: ; return to shader part epilog
298 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 0)
299 ret <4 x float> %data
; Scalar float load addressed by the argument %ofs. The expectations select the
; single-dword form (buffer_load_dword, or buffer_load_b32 on gfx1100) with
; offen.
303 define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
304 ; PREGFX10-LABEL: buffer_load_x1:
305 ; PREGFX10: ; %bb.0: ; %main_body
306 ; PREGFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
307 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
308 ; PREGFX10-NEXT: ; return to shader part epilog
310 ; GFX10-LABEL: buffer_load_x1:
311 ; GFX10: ; %bb.0: ; %main_body
312 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
313 ; GFX10-NEXT: s_waitcnt vmcnt(0)
314 ; GFX10-NEXT: ; return to shader part epilog
316 ; GFX11-LABEL: buffer_load_x1:
317 ; GFX11: ; %bb.0: ; %main_body
318 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen
319 ; GFX11-NEXT: s_waitcnt vmcnt(0)
320 ; GFX11-NEXT: ; return to shader part epilog
322 %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
; <2 x float> load addressed by the argument %ofs. The expectations select the
; two-dword form (buffer_load_dwordx2, or buffer_load_b64 on gfx1100) with
; offen.
326 define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
327 ; PREGFX10-LABEL: buffer_load_x2:
328 ; PREGFX10: ; %bb.0: ; %main_body
329 ; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
330 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
331 ; PREGFX10-NEXT: ; return to shader part epilog
333 ; GFX10-LABEL: buffer_load_x2:
334 ; GFX10: ; %bb.0: ; %main_body
335 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
336 ; GFX10-NEXT: s_waitcnt vmcnt(0)
337 ; GFX10-NEXT: ; return to shader part epilog
339 ; GFX11-LABEL: buffer_load_x2:
340 ; GFX11: ; %bb.0: ; %main_body
341 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
342 ; GFX11-NEXT: s_waitcnt vmcnt(0)
343 ; GFX11-NEXT: ; return to shader part epilog
345 %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
346 ret <2 x float> %data
; The offset is %ofs plus -16. Unlike @buffer_load_ofs_imm, the expectations
; keep the add as a separate v_add_nc_u32 and the load carries no immediate
; offset. Only gfx1010 and gfx1100 expectations are visible for this function.
349 define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
350 ; GFX10-LABEL: buffer_load_negative_offset:
351 ; GFX10: ; %bb.0: ; %main_body
352 ; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0
353 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
354 ; GFX10-NEXT: s_waitcnt vmcnt(0)
355 ; GFX10-NEXT: ; return to shader part epilog
357 ; GFX11-LABEL: buffer_load_negative_offset:
358 ; GFX11: ; %bb.0: ; %main_body
359 ; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0
360 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen
361 ; GFX11-NEXT: s_waitcnt vmcnt(0)
362 ; GFX11-NEXT: ; return to shader part epilog
364 %ofs.1 = add i32 %ofs, -16
365 %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0)
366 ret <4 x float> %data
; A buffer load placed between two stores of 0.0 to addrspace(3) memory (%lds
; and %lds + 4 floats). The expectations show both stores combined into a
; single two-address DS write (offset1:4) emitted after the buffer load, i.e.
; the load does not keep the two LDS stores apart. Only gfx1010 and gfx1100
; expectations are visible for this function.
369 define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
370 ; GFX10-LABEL: buffer_load_mmo:
371 ; GFX10: ; %bb.0: ; %entry
372 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
373 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
374 ; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4
375 ; GFX10-NEXT: s_waitcnt vmcnt(0)
376 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
377 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
378 ; GFX10-NEXT: ; return to shader part epilog
380 ; GFX11-LABEL: buffer_load_mmo:
381 ; GFX11: ; %bb.0: ; %entry
382 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
383 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
384 ; GFX11-NEXT: ds_store_2addr_b32 v0, v2, v2 offset1:4
385 ; GFX11-NEXT: s_waitcnt vmcnt(0)
386 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
387 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX11-NEXT: ; return to shader part epilog
390 store float 0.0, ptr addrspace(3) %lds
391 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
392 %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
393 store float 0.0, ptr addrspace(3) %tmp2
; Six scalar float loads at offsets %a1..%a6, exported through two exp calls.
; The expectations show the six loads merged into two wider offen loads off the
; same VGPR: a four-dword load at offset:4 and a two-dword load at offset:28.
397 define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
398 ; PREGFX10-LABEL: buffer_load_x1_offen_merged_and:
399 ; PREGFX10: ; %bb.0: ; %main_body
400 ; PREGFX10-NEXT: buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
401 ; PREGFX10-NEXT: buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
402 ; PREGFX10-NEXT: s_waitcnt vmcnt(1)
403 ; PREGFX10-NEXT: exp mrt0 v1, v2, v3, v4 done vm
404 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
405 ; PREGFX10-NEXT: exp mrt0 v5, v6, v0, v0 done vm
406 ; PREGFX10-NEXT: s_endpgm
408 ; GFX10-LABEL: buffer_load_x1_offen_merged_and:
409 ; GFX10: ; %bb.0: ; %main_body
410 ; GFX10-NEXT: s_clause 0x1
411 ; GFX10-NEXT: buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
412 ; GFX10-NEXT: buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
413 ; GFX10-NEXT: s_waitcnt vmcnt(1)
414 ; GFX10-NEXT: exp mrt0 v1, v2, v3, v4 done vm
415 ; GFX10-NEXT: s_waitcnt vmcnt(0)
416 ; GFX10-NEXT: exp mrt0 v5, v6, v0, v0 done vm
417 ; GFX10-NEXT: s_endpgm
419 ; GFX11-LABEL: buffer_load_x1_offen_merged_and:
420 ; GFX11: ; %bb.0: ; %main_body
421 ; GFX11-NEXT: s_clause 0x1
422 ; GFX11-NEXT: buffer_load_b128 v[1:4], v0, s[0:3], 0 offen offset:4
423 ; GFX11-NEXT: buffer_load_b64 v[5:6], v0, s[0:3], 0 offen offset:28
424 ; GFX11-NEXT: s_waitcnt vmcnt(1)
425 ; GFX11-NEXT: exp mrt0 v1, v2, v3, v4 done
426 ; GFX11-NEXT: s_waitcnt vmcnt(0)
427 ; GFX11-NEXT: exp mrt0 v5, v6, v0, v0 done
428 ; GFX11-NEXT: s_endpgm
436 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
437 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
438 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
439 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
440 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
441 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
442 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
443 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; Six scalar float loads at offsets %a1..%a6, exported through two exp calls.
; The expectations first shift the input left by 6 (v_lshlrev_b32) and then
; show the loads merged into a four-dword load at offset:4 and a two-dword load
; at offset:28, both offen off the shifted register.
447 define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
448 ; PREGFX10-LABEL: buffer_load_x1_offen_merged_or:
449 ; PREGFX10: ; %bb.0: ; %main_body
450 ; PREGFX10-NEXT: v_lshlrev_b32_e32 v4, 6, v0
451 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:4
452 ; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], v4, s[0:3], 0 offen offset:28
453 ; PREGFX10-NEXT: s_waitcnt vmcnt(1)
454 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
455 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
456 ; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
457 ; PREGFX10-NEXT: s_endpgm
459 ; GFX10-LABEL: buffer_load_x1_offen_merged_or:
460 ; GFX10: ; %bb.0: ; %main_body
461 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 6, v0
462 ; GFX10-NEXT: s_clause 0x1
463 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 offen offset:4
464 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 offen offset:28
465 ; GFX10-NEXT: s_waitcnt vmcnt(1)
466 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
467 ; GFX10-NEXT: s_waitcnt vmcnt(0)
468 ; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
469 ; GFX10-NEXT: s_endpgm
471 ; GFX11-LABEL: buffer_load_x1_offen_merged_or:
472 ; GFX11: ; %bb.0: ; %main_body
473 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 6, v0
474 ; GFX11-NEXT: s_clause 0x1
475 ; GFX11-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:4
476 ; GFX11-NEXT: buffer_load_b64 v[4:5], v4, s[0:3], 0 offen offset:28
477 ; GFX11-NEXT: s_waitcnt vmcnt(1)
478 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
479 ; GFX11-NEXT: s_waitcnt vmcnt(0)
480 ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
481 ; GFX11-NEXT: s_endpgm
490 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
491 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
492 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
493 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
494 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
495 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
496 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
497 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; Six scalar float loads whose final i32 operand is 0, 0, 1, 1, 3, 3. The
; expectations show merging only between loads that share that operand value:
; three two-dword loads at offset:4 (no modifier), offset:12 (glc) and
; offset:28 (glc slc).
501 define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
502 ; PREGFX10-LABEL: buffer_load_x1_offen_merged_glc_slc:
503 ; PREGFX10: ; %bb.0: ; %main_body
504 ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
505 ; PREGFX10-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
506 ; PREGFX10-NEXT: buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
507 ; PREGFX10-NEXT: s_waitcnt vmcnt(1)
508 ; PREGFX10-NEXT: exp mrt0 v1, v2, v3, v4 done vm
509 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
510 ; PREGFX10-NEXT: exp mrt0 v5, v6, v0, v0 done vm
511 ; PREGFX10-NEXT: s_endpgm
513 ; GFX10-LABEL: buffer_load_x1_offen_merged_glc_slc:
514 ; GFX10: ; %bb.0: ; %main_body
515 ; GFX10-NEXT: s_clause 0x2
516 ; GFX10-NEXT: buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
517 ; GFX10-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
518 ; GFX10-NEXT: buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
519 ; GFX10-NEXT: s_waitcnt vmcnt(1)
520 ; GFX10-NEXT: exp mrt0 v1, v2, v3, v4 done vm
521 ; GFX10-NEXT: s_waitcnt vmcnt(0)
522 ; GFX10-NEXT: exp mrt0 v5, v6, v0, v0 done vm
523 ; GFX10-NEXT: s_endpgm
525 ; GFX11-LABEL: buffer_load_x1_offen_merged_glc_slc:
526 ; GFX11: ; %bb.0: ; %main_body
527 ; GFX11-NEXT: s_clause 0x2
528 ; GFX11-NEXT: buffer_load_b64 v[1:2], v0, s[0:3], 0 offen offset:4
529 ; GFX11-NEXT: buffer_load_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
530 ; GFX11-NEXT: buffer_load_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
531 ; GFX11-NEXT: s_waitcnt vmcnt(1)
532 ; GFX11-NEXT: exp mrt0 v1, v2, v3, v4 done
533 ; GFX11-NEXT: s_waitcnt vmcnt(0)
534 ; GFX11-NEXT: exp mrt0 v5, v6, v0, v0 done
535 ; GFX11-NEXT: s_endpgm
543 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
544 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
545 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 1)
546 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 1)
547 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 3)
548 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 3)
549 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
550 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; Two <2 x float> loads at offsets %a1 and %a2 whose four elements are exported
; by one exp call. The expectations show them merged into a single four-dword
; offen load at offset:4.
554 define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
555 ; PREGFX10-LABEL: buffer_load_x2_offen_merged_and:
556 ; PREGFX10: ; %bb.0: ; %main_body
557 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
558 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
559 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
560 ; PREGFX10-NEXT: s_endpgm
562 ; GFX10-LABEL: buffer_load_x2_offen_merged_and:
563 ; GFX10: ; %bb.0: ; %main_body
564 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
565 ; GFX10-NEXT: s_waitcnt vmcnt(0)
566 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
567 ; GFX10-NEXT: s_endpgm
569 ; GFX11-LABEL: buffer_load_x2_offen_merged_and:
570 ; GFX11: ; %bb.0: ; %main_body
571 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4
572 ; GFX11-NEXT: s_waitcnt vmcnt(0)
573 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
574 ; GFX11-NEXT: s_endpgm
578 %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
579 %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
580 %r1 = extractelement <2 x float> %vr1, i32 0
581 %r2 = extractelement <2 x float> %vr1, i32 1
582 %r3 = extractelement <2 x float> %vr2, i32 0
583 %r4 = extractelement <2 x float> %vr2, i32 1
584 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
; Two <2 x float> loads at offsets %a1 and %a2 whose four elements are exported
; by one exp call. The expectations shift the input left by 4 (v_lshlrev_b32)
; and then show a single merged four-dword offen load at offset:4.
588 define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
589 ; PREGFX10-LABEL: buffer_load_x2_offen_merged_or:
590 ; PREGFX10: ; %bb.0: ; %main_body
591 ; PREGFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
592 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
593 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
594 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
595 ; PREGFX10-NEXT: s_endpgm
597 ; GFX10-LABEL: buffer_load_x2_offen_merged_or:
598 ; GFX10: ; %bb.0: ; %main_body
599 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
600 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
601 ; GFX10-NEXT: s_waitcnt vmcnt(0)
602 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
603 ; GFX10-NEXT: s_endpgm
605 ; GFX11-LABEL: buffer_load_x2_offen_merged_or:
606 ; GFX11: ; %bb.0: ; %main_body
607 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
608 ; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4
609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
610 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
611 ; GFX11-NEXT: s_endpgm
616 %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
617 %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
618 %r1 = extractelement <2 x float> %vr1, i32 0
619 %r2 = extractelement <2 x float> %vr1, i32 1
620 %r3 = extractelement <2 x float> %vr2, i32 0
621 %r4 = extractelement <2 x float> %vr2, i32 1
622 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
; Six scalar float loads at constant offsets 4, 8, 12, 16, 28 and 32. The
; expectations show the first four merged into one four-dword load at offset:4
; and the last two into one two-dword load at offset:28, both without an
; address VGPR ("off").
626 define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
627 ; PREGFX10-LABEL: buffer_load_x1_offset_merged:
628 ; PREGFX10: ; %bb.0: ; %main_body
629 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
630 ; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
631 ; PREGFX10-NEXT: s_waitcnt vmcnt(1)
632 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
633 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
634 ; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
635 ; PREGFX10-NEXT: s_endpgm
637 ; GFX10-LABEL: buffer_load_x1_offset_merged:
638 ; GFX10: ; %bb.0: ; %main_body
639 ; GFX10-NEXT: s_clause 0x1
640 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
641 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
642 ; GFX10-NEXT: s_waitcnt vmcnt(1)
643 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
644 ; GFX10-NEXT: s_waitcnt vmcnt(0)
645 ; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
646 ; GFX10-NEXT: s_endpgm
648 ; GFX11-LABEL: buffer_load_x1_offset_merged:
649 ; GFX11: ; %bb.0: ; %main_body
650 ; GFX11-NEXT: s_clause 0x1
651 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
652 ; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
653 ; GFX11-NEXT: s_waitcnt vmcnt(1)
654 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
655 ; GFX11-NEXT: s_waitcnt vmcnt(0)
656 ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
657 ; GFX11-NEXT: s_endpgm
659 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
660 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
661 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
662 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
663 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
664 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
665 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
666 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; Two <2 x float> loads at constant offsets 4 and 12. The expectations show
; them merged into a single four-dword load at offset:4 without an address
; VGPR ("off").
670 define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
671 ; PREGFX10-LABEL: buffer_load_x2_offset_merged:
672 ; PREGFX10: ; %bb.0: ; %main_body
673 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
674 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
675 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
676 ; PREGFX10-NEXT: s_endpgm
678 ; GFX10-LABEL: buffer_load_x2_offset_merged:
679 ; GFX10: ; %bb.0: ; %main_body
680 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
681 ; GFX10-NEXT: s_waitcnt vmcnt(0)
682 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
683 ; GFX10-NEXT: s_endpgm
685 ; GFX11-LABEL: buffer_load_x2_offset_merged:
686 ; GFX11: ; %bb.0: ; %main_body
687 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
688 ; GFX11-NEXT: s_waitcnt vmcnt(0)
689 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
690 ; GFX11-NEXT: s_endpgm
692 %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
693 %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
694 %r1 = extractelement <2 x float> %vr1, i32 0
695 %r2 = extractelement <2 x float> %vr1, i32 1
696 %r3 = extractelement <2 x float> %vr2, i32 0
697 %r4 = extractelement <2 x float> %vr2, i32 1
698 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
; Integer-typed variants of the intrinsic: a <4 x i32>, a <2 x i32> and an i32
; load with a final i32 operand of 0, 1 and 2, each bitcast to the matching
; float type for the return value. The expectations show a four-dword load, a
; two-dword load with glc and a one-dword load with slc.
702 define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
703 ; PREGFX10-LABEL: buffer_load_int:
704 ; PREGFX10: ; %bb.0: ; %main_body
705 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
706 ; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
707 ; PREGFX10-NEXT: buffer_load_dword v6, off, s[0:3], 0 slc
708 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
709 ; PREGFX10-NEXT: ; return to shader part epilog
711 ; GFX10-LABEL: buffer_load_int:
712 ; GFX10: ; %bb.0: ; %main_body
713 ; GFX10-NEXT: s_clause 0x2
714 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
715 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
716 ; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], 0 slc
717 ; GFX10-NEXT: s_waitcnt vmcnt(0)
718 ; GFX10-NEXT: ; return to shader part epilog
720 ; GFX11-LABEL: buffer_load_int:
721 ; GFX11: ; %bb.0: ; %main_body
722 ; GFX11-NEXT: s_clause 0x2
723 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
724 ; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 glc
725 ; GFX11-NEXT: buffer_load_b32 v6, off, s[0:3], 0 slc
726 ; GFX11-NEXT: s_waitcnt vmcnt(0)
727 ; GFX11-NEXT: ; return to shader part epilog
729 %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
730 %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1)
731 %data_slc = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 2)
732 %fdata = bitcast <4 x i32> %data to <4 x float>
733 %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float>
734 %fdata_slc = bitcast i32 %data_slc to float
735 %r0 = insertvalue {<4 x float>, <2 x float>, float} undef, <4 x float> %fdata, 0
736 %r1 = insertvalue {<4 x float>, <2 x float>, float} %r0, <2 x float> %fdata_glc, 1
737 %r2 = insertvalue {<4 x float>, <2 x float>, float} %r1, float %fdata_slc, 2
738 ret {<4 x float>, <2 x float>, float} %r2
; raw_buffer_load_ubyte: an i8 raw buffer load that is zero-extended to i32
; and converted with uitofp. The assertions expect a single unsigned byte
; load (buffer_load_ubyte, or buffer_load_u8 in the gfx1100 run) followed by
; v_cvt_f32_ubyte0_e32, i.e. no separate extension instruction.
741 define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
742 ; PREGFX10-LABEL: raw_buffer_load_ubyte:
743 ; PREGFX10: ; %bb.0: ; %main_body
744 ; PREGFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
745 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
746 ; PREGFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
747 ; PREGFX10-NEXT: ; return to shader part epilog
749 ; GFX10-LABEL: raw_buffer_load_ubyte:
750 ; GFX10: ; %bb.0: ; %main_body
751 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
753 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
754 ; GFX10-NEXT: ; return to shader part epilog
756 ; GFX11-LABEL: raw_buffer_load_ubyte:
757 ; GFX11: ; %bb.0: ; %main_body
758 ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0
759 ; GFX11-NEXT: s_waitcnt vmcnt(0)
760 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
761 ; GFX11-NEXT: ; return to shader part epilog
763 %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
764 %tmp2 = zext i8 %tmp to i32
765 %val = uitofp i32 %tmp2 to float
; raw_buffer_load_i16: an i16 raw buffer load that is zero-extended to i32
; and converted with uitofp. The assertions expect a single unsigned 16-bit
; load (buffer_load_ushort, or buffer_load_u16 in the gfx1100 run) followed
; by v_cvt_f32_u32_e32, i.e. the zext is folded into the load.
769 define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
770 ; PREGFX10-LABEL: raw_buffer_load_i16:
771 ; PREGFX10: ; %bb.0: ; %main_body
772 ; PREGFX10-NEXT: buffer_load_ushort v0, off, s[0:3], 0
773 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
774 ; PREGFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
775 ; PREGFX10-NEXT: ; return to shader part epilog
777 ; GFX10-LABEL: raw_buffer_load_i16:
778 ; GFX10: ; %bb.0: ; %main_body
779 ; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], 0
780 ; GFX10-NEXT: s_waitcnt vmcnt(0)
781 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
782 ; GFX10-NEXT: ; return to shader part epilog
784 ; GFX11-LABEL: raw_buffer_load_i16:
785 ; GFX11: ; %bb.0: ; %main_body
786 ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
787 ; GFX11-NEXT: s_waitcnt vmcnt(0)
788 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
789 ; GFX11-NEXT: ; return to shader part epilog
791 %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
792 %tmp2 = zext i16 %tmp to i32
793 %val = uitofp i32 %tmp2 to float
; raw_buffer_load_sbyte: an i8 raw buffer load that is sign-extended to i32
; and converted with sitofp. The assertions expect a single signed byte load
; (buffer_load_sbyte, or buffer_load_i8 in the gfx1100 run) followed by
; v_cvt_f32_i32_e32, i.e. the sext is folded into the load.
797 define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
798 ; PREGFX10-LABEL: raw_buffer_load_sbyte:
799 ; PREGFX10: ; %bb.0: ; %main_body
800 ; PREGFX10-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
801 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
802 ; PREGFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
803 ; PREGFX10-NEXT: ; return to shader part epilog
805 ; GFX10-LABEL: raw_buffer_load_sbyte:
806 ; GFX10: ; %bb.0: ; %main_body
807 ; GFX10-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
808 ; GFX10-NEXT: s_waitcnt vmcnt(0)
809 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
810 ; GFX10-NEXT: ; return to shader part epilog
812 ; GFX11-LABEL: raw_buffer_load_sbyte:
813 ; GFX11: ; %bb.0: ; %main_body
814 ; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0
815 ; GFX11-NEXT: s_waitcnt vmcnt(0)
816 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
817 ; GFX11-NEXT: ; return to shader part epilog
819 %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
820 %tmp2 = sext i8 %tmp to i32
821 %val = sitofp i32 %tmp2 to float
; raw_buffer_load_sshort: an i16 raw buffer load that is sign-extended to i32
; and converted with sitofp. The assertions expect a single signed 16-bit
; load (buffer_load_sshort, or buffer_load_i16 in the gfx1100 run) followed
; by v_cvt_f32_i32_e32, i.e. the sext is folded into the load.
825 define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
826 ; PREGFX10-LABEL: raw_buffer_load_sshort:
827 ; PREGFX10: ; %bb.0: ; %main_body
828 ; PREGFX10-NEXT: buffer_load_sshort v0, off, s[0:3], 0
829 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
830 ; PREGFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
831 ; PREGFX10-NEXT: ; return to shader part epilog
833 ; GFX10-LABEL: raw_buffer_load_sshort:
834 ; GFX10: ; %bb.0: ; %main_body
835 ; GFX10-NEXT: buffer_load_sshort v0, off, s[0:3], 0
836 ; GFX10-NEXT: s_waitcnt vmcnt(0)
837 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
838 ; GFX10-NEXT: ; return to shader part epilog
840 ; GFX11-LABEL: raw_buffer_load_sshort:
841 ; GFX11: ; %bb.0: ; %main_body
842 ; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0
843 ; GFX11-NEXT: s_waitcnt vmcnt(0)
844 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
845 ; GFX11-NEXT: ; return to shader part epilog
847 %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
848 %tmp2 = sext i16 %tmp to i32
849 %val = sitofp i32 %tmp2 to float
; raw_buffer_load_f16: a half raw buffer load whose result is stored through
; an addrspace(3) pointer. The assertions expect a 16-bit load
; (buffer_load_ushort, or buffer_load_u16 in the gfx1100 run) followed by a
; 16-bit DS store (ds_write_b16, or ds_store_b16 in the gfx1100 run). Only
; the verde/tonga runs additionally expect "s_mov_b32 m0, -1" before the
; store.
853 define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
854 ; PREGFX10-LABEL: raw_buffer_load_f16:
855 ; PREGFX10: ; %bb.0: ; %main_body
856 ; PREGFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0
857 ; PREGFX10-NEXT: s_mov_b32 m0, -1
858 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
859 ; PREGFX10-NEXT: ds_write_b16 v0, v1
860 ; PREGFX10-NEXT: s_endpgm
862 ; GFX10-LABEL: raw_buffer_load_f16:
863 ; GFX10: ; %bb.0: ; %main_body
864 ; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0
865 ; GFX10-NEXT: s_waitcnt vmcnt(0)
866 ; GFX10-NEXT: ds_write_b16 v0, v1
867 ; GFX10-NEXT: s_endpgm
869 ; GFX11-LABEL: raw_buffer_load_f16:
870 ; GFX11: ; %bb.0: ; %main_body
871 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
872 ; GFX11-NEXT: s_waitcnt vmcnt(0)
873 ; GFX11-NEXT: ds_store_b16 v0, v1
874 ; GFX11-NEXT: s_endpgm
876 %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
877 store half %val, ptr addrspace(3) %ptr
; raw_buffer_load_v2f16: a <2 x half> raw buffer load whose result is stored
; through an addrspace(3) pointer. The assertions expect a single 32-bit
; load (buffer_load_dword, or buffer_load_b32 in the gfx1100 run) followed by
; a 32-bit DS store (ds_write_b32, or ds_store_b32 in the gfx1100 run). Only
; the verde/tonga runs additionally expect "s_mov_b32 m0, -1".
881 define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
882 ; PREGFX10-LABEL: raw_buffer_load_v2f16:
883 ; PREGFX10: ; %bb.0: ; %main_body
884 ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
885 ; PREGFX10-NEXT: s_mov_b32 m0, -1
886 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
887 ; PREGFX10-NEXT: ds_write_b32 v0, v1
888 ; PREGFX10-NEXT: s_endpgm
890 ; GFX10-LABEL: raw_buffer_load_v2f16:
891 ; GFX10: ; %bb.0: ; %main_body
892 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
893 ; GFX10-NEXT: s_waitcnt vmcnt(0)
894 ; GFX10-NEXT: ds_write_b32 v0, v1
895 ; GFX10-NEXT: s_endpgm
897 ; GFX11-LABEL: raw_buffer_load_v2f16:
898 ; GFX11: ; %bb.0: ; %main_body
899 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
900 ; GFX11-NEXT: s_waitcnt vmcnt(0)
901 ; GFX11-NEXT: ds_store_b32 v0, v1
902 ; GFX11-NEXT: s_endpgm
904 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
905 store <2 x half> %val, ptr addrspace(3) %ptr
; raw_buffer_load_v4f16: a <4 x half> raw buffer load whose result is stored
; through an addrspace(3) pointer. The assertions expect a single 64-bit
; load (buffer_load_dwordx2, or buffer_load_b64 in the gfx1100 run) followed
; by a 64-bit DS store (ds_write_b64, or ds_store_b64 in the gfx1100 run).
; Only the verde/tonga runs additionally expect "s_mov_b32 m0, -1".
909 define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
910 ; PREGFX10-LABEL: raw_buffer_load_v4f16:
911 ; PREGFX10: ; %bb.0: ; %main_body
912 ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
913 ; PREGFX10-NEXT: s_mov_b32 m0, -1
914 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
915 ; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
916 ; PREGFX10-NEXT: s_endpgm
918 ; GFX10-LABEL: raw_buffer_load_v4f16:
919 ; GFX10: ; %bb.0: ; %main_body
920 ; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
921 ; GFX10-NEXT: s_waitcnt vmcnt(0)
922 ; GFX10-NEXT: ds_write_b64 v0, v[1:2]
923 ; GFX10-NEXT: s_endpgm
925 ; GFX11-LABEL: raw_buffer_load_v4f16:
926 ; GFX11: ; %bb.0: ; %main_body
927 ; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
928 ; GFX11-NEXT: s_waitcnt vmcnt(0)
929 ; GFX11-NEXT: ds_store_b64 v0, v[1:2]
930 ; GFX11-NEXT: s_endpgm
932 %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
933 store <4 x half> %val, ptr addrspace(3) %ptr
; raw_buffer_load_v2i16: a <2 x i16> raw buffer load whose result is stored
; through an addrspace(3) pointer. The assertions expect a single 32-bit
; load (buffer_load_dword, or buffer_load_b32 in the gfx1100 run) followed by
; a 32-bit DS store (ds_write_b32, or ds_store_b32 in the gfx1100 run). Only
; the verde/tonga runs additionally expect "s_mov_b32 m0, -1".
937 define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
938 ; PREGFX10-LABEL: raw_buffer_load_v2i16:
939 ; PREGFX10: ; %bb.0: ; %main_body
940 ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
941 ; PREGFX10-NEXT: s_mov_b32 m0, -1
942 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
943 ; PREGFX10-NEXT: ds_write_b32 v0, v1
944 ; PREGFX10-NEXT: s_endpgm
946 ; GFX10-LABEL: raw_buffer_load_v2i16:
947 ; GFX10: ; %bb.0: ; %main_body
948 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
949 ; GFX10-NEXT: s_waitcnt vmcnt(0)
950 ; GFX10-NEXT: ds_write_b32 v0, v1
951 ; GFX10-NEXT: s_endpgm
953 ; GFX11-LABEL: raw_buffer_load_v2i16:
954 ; GFX11: ; %bb.0: ; %main_body
955 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
956 ; GFX11-NEXT: s_waitcnt vmcnt(0)
957 ; GFX11-NEXT: ds_store_b32 v0, v1
958 ; GFX11-NEXT: s_endpgm
960 %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
961 store <2 x i16> %val, ptr addrspace(3) %ptr
; raw_buffer_load_v4i16: a <4 x i16> raw buffer load whose result is stored
; through an addrspace(3) pointer. The assertions expect a single 64-bit
; load (buffer_load_dwordx2, or buffer_load_b64 in the gfx1100 run) followed
; by a 64-bit DS store (ds_write_b64, or ds_store_b64 in the gfx1100 run).
; Only the verde/tonga runs additionally expect "s_mov_b32 m0, -1".
965 define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
966 ; PREGFX10-LABEL: raw_buffer_load_v4i16:
967 ; PREGFX10: ; %bb.0: ; %main_body
968 ; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
969 ; PREGFX10-NEXT: s_mov_b32 m0, -1
970 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
971 ; PREGFX10-NEXT: ds_write_b64 v0, v[1:2]
972 ; PREGFX10-NEXT: s_endpgm
974 ; GFX10-LABEL: raw_buffer_load_v4i16:
975 ; GFX10: ; %bb.0: ; %main_body
976 ; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
977 ; GFX10-NEXT: s_waitcnt vmcnt(0)
978 ; GFX10-NEXT: ds_write_b64 v0, v[1:2]
979 ; GFX10-NEXT: s_endpgm
981 ; GFX11-LABEL: raw_buffer_load_v4i16:
982 ; GFX11: ; %bb.0: ; %main_body
983 ; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
984 ; GFX11-NEXT: s_waitcnt vmcnt(0)
985 ; GFX11-NEXT: ds_store_b64 v0, v[1:2]
986 ; GFX11-NEXT: s_endpgm
988 %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
989 store <4 x i16> %val, ptr addrspace(3) %ptr
; raw_buffer_load_x1_offset_merged: six scalar f32 raw buffer loads from the
; same resource at constant offsets 4, 8, 12, 16 and 28, 32, all with a last
; operand of 0. The assertions expect the four contiguous loads to be merged
; into one dwordx4 load at offset:4 and the remaining two into one dwordx2
; load at offset:28 (b128 / b64 in the gfx1100 run), grouped under
; "s_clause 0x1" in the gfx1010 and gfx1100 runs. The loaded values are
; consumed by two exports; the second export passes undef for its last two
; data operands.
993 define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
994 ; PREGFX10-LABEL: raw_buffer_load_x1_offset_merged:
995 ; PREGFX10: ; %bb.0: ; %main_body
996 ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
997 ; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
998 ; PREGFX10-NEXT: s_waitcnt vmcnt(1)
999 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1000 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1001 ; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1002 ; PREGFX10-NEXT: s_endpgm
1004 ; GFX10-LABEL: raw_buffer_load_x1_offset_merged:
1005 ; GFX10: ; %bb.0: ; %main_body
1006 ; GFX10-NEXT: s_clause 0x1
1007 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
1008 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
1009 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1010 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1011 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1012 ; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1013 ; GFX10-NEXT: s_endpgm
1015 ; GFX11-LABEL: raw_buffer_load_x1_offset_merged:
1016 ; GFX11: ; %bb.0: ; %main_body
1017 ; GFX11-NEXT: s_clause 0x1
1018 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
1019 ; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
1020 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1021 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
1022 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1023 ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
1024 ; GFX11-NEXT: s_endpgm
1026 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
1027 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
1028 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
1029 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
1030 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
1031 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
1032 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
1033 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; raw_buffer_load_x1_offset_swizzled_not_merged: the same six scalar f32 raw
; buffer loads as raw_buffer_load_x1_offset_merged (offsets 4, 8, 12, 16 and
; 28, 32), except that the last intrinsic operand is 8 instead of 0. The
; assertions expect the loads to stay separate: six individual dword loads
; (b32 in the gfx1100 run), grouped under "s_clause 0x5" in the gfx1010 and
; gfx1100 runs.
; NOTE(review): going by the function name, the value 8 presumably marks the
; access as swizzled - confirm against the intrinsic's documentation.
1037 define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
1038 ; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1039 ; PREGFX10: ; %bb.0: ; %main_body
1040 ; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
1041 ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
1042 ; PREGFX10-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
1043 ; PREGFX10-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
1044 ; PREGFX10-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:28
1045 ; PREGFX10-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:32
1046 ; PREGFX10-NEXT: s_waitcnt vmcnt(2)
1047 ; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1048 ; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1049 ; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1050 ; PREGFX10-NEXT: s_endpgm
1052 ; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1053 ; GFX10: ; %bb.0: ; %main_body
1054 ; GFX10-NEXT: s_clause 0x5
1055 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
1056 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
1057 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
1058 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
1059 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:28
1060 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:32
1061 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1062 ; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1063 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1064 ; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1065 ; GFX10-NEXT: s_endpgm
1067 ; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1068 ; GFX11: ; %bb.0: ; %main_body
1069 ; GFX11-NEXT: s_clause 0x5
1070 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4
1071 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:8
1072 ; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0 offset:12
1073 ; GFX11-NEXT: buffer_load_b32 v3, off, s[0:3], 0 offset:16
1074 ; GFX11-NEXT: buffer_load_b32 v4, off, s[0:3], 0 offset:28
1075 ; GFX11-NEXT: buffer_load_b32 v5, off, s[0:3], 0 offset:32
1076 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1077 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
1078 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1079 ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
1080 ; GFX11-NEXT: s_endpgm
1082 %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8)
1083 %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8)
1084 %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8)
1085 %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8)
1086 %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8)
1087 %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8)
1088 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
1089 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
; Declarations of the intrinsics called by the test functions: the
; llvm.amdgcn.raw.buffer.load overloads (f32/i32 scalars and vectors, i8,
; i16, and the 16-bit element vectors) plus llvm.amdgcn.exp.f32. Every
; declaration, including the export intrinsic, references attribute group #0,
; which is defined on the last line as "nounwind readonly".
1093 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
1094 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
1095 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
1096 declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
1097 declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0
1098 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
1099 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
1100 declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
1101 declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
1102 declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0
1103 declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0
1104 declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
1105 declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0
1106 declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0
1107 attributes #0 = { nounwind readonly }