1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
6 ; Test using saddr addressing mode of global_*load_* flat instructions.
8 ; --------------------------------------------------------------------------------
9 ; No vgpr offset, constants
10 ; --------------------------------------------------------------------------------
13 define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sbase) {
14 ; GCN-LABEL: global_load_saddr_i8_offset_0:
16 ; GCN-NEXT: v_mov_b32_e32 v0, 0
17 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
18 ; GCN-NEXT: s_waitcnt vmcnt(0)
19 ; GCN-NEXT: ; return to shader part epilog
21 ; GFX11-LABEL: global_load_saddr_i8_offset_0:
23 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
24 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
25 ; GFX11-NEXT: s_waitcnt vmcnt(0)
26 ; GFX11-NEXT: ; return to shader part epilog
27 %load = load i8, ptr addrspace(1) %sbase
28 %zext = zext i8 %load to i32
29 %to.vgpr = bitcast i32 %zext to float
33 ; SGPR base with maximum gfx9 immediate offset
34 define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg %sbase) {
35 ; GFX9-LABEL: global_load_saddr_i8_offset_4095:
37 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
38 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
39 ; GFX9-NEXT: s_waitcnt vmcnt(0)
40 ; GFX9-NEXT: ; return to shader part epilog
42 ; GFX10-LABEL: global_load_saddr_i8_offset_4095:
44 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
45 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
46 ; GFX10-NEXT: s_waitcnt vmcnt(0)
47 ; GFX10-NEXT: ; return to shader part epilog
49 ; GFX11-LABEL: global_load_saddr_i8_offset_4095:
51 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
52 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
53 ; GFX11-NEXT: s_waitcnt vmcnt(0)
54 ; GFX11-NEXT: ; return to shader part epilog
55 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
56 %load = load i8, ptr addrspace(1) %gep0
57 %zext = zext i8 %load to i32
58 %to.vgpr = bitcast i32 %zext to float
62 ; SGPR base with maximum gfx9 immediate offset + 1
63 define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg %sbase) {
64 ; GCN-LABEL: global_load_saddr_i8_offset_4096:
66 ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
67 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
68 ; GCN-NEXT: s_waitcnt vmcnt(0)
69 ; GCN-NEXT: ; return to shader part epilog
71 ; GFX11-LABEL: global_load_saddr_i8_offset_4096:
73 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
74 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
75 ; GFX11-NEXT: s_waitcnt vmcnt(0)
76 ; GFX11-NEXT: ; return to shader part epilog
77 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
78 %load = load i8, ptr addrspace(1) %gep0
79 %zext = zext i8 %load to i32
80 %to.vgpr = bitcast i32 %zext to float
84 ; SGPR base with maximum gfx9 immediate offset + 2
85 define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg %sbase) {
86 ; GCN-LABEL: global_load_saddr_i8_offset_4097:
88 ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
89 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
90 ; GCN-NEXT: s_waitcnt vmcnt(0)
91 ; GCN-NEXT: ; return to shader part epilog
93 ; GFX11-LABEL: global_load_saddr_i8_offset_4097:
95 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
96 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
97 ; GFX11-NEXT: s_waitcnt vmcnt(0)
98 ; GFX11-NEXT: ; return to shader part epilog
99 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
100 %load = load i8, ptr addrspace(1) %gep0
101 %zext = zext i8 %load to i32
102 %to.vgpr = bitcast i32 %zext to float
106 ; SGPR base with maximum negative gfx9 immediate offset
107 define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inreg %sbase) {
108 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
110 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
111 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
113 ; GFX9-NEXT: ; return to shader part epilog
115 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
117 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
118 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
119 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-NEXT: ; return to shader part epilog
123 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4096:
125 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
126 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096
127 ; GFX11-NEXT: s_waitcnt vmcnt(0)
128 ; GFX11-NEXT: ; return to shader part epilog
129 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
130 %load = load i8, ptr addrspace(1) %gep0
131 %zext = zext i8 %load to i32
132 %to.vgpr = bitcast i32 %zext to float
136 ; SGPR base with maximum negative gfx9 immediate offset -1
137 define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inreg %sbase) {
138 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
140 ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff
141 ; GFX9-NEXT: s_addc_u32 s1, s3, -1
142 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
143 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
144 ; GFX9-NEXT: s_waitcnt vmcnt(0)
145 ; GFX9-NEXT: ; return to shader part epilog
147 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
149 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
150 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
151 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
152 ; GFX10-NEXT: s_waitcnt vmcnt(0)
153 ; GFX10-NEXT: ; return to shader part epilog
155 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4097:
157 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
159 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
160 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
161 ; GFX11-NEXT: s_waitcnt vmcnt(0)
162 ; GFX11-NEXT: ; return to shader part epilog
163 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
164 %load = load i8, ptr addrspace(1) %gep0
165 %zext = zext i8 %load to i32
166 %to.vgpr = bitcast i32 %zext to float
170 ; SGPR base with maximum negative gfx9 immediate offset -2
171 define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inreg %sbase) {
172 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
174 ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe
175 ; GFX9-NEXT: s_addc_u32 s1, s3, -1
176 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
177 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
178 ; GFX9-NEXT: s_waitcnt vmcnt(0)
179 ; GFX9-NEXT: ; return to shader part epilog
181 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
183 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
184 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
185 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
186 ; GFX10-NEXT: s_waitcnt vmcnt(0)
187 ; GFX10-NEXT: ; return to shader part epilog
189 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4098:
191 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
192 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
193 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
194 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2
195 ; GFX11-NEXT: s_waitcnt vmcnt(0)
196 ; GFX11-NEXT: ; return to shader part epilog
197 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
198 %load = load i8, ptr addrspace(1) %gep0
199 %zext = zext i8 %load to i32
200 %to.vgpr = bitcast i32 %zext to float
204 ; SGPR base with maximum gfx10 immediate offset (2047) + 1
205 define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg %sbase) {
206 ; GFX9-LABEL: global_load_saddr_i8_offset_2048:
208 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
209 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
210 ; GFX9-NEXT: s_waitcnt vmcnt(0)
211 ; GFX9-NEXT: ; return to shader part epilog
213 ; GFX10-LABEL: global_load_saddr_i8_offset_2048:
215 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
216 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
217 ; GFX10-NEXT: s_waitcnt vmcnt(0)
218 ; GFX10-NEXT: ; return to shader part epilog
220 ; GFX11-LABEL: global_load_saddr_i8_offset_2048:
222 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
223 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048
224 ; GFX11-NEXT: s_waitcnt vmcnt(0)
225 ; GFX11-NEXT: ; return to shader part epilog
226 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
227 %load = load i8, ptr addrspace(1) %gep0
228 %zext = zext i8 %load to i32
229 %to.vgpr = bitcast i32 %zext to float
233 ; SGPR base with maximum gfx10 immediate offset (2047) + 2
234 define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg %sbase) {
235 ; GFX9-LABEL: global_load_saddr_i8_offset_2049:
237 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
238 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049
239 ; GFX9-NEXT: s_waitcnt vmcnt(0)
240 ; GFX9-NEXT: ; return to shader part epilog
242 ; GFX10-LABEL: global_load_saddr_i8_offset_2049:
244 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
245 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
246 ; GFX10-NEXT: s_waitcnt vmcnt(0)
247 ; GFX10-NEXT: ; return to shader part epilog
249 ; GFX11-LABEL: global_load_saddr_i8_offset_2049:
251 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
252 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049
253 ; GFX11-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-NEXT: ; return to shader part epilog
255 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
256 %load = load i8, ptr addrspace(1) %gep0
257 %zext = zext i8 %load to i32
258 %to.vgpr = bitcast i32 %zext to float
262 ; SGPR base with maximum gfx10 immediate offset (2047) + 3
263 define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg %sbase) {
264 ; GFX9-LABEL: global_load_saddr_i8_offset_2050:
266 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
267 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050
268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
269 ; GFX9-NEXT: ; return to shader part epilog
271 ; GFX10-LABEL: global_load_saddr_i8_offset_2050:
273 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
274 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
275 ; GFX10-NEXT: s_waitcnt vmcnt(0)
276 ; GFX10-NEXT: ; return to shader part epilog
278 ; GFX11-LABEL: global_load_saddr_i8_offset_2050:
280 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
281 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050
282 ; GFX11-NEXT: s_waitcnt vmcnt(0)
283 ; GFX11-NEXT: ; return to shader part epilog
284 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
285 %load = load i8, ptr addrspace(1) %gep0
286 %zext = zext i8 %load to i32
287 %to.vgpr = bitcast i32 %zext to float
291 ; SGPR base with maximum negative gfx10 immediate offset
292 define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inreg %sbase) {
293 ; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
295 ; GCN-NEXT: v_mov_b32_e32 v0, 0
296 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
297 ; GCN-NEXT: s_waitcnt vmcnt(0)
298 ; GCN-NEXT: ; return to shader part epilog
300 ; GFX11-LABEL: global_load_saddr_i8_offset_neg2048:
302 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
303 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048
304 ; GFX11-NEXT: s_waitcnt vmcnt(0)
305 ; GFX11-NEXT: ; return to shader part epilog
306 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
307 %load = load i8, ptr addrspace(1) %gep0
308 %zext = zext i8 %load to i32
309 %to.vgpr = bitcast i32 %zext to float
313 ; SGPR base with maximum negative gfx10 immediate offset - 1
314 define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inreg %sbase) {
315 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
317 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
318 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
319 ; GFX9-NEXT: s_waitcnt vmcnt(0)
320 ; GFX9-NEXT: ; return to shader part epilog
322 ; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
324 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
325 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
326 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
327 ; GFX10-NEXT: s_waitcnt vmcnt(0)
328 ; GFX10-NEXT: ; return to shader part epilog
330 ; GFX11-LABEL: global_load_saddr_i8_offset_neg2049:
332 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
333 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049
334 ; GFX11-NEXT: s_waitcnt vmcnt(0)
335 ; GFX11-NEXT: ; return to shader part epilog
336 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
337 %load = load i8, ptr addrspace(1) %gep0
338 %zext = zext i8 %load to i32
339 %to.vgpr = bitcast i32 %zext to float
343 ; SGPR base with maximum negative gfx10 immediate offset (-2048) - 2
344 define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inreg %sbase) {
345 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
347 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
348 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050
349 ; GFX9-NEXT: s_waitcnt vmcnt(0)
350 ; GFX9-NEXT: ; return to shader part epilog
352 ; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
354 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
355 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
356 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
357 ; GFX10-NEXT: s_waitcnt vmcnt(0)
358 ; GFX10-NEXT: ; return to shader part epilog
360 ; GFX11-LABEL: global_load_saddr_i8_offset_neg2050:
362 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
363 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050
364 ; GFX11-NEXT: s_waitcnt vmcnt(0)
365 ; GFX11-NEXT: ; return to shader part epilog
366 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
367 %load = load i8, ptr addrspace(1) %gep0
368 %zext = zext i8 %load to i32
369 %to.vgpr = bitcast i32 %zext to float
373 define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) inreg %sbase) {
374 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
376 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000
377 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
379 ; GFX9-NEXT: ; return to shader part epilog
381 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
383 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800
384 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
385 ; GFX10-NEXT: s_waitcnt vmcnt(0)
386 ; GFX10-NEXT: ; return to shader part epilog
388 ; GFX11-LABEL: global_load_saddr_i8_offset_4294967295:
390 ; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000
391 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
392 ; GFX11-NEXT: s_waitcnt vmcnt(0)
393 ; GFX11-NEXT: ; return to shader part epilog
394 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
395 %load = load i8, ptr addrspace(1) %gep0
396 %zext = zext i8 %load to i32
397 %to.vgpr = bitcast i32 %zext to float
401 define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) inreg %sbase) {
402 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
404 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
405 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
406 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
407 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
408 ; GFX9-NEXT: s_waitcnt vmcnt(0)
409 ; GFX9-NEXT: ; return to shader part epilog
411 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
413 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
414 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
415 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
416 ; GFX10-NEXT: s_waitcnt vmcnt(0)
417 ; GFX10-NEXT: ; return to shader part epilog
419 ; GFX11-LABEL: global_load_saddr_i8_offset_4294967296:
421 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
422 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
423 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
424 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
425 ; GFX11-NEXT: s_waitcnt vmcnt(0)
426 ; GFX11-NEXT: ; return to shader part epilog
427 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
428 %load = load i8, ptr addrspace(1) %gep0
429 %zext = zext i8 %load to i32
430 %to.vgpr = bitcast i32 %zext to float
434 define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) inreg %sbase) {
435 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
437 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
438 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
439 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
440 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
441 ; GFX9-NEXT: s_waitcnt vmcnt(0)
442 ; GFX9-NEXT: ; return to shader part epilog
444 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
446 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
447 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
448 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
449 ; GFX10-NEXT: s_waitcnt vmcnt(0)
450 ; GFX10-NEXT: ; return to shader part epilog
452 ; GFX11-LABEL: global_load_saddr_i8_offset_4294967297:
454 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
456 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
457 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
458 ; GFX11-NEXT: s_waitcnt vmcnt(0)
459 ; GFX11-NEXT: ; return to shader part epilog
460 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
461 %load = load i8, ptr addrspace(1) %gep0
462 %zext = zext i8 %load to i32
463 %to.vgpr = bitcast i32 %zext to float
467 define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) inreg %sbase) {
468 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
470 ; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
471 ; GFX9-NEXT: s_addc_u32 s1, s3, 1
472 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
473 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
474 ; GFX9-NEXT: s_waitcnt vmcnt(0)
475 ; GFX9-NEXT: ; return to shader part epilog
477 ; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
479 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
480 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
481 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
482 ; GFX10-NEXT: s_waitcnt vmcnt(0)
483 ; GFX10-NEXT: ; return to shader part epilog
485 ; GFX11-LABEL: global_load_saddr_i8_offset_4294971391:
487 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
488 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
489 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
490 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
491 ; GFX11-NEXT: s_waitcnt vmcnt(0)
492 ; GFX11-NEXT: ; return to shader part epilog
493 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
494 %load = load i8, ptr addrspace(1) %gep0
495 %zext = zext i8 %load to i32
496 %to.vgpr = bitcast i32 %zext to float
500 define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) inreg %sbase) {
501 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
503 ; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
504 ; GFX9-NEXT: s_addc_u32 s1, s3, 1
505 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
506 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
507 ; GFX9-NEXT: s_waitcnt vmcnt(0)
508 ; GFX9-NEXT: ; return to shader part epilog
510 ; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
512 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
513 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
514 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
515 ; GFX10-NEXT: s_waitcnt vmcnt(0)
516 ; GFX10-NEXT: ; return to shader part epilog
518 ; GFX11-LABEL: global_load_saddr_i8_offset_4294971392:
520 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
521 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
522 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
523 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
524 ; GFX11-NEXT: s_waitcnt vmcnt(0)
525 ; GFX11-NEXT: ; return to shader part epilog
526 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
527 %load = load i8, ptr addrspace(1) %gep0
528 %zext = zext i8 %load to i32
529 %to.vgpr = bitcast i32 %zext to float
533 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(1) inreg %sbase) {
534 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
536 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
537 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
538 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
539 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
540 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
541 ; GFX9-NEXT: s_waitcnt vmcnt(0)
542 ; GFX9-NEXT: ; return to shader part epilog
544 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
546 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
547 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
548 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047
549 ; GFX10-NEXT: s_waitcnt vmcnt(0)
550 ; GFX10-NEXT: ; return to shader part epilog
552 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295:
554 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
555 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
556 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
557 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095
558 ; GFX11-NEXT: s_waitcnt vmcnt(0)
559 ; GFX11-NEXT: ; return to shader part epilog
560 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
561 %load = load i8, ptr addrspace(1) %gep0
562 %zext = zext i8 %load to i32
563 %to.vgpr = bitcast i32 %zext to float
567 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(1) inreg %sbase) {
568 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
570 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
571 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
572 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
573 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
574 ; GFX9-NEXT: s_waitcnt vmcnt(0)
575 ; GFX9-NEXT: ; return to shader part epilog
577 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
579 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
580 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
581 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
582 ; GFX10-NEXT: s_waitcnt vmcnt(0)
583 ; GFX10-NEXT: ; return to shader part epilog
585 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296:
587 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
588 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
589 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
590 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
591 ; GFX11-NEXT: s_waitcnt vmcnt(0)
592 ; GFX11-NEXT: ; return to shader part epilog
593 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
594 %load = load i8, ptr addrspace(1) %gep0
595 %zext = zext i8 %load to i32
596 %to.vgpr = bitcast i32 %zext to float
600 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(1) inreg %sbase) {
601 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
603 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
604 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
605 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
606 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
607 ; GFX9-NEXT: s_waitcnt vmcnt(0)
608 ; GFX9-NEXT: ; return to shader part epilog
610 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
612 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
613 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
614 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
615 ; GFX10-NEXT: s_waitcnt vmcnt(0)
616 ; GFX10-NEXT: ; return to shader part epilog
618 ; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297:
620 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
621 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
622 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
623 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
624 ; GFX11-NEXT: s_waitcnt vmcnt(0)
625 ; GFX11-NEXT: ; return to shader part epilog
626 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
627 %load = load i8, ptr addrspace(1) %gep0
628 %zext = zext i8 %load to i32
629 %to.vgpr = bitcast i32 %zext to float
633 ; --------------------------------------------------------------------------------
634 ; Basic addressing patterns
635 ; --------------------------------------------------------------------------------
637 ; Basic pattern, no immediate offset.
638 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset) {
639 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
641 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
642 ; GCN-NEXT: s_waitcnt vmcnt(0)
643 ; GCN-NEXT: ; return to shader part epilog
645 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr:
647 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
648 ; GFX11-NEXT: s_waitcnt vmcnt(0)
649 ; GFX11-NEXT: ; return to shader part epilog
650 %zext.offset = zext i32 %voffset to i64
651 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
652 %load = load i8, ptr addrspace(1) %gep0
653 %zext = zext i8 %load to i32
654 %to.vgpr = bitcast i32 %zext to float
658 ; Maximum positive offset on gfx9
659 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace(1) inreg %sbase, i32 %voffset) {
660 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
662 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
663 ; GFX9-NEXT: s_waitcnt vmcnt(0)
664 ; GFX9-NEXT: ; return to shader part epilog
666 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
668 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
669 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
670 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
671 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
672 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
673 ; GFX10-NEXT: s_waitcnt vmcnt(0)
674 ; GFX10-NEXT: ; return to shader part epilog
676 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
678 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
679 ; GFX11-NEXT: s_waitcnt vmcnt(0)
680 ; GFX11-NEXT: ; return to shader part epilog
681 %zext.offset = zext i32 %voffset to i64
682 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
683 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
684 %load = load i8, ptr addrspace(1) %gep1
685 %zext = zext i8 %load to i32
686 %to.vgpr = bitcast i32 %zext to float
690 ; Maximum positive offset on gfx9 + 1
691 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
692 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
694 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
695 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
696 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
697 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
698 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
699 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
700 ; GFX9-NEXT: s_waitcnt vmcnt(0)
701 ; GFX9-NEXT: ; return to shader part epilog
703 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
705 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
706 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
707 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
708 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
709 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
710 ; GFX10-NEXT: s_waitcnt vmcnt(0)
711 ; GFX10-NEXT: ; return to shader part epilog
713 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
715 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
716 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
717 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
718 ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
719 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
720 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
721 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
722 ; GFX11-NEXT: s_waitcnt vmcnt(0)
723 ; GFX11-NEXT: ; return to shader part epilog
724 %zext.offset = zext i32 %voffset to i64
725 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
726 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4096
727 %load = load i8, ptr addrspace(1) %gep1
728 %zext = zext i8 %load to i32
729 %to.vgpr = bitcast i32 %zext to float
733 ; Maximum negative offset on gfx9
734 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
735 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
737 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: ; return to shader part epilog
741 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
743 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
744 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
745 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
746 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
747 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
748 ; GFX10-NEXT: s_waitcnt vmcnt(0)
749 ; GFX10-NEXT: ; return to shader part epilog
751 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
753 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096
754 ; GFX11-NEXT: s_waitcnt vmcnt(0)
755 ; GFX11-NEXT: ; return to shader part epilog
756 %zext.offset = zext i32 %voffset to i64
757 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
758 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4096
759 %load = load i8, ptr addrspace(1) %gep1
760 %zext = zext i8 %load to i32
761 %to.vgpr = bitcast i32 %zext to float
765 ; Maximum negative offset on gfx9 - 1
766 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrspace(1) inreg %sbase, i32 %voffset) {
767 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
769 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
770 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
771 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
772 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
773 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
774 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
775 ; GFX9-NEXT: s_waitcnt vmcnt(0)
776 ; GFX9-NEXT: ; return to shader part epilog
778 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
780 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
781 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
782 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
783 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
784 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
785 ; GFX10-NEXT: s_waitcnt vmcnt(0)
786 ; GFX10-NEXT: ; return to shader part epilog
788 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
790 ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
792 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
793 ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
794 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
795 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
796 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
797 ; GFX11-NEXT: s_waitcnt vmcnt(0)
798 ; GFX11-NEXT: ; return to shader part epilog
799 %zext.offset = zext i32 %voffset to i64
800 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
801 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4097
802 %load = load i8, ptr addrspace(1) %gep1
803 %zext = zext i8 %load to i32
804 %to.vgpr = bitcast i32 %zext to float
808 ; Maximum positive offset on gfx10
; i8 load at %sbase + zext(%voffset) + 2047. All check prefixes expect the
; SGPR-base (saddr) form, v0 with s[2:3], carrying offset:2047.
809 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset) {
810 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
812 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
813 ; GCN-NEXT: s_waitcnt vmcnt(0)
814 ; GCN-NEXT: ; return to shader part epilog
816 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
818 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047
819 ; GFX11-NEXT: s_waitcnt vmcnt(0)
820 ; GFX11-NEXT: ; return to shader part epilog
821 %zext.offset = zext i32 %voffset to i64
822 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
823 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
824 %load = load i8, ptr addrspace(1) %gep1
825 %zext = zext i8 %load to i32
826 %to.vgpr = bitcast i32 %zext to float
830 ; Maximum positive offset on gfx10 + 1
; i8 load at %sbase + zext(%voffset) + 2048. The GFX9 and GFX11 checks keep the
; saddr form with offset:2048; the GFX10 checks instead expect a 64-bit VGPR
; address with 0x800 added explicitly and no immediate offset on the load.
831 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
832 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
834 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
835 ; GFX9-NEXT: s_waitcnt vmcnt(0)
836 ; GFX9-NEXT: ; return to shader part epilog
838 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
840 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
841 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
842 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
843 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
844 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
845 ; GFX10-NEXT: s_waitcnt vmcnt(0)
846 ; GFX10-NEXT: ; return to shader part epilog
848 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
850 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048
851 ; GFX11-NEXT: s_waitcnt vmcnt(0)
852 ; GFX11-NEXT: ; return to shader part epilog
853 %zext.offset = zext i32 %voffset to i64
854 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
855 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
856 %load = load i8, ptr addrspace(1) %gep1
857 %zext = zext i8 %load to i32
858 %to.vgpr = bitcast i32 %zext to float
862 ; Maximum negative offset on gfx10
; i8 load at %sbase + zext(%voffset) - 2048. All check prefixes expect the
; saddr form with offset:-2048.
863 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
864 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
866 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
867 ; GCN-NEXT: s_waitcnt vmcnt(0)
868 ; GCN-NEXT: ; return to shader part epilog
870 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
872 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048
873 ; GFX11-NEXT: s_waitcnt vmcnt(0)
874 ; GFX11-NEXT: ; return to shader part epilog
875 %zext.offset = zext i32 %voffset to i64
876 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
877 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
878 %load = load i8, ptr addrspace(1) %gep1
879 %zext = zext i8 %load to i32
880 %to.vgpr = bitcast i32 %zext to float
884 ; Maximum negative offset on gfx10 - 1
; i8 load at %sbase + zext(%voffset) - 2049. The GFX9 and GFX11 checks keep the
; saddr form with offset:-2049; the GFX10 checks expect a 64-bit VGPR address
; built by adding 0xfffff800 (with -1 in the high half) plus offset:-1.
885 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrspace(1) inreg %sbase, i32 %voffset) {
886 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
888 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
889 ; GFX9-NEXT: s_waitcnt vmcnt(0)
890 ; GFX9-NEXT: ; return to shader part epilog
892 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
894 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
895 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
896 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0
897 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
898 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
899 ; GFX10-NEXT: s_waitcnt vmcnt(0)
900 ; GFX10-NEXT: ; return to shader part epilog
902 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
904 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049
905 ; GFX11-NEXT: s_waitcnt vmcnt(0)
906 ; GFX11-NEXT: ; return to shader part epilog
907 %zext.offset = zext i32 %voffset to i64
908 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
909 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2049
910 %load = load i8, ptr addrspace(1) %gep1
911 %zext = zext i8 %load to i32
912 %to.vgpr = bitcast i32 %zext to float
916 ; Maximum positive offset on gfx9, and immediate needs to be moved lower.
; The constant 4095 is applied to %sbase by the first GEP and the zero-extended
; VGPR offset by the second. The GFX9 and GFX11 checks still expect the saddr
; form with offset:4095; the GFX10 checks expect a 64-bit VGPR address with
; 0x800 added explicitly and the remaining 2047 as the immediate offset.
917 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) {
918 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
920 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
921 ; GFX9-NEXT: s_waitcnt vmcnt(0)
922 ; GFX9-NEXT: ; return to shader part epilog
924 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
926 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
927 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
928 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
929 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
930 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
931 ; GFX10-NEXT: s_waitcnt vmcnt(0)
932 ; GFX10-NEXT: ; return to shader part epilog
934 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
936 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
937 ; GFX11-NEXT: s_waitcnt vmcnt(0)
938 ; GFX11-NEXT: ; return to shader part epilog
939 %zext.offset = zext i32 %voffset to i64
940 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
941 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %zext.offset
942 %load = load i8, ptr addrspace(1) %gep1
943 %zext = zext i8 %load to i32
944 %to.vgpr = bitcast i32 %zext to float
948 ; pointer addressing done in integers
; The address is formed with ptrtoint / add / inttoptr instead of a GEP
; (%sbase.as.int + %zext.offset). All check prefixes still expect the saddr
; form with no immediate offset.
949 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) inreg %sbase, i32 %voffset) {
950 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
952 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
953 ; GCN-NEXT: s_waitcnt vmcnt(0)
954 ; GCN-NEXT: ; return to shader part epilog
956 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
958 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
959 ; GFX11-NEXT: s_waitcnt vmcnt(0)
960 ; GFX11-NEXT: ; return to shader part epilog
961 %zext.offset = zext i32 %voffset to i64
962 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
963 %add = add i64 %sbase.as.int, %zext.offset
964 %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
965 %load = load i8, ptr addrspace(1) %dirty.gep
966 %zext = zext i8 %load to i32
967 %to.vgpr = bitcast i32 %zext to float
971 ; zext forced to LHS of addressing expression
; Integer address arithmetic with the zero-extended offset as the first add
; operand (%zext.offset + %sbase.as.int). All check prefixes expect the saddr
; form with no immediate offset.
972 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 %voffset) {
973 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
975 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
976 ; GCN-NEXT: s_waitcnt vmcnt(0)
977 ; GCN-NEXT: ; return to shader part epilog
979 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
981 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
982 ; GFX11-NEXT: s_waitcnt vmcnt(0)
983 ; GFX11-NEXT: ; return to shader part epilog
984 %zext.offset = zext i32 %voffset to i64
985 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
986 %add = add i64 %zext.offset, %sbase.as.int
987 %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
988 %load = load i8, ptr addrspace(1) %dirty.gep
989 %zext = zext i8 %load to i32
990 %to.vgpr = bitcast i32 %zext to float
994 ; zext forced to LHS of addressing expression, with immediate offset
; The constant 128 is added last, after %zext.offset + %sbase.as.int. All check
; prefixes expect it to appear as offset:128 on the saddr form.
995 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 %voffset) {
996 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
998 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
999 ; GCN-NEXT: s_waitcnt vmcnt(0)
1000 ; GCN-NEXT: ; return to shader part epilog
1002 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
1004 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
1005 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1006 ; GFX11-NEXT: ; return to shader part epilog
1007 %zext.offset = zext i32 %voffset to i64
1008 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
1009 %add = add i64 %zext.offset, %sbase.as.int
1010 %add.immoffset = add i64 %add, 128
1011 %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
1012 %load = load i8, ptr addrspace(1) %dirty.gep
1013 %zext = zext i8 %load to i32
1014 %to.vgpr = bitcast i32 %zext to float
1018 ; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
; The constant 128 is added to the base first (%sbase.as.int + 128) and the
; zero-extended offset is added afterwards. All check prefixes still expect
; offset:128 on the saddr form.
1019 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1020 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
1022 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
1023 ; GCN-NEXT: s_waitcnt vmcnt(0)
1024 ; GCN-NEXT: ; return to shader part epilog
1026 ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
1028 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
1029 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1030 ; GFX11-NEXT: ; return to shader part epilog
1031 %zext.offset = zext i32 %voffset to i64
1032 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
1033 %add.immoffset = add i64 %sbase.as.int, 128
1034 %add = add i64 %zext.offset, %add.immoffset
1035 %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
1036 %load = load i8, ptr addrspace(1) %dirty.gep
1037 %zext = zext i8 %load to i32
1038 %to.vgpr = bitcast i32 %zext to float
1042 ; --------------------------------------------------------------------------------
1043 ; Uniformity edge cases
1044 ; --------------------------------------------------------------------------------
; addrspace(3) global holding a pointer into addrspace(1). Tests that load the
; base pointer from here read it with a DS load, so it arrives in VGPRs.
; NOTE(review): the initializer is `undef`; `poison` may be preferable if the
; backend accepts it for this address space -- confirm before changing.
1046 @ptr.in.lds = internal addrspace(3) global ptr addrspace(1) undef
1048 ; Base pointer is uniform, but also in VGPRs
; The base is loaded from @ptr.in.lds. Every check prefix expects both halves
; of the loaded pointer to be copied to SGPRs with v_readfirstlane_b32 so the
; saddr form can be used; the GFX9 checks additionally expect an s_nop 4
; between the readfirstlanes and the global load.
1049 define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
1050 ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
1052 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1053 ; GFX9-NEXT: ds_read_b64 v[1:2], v1
1054 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1055 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1056 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
1057 ; GFX9-NEXT: s_nop 4
1058 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
1059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX9-NEXT: ; return to shader part epilog
1062 ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
1064 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1065 ; GFX10-NEXT: ds_read_b64 v[1:2], v1
1066 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1067 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
1068 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
1069 ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
1070 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1071 ; GFX10-NEXT: ; return to shader part epilog
1073 ; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
1075 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1076 ; GFX11-NEXT: ds_load_b64 v[1:2], v1
1077 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1078 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
1079 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2
1080 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
1081 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1082 ; GFX11-NEXT: ; return to shader part epilog
1083 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
1084 %zext.offset = zext i32 %voffset to i64
1085 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1086 %load = load i8, ptr addrspace(1) %gep0
1087 %zext = zext i8 %load to i32
1088 %to.vgpr = bitcast i32 %zext to float
1092 ; Base pointer is uniform, but also in VGPRs, with imm offset
; The base is loaded from @ptr.in.lds and a constant 42 is added after the
; zero-extended VGPR offset. Every check prefix expects the loaded pointer to
; be copied to SGPRs with v_readfirstlane_b32 and the constant to appear as
; offset:42 on the saddr form; the GFX9 checks also expect an s_nop 4.
1093 define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
1094 ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
1096 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1097 ; GFX9-NEXT: ds_read_b64 v[1:2], v1
1098 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1099 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1100 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
1101 ; GFX9-NEXT: s_nop 4
1102 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
1103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1104 ; GFX9-NEXT: ; return to shader part epilog
1106 ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
1108 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1109 ; GFX10-NEXT: ds_read_b64 v[1:2], v1
1110 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1111 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
1112 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
1113 ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
1114 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1115 ; GFX10-NEXT: ; return to shader part epilog
1117 ; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
1119 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1120 ; GFX11-NEXT: ds_load_b64 v[1:2], v1
1121 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1122 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
1123 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2
1124 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42
1125 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1126 ; GFX11-NEXT: ; return to shader part epilog
1127 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
1128 %zext.offset = zext i32 %voffset to i64
1129 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1130 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
1131 %load = load i8, ptr addrspace(1) %gep1
1132 %zext = zext i8 %load to i32
1133 %to.vgpr = bitcast i32 %zext to float
1137 ; Both 64-bit base and 32-bit offset are scalar
; Both %sbase and %soffset are inreg arguments. All check prefixes expect the
; SGPR offset (s4) to be copied into v0 so that the saddr form is still used.
1138 define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
1139 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
1141 ; GCN-NEXT: v_mov_b32_e32 v0, s4
1142 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
1143 ; GCN-NEXT: s_waitcnt vmcnt(0)
1144 ; GCN-NEXT: ; return to shader part epilog
1146 ; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset:
1148 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
1149 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1150 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1151 ; GFX11-NEXT: ; return to shader part epilog
1152 %zext.offset = zext i32 %soffset to i64
1153 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1154 %load = load i8, ptr addrspace(1) %gep0
1155 %zext = zext i8 %load to i32
1156 %to.vgpr = bitcast i32 %zext to float
1160 ; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
; Both %sbase and %soffset are inreg arguments and a constant -24 follows the
; variable offset. All check prefixes expect s4 copied into v0 and the constant
; carried as offset:-24 on the saddr form.
1161 define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
1162 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
1164 ; GCN-NEXT: v_mov_b32_e32 v0, s4
1165 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24
1166 ; GCN-NEXT: s_waitcnt vmcnt(0)
1167 ; GCN-NEXT: ; return to shader part epilog
1169 ; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
1171 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
1172 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24
1173 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1174 ; GFX11-NEXT: ; return to shader part epilog
1175 %zext.offset = zext i32 %soffset to i64
1176 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1177 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
1178 %load = load i8, ptr addrspace(1) %gep1
1179 %zext = zext i8 %load to i32
1180 %to.vgpr = bitcast i32 %zext to float
1184 ; Both components uniform, zext forced to LHS of addressing expression
; Integer address arithmetic (%zext.offset + %sbase.as.int) with both inputs
; passed inreg. All check prefixes expect s4 copied into v0 and the saddr form
; with no immediate offset.
1185 define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
1186 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
1188 ; GCN-NEXT: v_mov_b32_e32 v0, s4
1189 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
1190 ; GCN-NEXT: s_waitcnt vmcnt(0)
1191 ; GCN-NEXT: ; return to shader part epilog
1193 ; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
1195 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
1196 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1197 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX11-NEXT: ; return to shader part epilog
1199 %zext.offset = zext i32 %soffset to i64
1200 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
1201 %add = add i64 %zext.offset, %sbase.as.int
1202 %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
1203 %load = load i8, ptr addrspace(1) %dirty.gep
1204 %zext = zext i8 %load to i32
1205 %to.vgpr = bitcast i32 %zext to float
1209 ; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
; Integer address arithmetic with both inputs passed inreg, followed by a
; constant add of 128. All check prefixes expect s4 copied into v0 and the
; constant carried as offset:128 on the saddr form.
1210 define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
1211 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
1213 ; GCN-NEXT: v_mov_b32_e32 v0, s4
1214 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
1215 ; GCN-NEXT: s_waitcnt vmcnt(0)
1216 ; GCN-NEXT: ; return to shader part epilog
1218 ; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
1220 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
1221 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
1222 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1223 ; GFX11-NEXT: ; return to shader part epilog
1224 %zext.offset = zext i32 %soffset to i64
1225 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
1226 %add = add i64 %zext.offset, %sbase.as.int
1227 %add.immoffset = add i64 %add, 128
1228 %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
1229 %load = load i8, ptr addrspace(1) %dirty.gep
1230 %zext = zext i8 %load to i32
1231 %to.vgpr = bitcast i32 %zext to float
1235 ; divergent 64-bit base, 32-bit scalar offset.
; %vbase is a non-inreg pointer argument and %soffset is inreg. No saddr form
; is expected: every check prefix adds s2 into the v[0:1] pair with a carry
; and loads through v[0:1], off.
1236 define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i32 inreg %soffset) {
1237 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
1239 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
1240 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1241 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
1242 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1243 ; GFX9-NEXT: ; return to shader part epilog
1245 ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
1247 ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
1248 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
1249 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
1250 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1251 ; GFX10-NEXT: ; return to shader part epilog
1253 ; GFX11-LABEL: global_load_i8_vgpr64_sgpr32:
1255 ; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
1256 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
1257 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
1258 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1259 ; GFX11-NEXT: ; return to shader part epilog
1260 %zext.offset = zext i32 %soffset to i64
1261 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
1262 %load = load i8, ptr addrspace(1) %gep0
1263 %zext = zext i8 %load to i32
1264 %to.vgpr = bitcast i32 %zext to float
1268 ; divergent 64-bit base, 32-bit scalar offset, with imm offset
; %vbase is a non-inreg pointer argument, %soffset is inreg, and a constant
; 4095 follows. Every check prefix loads through v[0:1], off. The GFX9 and
; GFX11 checks carry the constant as offset:4095; the GFX10 checks add 0x800
; explicitly and carry the remaining 2047 as the immediate offset.
1269 define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1) %vbase, i32 inreg %soffset) {
1270 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
1272 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
1273 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1274 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
1275 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1276 ; GFX9-NEXT: ; return to shader part epilog
1278 ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
1280 ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
1281 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
1282 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
1283 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
1284 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
1285 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1286 ; GFX10-NEXT: ; return to shader part epilog
1288 ; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
1290 ; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
1291 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
1292 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
1293 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1294 ; GFX11-NEXT: ; return to shader part epilog
1295 %zext.offset = zext i32 %soffset to i64
1296 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
1297 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
1298 %load = load i8, ptr addrspace(1) %gep1
1299 %zext = zext i8 %load to i32
1300 %to.vgpr = bitcast i32 %zext to float
1304 ; --------------------------------------------------------------------------------
1305 ; Natural addressing shifts with restricted range
1306 ; --------------------------------------------------------------------------------
1308 ; Cannot push the shift into 32-bits, and cannot match.
; float-typed GEP indexed by a zero-extended i32 that is itself loaded from
; memory with no !range metadata. Every check prefix expects a 64-bit shift
; (v_lshlrev_b64 by 2), a 64-bit add of s[2:3], and a load through v[0:1], off
; rather than the saddr form.
1309 define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
1310 ; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
1312 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1313 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1314 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
1315 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1316 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1317 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
1318 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1319 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1321 ; GFX9-NEXT: ; return to shader part epilog
1323 ; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
1325 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1326 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1327 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1329 ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
1330 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1331 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1332 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1333 ; GFX10-NEXT: ; return to shader part epilog
1335 ; GFX11-LABEL: global_load_saddr_f32_natural_addressing:
1337 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1338 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1339 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1340 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1341 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1342 ; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
1343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1344 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1345 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1346 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1347 ; GFX11-NEXT: ; return to shader part epilog
1348 %voffset = load i32, ptr addrspace(1) %voffset.ptr
1349 %zext.offset = zext i32 %voffset to i64
1350 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
1351 %load = load float, ptr addrspace(1) %gep
1355 ; Cannot push the shift into 32-bits, with an immediate offset.
; NOTE(review): despite the wording above, both GEPs in this function are byte
; (i8) indexed, so the IR contains no scaling shift. The offset is a loaded,
; zero-extended i32 followed by a constant 128, and every check prefix expects
; the saddr form with offset:128.
1356 define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
1357 ; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
1359 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1360 ; GCN-NEXT: s_waitcnt vmcnt(0)
1361 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128
1362 ; GCN-NEXT: s_waitcnt vmcnt(0)
1363 ; GCN-NEXT: ; return to shader part epilog
1365 ; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
1367 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1368 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1369 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128
1370 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1371 ; GFX11-NEXT: ; return to shader part epilog
1372 %voffset = load i32, ptr addrspace(1) %voffset.ptr
1373 %zext.offset = zext i32 %voffset to i64
1374 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1375 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 128
1376 %load = load float, ptr addrspace(1) %gep1
1380 ; Range is sufficiently restricted to push the shift into 32-bits.
; The loaded index carries !range !0 and !noundef and feeds a float-typed GEP.
; Every check prefix expects a 32-bit shift (v_lshlrev_b32 by 2) followed by
; the saddr form with no immediate offset.
1381 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
1382 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
1384 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1385 ; GCN-NEXT: s_waitcnt vmcnt(0)
1386 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1387 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1388 ; GCN-NEXT: s_waitcnt vmcnt(0)
1389 ; GCN-NEXT: ; return to shader part epilog
1391 ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range:
1393 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1394 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1395 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1396 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1397 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1398 ; GFX11-NEXT: ; return to shader part epilog
1399 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
1400 %zext.offset = zext i32 %voffset to i64
1401 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
1402 %load = load float, ptr addrspace(1) %gep
1406 ; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
; The loaded index carries !range !0 and !noundef; a second float-typed GEP
; then steps 100 elements. Every check prefix expects a 32-bit shift by 2 and
; the saddr form with offset:400 (100 floats of 4 bytes each).
1407 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
1408 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
1410 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1411 ; GCN-NEXT: s_waitcnt vmcnt(0)
1412 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1413 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400
1414 ; GCN-NEXT: s_waitcnt vmcnt(0)
1415 ; GCN-NEXT: ; return to shader part epilog
1417 ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
1419 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1420 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1421 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1422 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
1423 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1424 ; GFX11-NEXT: ; return to shader part epilog
1425 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
1426 %zext.offset = zext i32 %voffset to i64
1427 %gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
1428 %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i64 100
1429 %load = load float, ptr addrspace(1) %gep1
1433 ; Range is 1 beyond the limit where we can move the shift into 32-bits.
; The loaded index carries !range !1 and !noundef and feeds a float-typed GEP.
; Every check prefix expects a 64-bit shift (v_lshlrev_b64 by 2), a 64-bit add
; of s[2:3], and a load through v[0:1], off rather than the saddr form.
1434 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
1435 ; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1437 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1438 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1439 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
1440 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1441 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1442 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
1443 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1444 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1445 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1446 ; GFX9-NEXT: ; return to shader part epilog
1448 ; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1450 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1451 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1452 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1453 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1454 ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
1455 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1456 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1457 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1458 ; GFX10-NEXT: ; return to shader part epilog
1460 ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1462 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1463 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1464 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1465 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1466 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1467 ; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
1468 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1469 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1470 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1471 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1472 ; GFX11-NEXT: ; return to shader part epilog
1473 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
1474 %zext.offset = zext i32 %voffset to i64
1475 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
1476 %load = load float, ptr addrspace(1) %gep
1480 ; --------------------------------------------------------------------------------
1481 ; Stress various type loads
1482 ; --------------------------------------------------------------------------------
; i16 load at %sbase + zext(%voffset), bitcast to half. The checks expect
; global_load_ushort (global_load_u16 on GFX11) in the saddr form.
1484 define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1485 ; GCN-LABEL: global_load_saddr_i16:
1487 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
1488 ; GCN-NEXT: s_waitcnt vmcnt(0)
1489 ; GCN-NEXT: ; return to shader part epilog
1491 ; GFX11-LABEL: global_load_saddr_i16:
1493 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1494 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1495 ; GFX11-NEXT: ; return to shader part epilog
1496 %zext.offset = zext i32 %voffset to i64
1497 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1498 %load = load i16, ptr addrspace(1) %gep0
1499 %cast.load = bitcast i16 %load to half
; i16 load at %sbase + zext(%voffset) - 128, bitcast to half. The checks
; expect global_load_ushort (global_load_u16 on GFX11) in the saddr form with
; offset:-128.
1503 define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1504 ; GCN-LABEL: global_load_saddr_i16_immneg128:
1506 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
1507 ; GCN-NEXT: s_waitcnt vmcnt(0)
1508 ; GCN-NEXT: ; return to shader part epilog
1510 ; GFX11-LABEL: global_load_saddr_i16_immneg128:
1512 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
1513 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1514 ; GFX11-NEXT: ; return to shader part epilog
1515 %zext.offset = zext i32 %voffset to i64
1516 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1517 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1518 %load = load i16, ptr addrspace(1) %gep1
1519 %cast.load = bitcast i16 %load to half
; half load at %sbase + zext(%voffset). The checks expect global_load_ushort
; (global_load_u16 on GFX11) in the saddr form.
1523 define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1524 ; GCN-LABEL: global_load_saddr_f16:
1526 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
1527 ; GCN-NEXT: s_waitcnt vmcnt(0)
1528 ; GCN-NEXT: ; return to shader part epilog
1530 ; GFX11-LABEL: global_load_saddr_f16:
1532 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1533 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1534 ; GFX11-NEXT: ; return to shader part epilog
1535 %zext.offset = zext i32 %voffset to i64
1536 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1537 %load = load half, ptr addrspace(1) %gep0
; half load at %sbase + zext(%voffset) - 128. The checks expect
; global_load_ushort (global_load_u16 on GFX11) in the saddr form with
; offset:-128.
1541 define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1542 ; GCN-LABEL: global_load_saddr_f16_immneg128:
1544 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
1545 ; GCN-NEXT: s_waitcnt vmcnt(0)
1546 ; GCN-NEXT: ; return to shader part epilog
1548 ; GFX11-LABEL: global_load_saddr_f16_immneg128:
1550 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
1551 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1552 ; GFX11-NEXT: ; return to shader part epilog
1553 %zext.offset = zext i32 %voffset to i64
1554 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1555 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1556 %load = load half, ptr addrspace(1) %gep1
; i32 load at %sbase + zext(%voffset), bitcast to float for the return. The
; checks expect global_load_dword (global_load_b32 on GFX11) in the saddr form.
1560 define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1561 ; GCN-LABEL: global_load_saddr_i32:
1563 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1564 ; GCN-NEXT: s_waitcnt vmcnt(0)
1565 ; GCN-NEXT: ; return to shader part epilog
1567 ; GFX11-LABEL: global_load_saddr_i32:
1569 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1570 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1571 ; GFX11-NEXT: ; return to shader part epilog
1572 %zext.offset = zext i32 %voffset to i64
1573 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1574 %load = load i32, ptr addrspace(1) %gep0
1575 %cast.load = bitcast i32 %load to float
1576 ret float %cast.load
; Loads an i32 from %sbase + zext(%voffset) - 128 and returns its bits as a
; float. The constant -128 byte GEP is expected to appear as the instruction's
; immediate (offset:-128) on both the GCN and GFX11 check prefixes.
1579 define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1580 ; GCN-LABEL: global_load_saddr_i32_immneg128:
1582 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1583 ; GCN-NEXT: s_waitcnt vmcnt(0)
1584 ; GCN-NEXT: ; return to shader part epilog
1586 ; GFX11-LABEL: global_load_saddr_i32_immneg128:
1588 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1589 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1590 ; GFX11-NEXT: ; return to shader part epilog
1591 %zext.offset = zext i32 %voffset to i64
1592 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1593 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1594 %load = load i32, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the float return type.
1595 %cast.load = bitcast i32 %load to float
1596 ret float %cast.load
; Loads a float from %sbase + zext(%voffset).
; Expected codegen is a single saddr-form load taking v0 and s[2:3] directly;
; the GCN prefix expects global_load_dword, GFX11 expects global_load_b32.
1599 define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1600 ; GCN-LABEL: global_load_saddr_f32:
1602 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1603 ; GCN-NEXT: s_waitcnt vmcnt(0)
1604 ; GCN-NEXT: ; return to shader part epilog
1606 ; GFX11-LABEL: global_load_saddr_f32:
1608 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1610 ; GFX11-NEXT: ; return to shader part epilog
1611 %zext.offset = zext i32 %voffset to i64
1612 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1613 %load = load float, ptr addrspace(1) %gep0
; Loads a float from %sbase + zext(%voffset) - 128.
; The constant -128 byte GEP is expected to appear as the instruction's
; immediate (offset:-128) on both the GCN and GFX11 check prefixes.
1617 define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1618 ; GCN-LABEL: global_load_saddr_f32_immneg128:
1620 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1621 ; GCN-NEXT: s_waitcnt vmcnt(0)
1622 ; GCN-NEXT: ; return to shader part epilog
1624 ; GFX11-LABEL: global_load_saddr_f32_immneg128:
1626 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1627 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1628 ; GFX11-NEXT: ; return to shader part epilog
1629 %zext.offset = zext i32 %voffset to i64
1630 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1631 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1632 %load = load float, ptr addrspace(1) %gep1
; Loads a <2 x i16> from %sbase + zext(%voffset) and returns its bits as
; <2 x half>. The 32-bit vector is expected to be fetched with one saddr-form
; load: global_load_dword for the GCN prefix, global_load_b32 for GFX11.
1636 define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1637 ; GCN-LABEL: global_load_saddr_v2i16:
1639 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1640 ; GCN-NEXT: s_waitcnt vmcnt(0)
1641 ; GCN-NEXT: ; return to shader part epilog
1643 ; GFX11-LABEL: global_load_saddr_v2i16:
1645 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1646 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1647 ; GFX11-NEXT: ; return to shader part epilog
1648 %zext.offset = zext i32 %voffset to i64
1649 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1650 %load = load <2 x i16>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x half> return type.
1651 %cast.load = bitcast <2 x i16> %load to <2 x half>
1652 ret <2 x half> %cast.load
; Loads a <2 x i16> from %sbase + zext(%voffset) - 128 and returns its bits
; as <2 x half>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1655 define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1656 ; GCN-LABEL: global_load_saddr_v2i16_immneg128:
1658 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1659 ; GCN-NEXT: s_waitcnt vmcnt(0)
1660 ; GCN-NEXT: ; return to shader part epilog
1662 ; GFX11-LABEL: global_load_saddr_v2i16_immneg128:
1664 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1665 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1666 ; GFX11-NEXT: ; return to shader part epilog
1667 %zext.offset = zext i32 %voffset to i64
1668 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1669 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1670 %load = load <2 x i16>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x half> return type.
1671 %cast.load = bitcast <2 x i16> %load to <2 x half>
1672 ret <2 x half> %cast.load
; Loads a <2 x half> from %sbase + zext(%voffset) and returns it unchanged.
; Expected codegen is a single saddr-form load taking v0 and s[2:3] directly;
; the GCN prefix expects global_load_dword, GFX11 expects global_load_b32.
1675 define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1676 ; GCN-LABEL: global_load_saddr_v2f16:
1678 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1679 ; GCN-NEXT: s_waitcnt vmcnt(0)
1680 ; GCN-NEXT: ; return to shader part epilog
1682 ; GFX11-LABEL: global_load_saddr_v2f16:
1684 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1685 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1686 ; GFX11-NEXT: ; return to shader part epilog
1687 %zext.offset = zext i32 %voffset to i64
1688 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1689 %load = load <2 x half>, ptr addrspace(1) %gep0
1690 ret <2 x half> %load
; Loads a <2 x half> from %sbase + zext(%voffset) - 128 and returns it
; unchanged. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1693 define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1694 ; GCN-LABEL: global_load_saddr_v2f16_immneg128:
1696 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1697 ; GCN-NEXT: s_waitcnt vmcnt(0)
1698 ; GCN-NEXT: ; return to shader part epilog
1700 ; GFX11-LABEL: global_load_saddr_v2f16_immneg128:
1702 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1703 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1704 ; GFX11-NEXT: ; return to shader part epilog
1705 %zext.offset = zext i32 %voffset to i64
1706 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1707 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1708 %load = load <2 x half>, ptr addrspace(1) %gep1
1709 ret <2 x half> %load
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) and returns its
; 32-bit value reinterpreted as <2 x half>.
; The pointer load is expected to use the same single saddr-form instruction
; as a 32-bit integer load: global_load_dword (GCN) / global_load_b32 (GFX11).
1712 define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1713 ; GCN-LABEL: global_load_saddr_p3:
1715 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1716 ; GCN-NEXT: s_waitcnt vmcnt(0)
1717 ; GCN-NEXT: ; return to shader part epilog
1719 ; GFX11-LABEL: global_load_saddr_p3:
1721 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1722 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1723 ; GFX11-NEXT: ; return to shader part epilog
1724 %zext.offset = zext i32 %voffset to i64
1725 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1726 %load = load ptr addrspace(3), ptr addrspace(1) %gep0
; A pointer cannot be bitcast to a vector, so go through i32 first.
1727 %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
1728 %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1729 ret <2 x half> %cast.load1
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) - 128 and
; returns its 32-bit value reinterpreted as <2 x half>.
; The constant -128 byte GEP is expected to appear as the instruction's
; immediate (offset:-128) on both check prefixes.
1732 define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1733 ; GCN-LABEL: global_load_saddr_p3_immneg128:
1735 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1736 ; GCN-NEXT: s_waitcnt vmcnt(0)
1737 ; GCN-NEXT: ; return to shader part epilog
1739 ; GFX11-LABEL: global_load_saddr_p3_immneg128:
1741 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1742 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1743 ; GFX11-NEXT: ; return to shader part epilog
1744 %zext.offset = zext i32 %voffset to i64
1745 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1746 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1747 %load = load ptr addrspace(3), ptr addrspace(1) %gep1
; A pointer cannot be bitcast to a vector, so go through i32 first.
1748 %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
1749 %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1750 ret <2 x half> %cast.load1
; Loads a double from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The 64-bit value is expected to be fetched into v[0:1] with one
; saddr-form load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1753 define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1754 ; GCN-LABEL: global_load_saddr_f64:
1756 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1757 ; GCN-NEXT: s_waitcnt vmcnt(0)
1758 ; GCN-NEXT: ; return to shader part epilog
1760 ; GFX11-LABEL: global_load_saddr_f64:
1762 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1763 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1764 ; GFX11-NEXT: ; return to shader part epilog
1765 %zext.offset = zext i32 %voffset to i64
1766 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1767 %load = load double, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x float> return type.
1768 %cast.load = bitcast double %load to <2 x float>
1769 ret <2 x float> %cast.load
; Loads a double from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1772 define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1773 ; GCN-LABEL: global_load_saddr_f64_immneg128:
1775 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1776 ; GCN-NEXT: s_waitcnt vmcnt(0)
1777 ; GCN-NEXT: ; return to shader part epilog
1779 ; GFX11-LABEL: global_load_saddr_f64_immneg128:
1781 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1782 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1783 ; GFX11-NEXT: ; return to shader part epilog
1784 %zext.offset = zext i32 %voffset to i64
1785 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1786 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1787 %load = load double, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x float> return type.
1788 %cast.load = bitcast double %load to <2 x float>
1789 ret <2 x float> %cast.load
; Loads an i64 from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The 64-bit value is expected to be fetched into v[0:1] with one
; saddr-form load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1792 define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1793 ; GCN-LABEL: global_load_saddr_i64:
1795 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1796 ; GCN-NEXT: s_waitcnt vmcnt(0)
1797 ; GCN-NEXT: ; return to shader part epilog
1799 ; GFX11-LABEL: global_load_saddr_i64:
1801 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1802 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1803 ; GFX11-NEXT: ; return to shader part epilog
1804 %zext.offset = zext i32 %voffset to i64
1805 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1806 %load = load i64, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x float> return type.
1807 %cast.load = bitcast i64 %load to <2 x float>
1808 ret <2 x float> %cast.load
; Loads an i64 from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1811 define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1812 ; GCN-LABEL: global_load_saddr_i64_immneg128:
1814 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1815 ; GCN-NEXT: s_waitcnt vmcnt(0)
1816 ; GCN-NEXT: ; return to shader part epilog
1818 ; GFX11-LABEL: global_load_saddr_i64_immneg128:
1820 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1821 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1822 ; GFX11-NEXT: ; return to shader part epilog
1823 %zext.offset = zext i32 %voffset to i64
1824 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1825 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1826 %load = load i64, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x float> return type.
1827 %cast.load = bitcast i64 %load to <2 x float>
1828 ret <2 x float> %cast.load
; Loads a <2 x float> from %sbase + zext(%voffset) and returns it unchanged.
; The 64-bit vector is expected to be fetched into v[0:1] with one saddr-form
; load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1831 define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1832 ; GCN-LABEL: global_load_saddr_v2f32:
1834 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1835 ; GCN-NEXT: s_waitcnt vmcnt(0)
1836 ; GCN-NEXT: ; return to shader part epilog
1838 ; GFX11-LABEL: global_load_saddr_v2f32:
1840 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1841 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1842 ; GFX11-NEXT: ; return to shader part epilog
1843 %zext.offset = zext i32 %voffset to i64
1844 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1845 %load = load <2 x float>, ptr addrspace(1) %gep0
1846 ret <2 x float> %load
; Loads a <2 x float> from %sbase + zext(%voffset) - 128 and returns it
; unchanged. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1849 define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1850 ; GCN-LABEL: global_load_saddr_v2f32_immneg128:
1852 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1853 ; GCN-NEXT: s_waitcnt vmcnt(0)
1854 ; GCN-NEXT: ; return to shader part epilog
1856 ; GFX11-LABEL: global_load_saddr_v2f32_immneg128:
1858 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1859 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1860 ; GFX11-NEXT: ; return to shader part epilog
1861 %zext.offset = zext i32 %voffset to i64
1862 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1863 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1864 %load = load <2 x float>, ptr addrspace(1) %gep1
1865 ret <2 x float> %load
; Loads a <2 x i32> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The 64-bit vector is expected to be fetched into v[0:1] with
; one saddr-form load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1868 define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1869 ; GCN-LABEL: global_load_saddr_v2i32:
1871 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1872 ; GCN-NEXT: s_waitcnt vmcnt(0)
1873 ; GCN-NEXT: ; return to shader part epilog
1875 ; GFX11-LABEL: global_load_saddr_v2i32:
1877 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1878 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1879 ; GFX11-NEXT: ; return to shader part epilog
1880 %zext.offset = zext i32 %voffset to i64
1881 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1882 %load = load <2 x i32>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x float> return type.
1883 %cast.load = bitcast <2 x i32> %load to <2 x float>
1884 ret <2 x float> %cast.load
; Loads a <2 x i32> from %sbase + zext(%voffset) - 128 and returns its bits
; as <2 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1887 define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1888 ; GCN-LABEL: global_load_saddr_v2i32_immneg128:
1890 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1891 ; GCN-NEXT: s_waitcnt vmcnt(0)
1892 ; GCN-NEXT: ; return to shader part epilog
1894 ; GFX11-LABEL: global_load_saddr_v2i32_immneg128:
1896 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1897 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1898 ; GFX11-NEXT: ; return to shader part epilog
1899 %zext.offset = zext i32 %voffset to i64
1900 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1901 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1902 %load = load <2 x i32>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x float> return type.
1903 %cast.load = bitcast <2 x i32> %load to <2 x float>
1904 ret <2 x float> %cast.load
; Loads a <4 x i16> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The 64-bit vector is expected to be fetched into v[0:1] with
; one saddr-form load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1907 define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1908 ; GCN-LABEL: global_load_saddr_v4i16:
1910 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1911 ; GCN-NEXT: s_waitcnt vmcnt(0)
1912 ; GCN-NEXT: ; return to shader part epilog
1914 ; GFX11-LABEL: global_load_saddr_v4i16:
1916 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1917 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1918 ; GFX11-NEXT: ; return to shader part epilog
1919 %zext.offset = zext i32 %voffset to i64
1920 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1921 %load = load <4 x i16>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x float> return type.
1922 %cast.load = bitcast <4 x i16> %load to <2 x float>
1923 ret <2 x float> %cast.load
; Loads a <4 x i16> from %sbase + zext(%voffset) - 128 and returns its bits
; as <2 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1926 define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1927 ; GCN-LABEL: global_load_saddr_v4i16_immneg128:
1929 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1930 ; GCN-NEXT: s_waitcnt vmcnt(0)
1931 ; GCN-NEXT: ; return to shader part epilog
1933 ; GFX11-LABEL: global_load_saddr_v4i16_immneg128:
1935 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1936 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1937 ; GFX11-NEXT: ; return to shader part epilog
1938 %zext.offset = zext i32 %voffset to i64
1939 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1940 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1941 %load = load <4 x i16>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x float> return type.
1942 %cast.load = bitcast <4 x i16> %load to <2 x float>
1943 ret <2 x float> %cast.load
; Loads a <4 x half> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The 64-bit vector is expected to be fetched into v[0:1] with
; one saddr-form load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1946 define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1947 ; GCN-LABEL: global_load_saddr_v4f16:
1949 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1950 ; GCN-NEXT: s_waitcnt vmcnt(0)
1951 ; GCN-NEXT: ; return to shader part epilog
1953 ; GFX11-LABEL: global_load_saddr_v4f16:
1955 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1956 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1957 ; GFX11-NEXT: ; return to shader part epilog
1958 %zext.offset = zext i32 %voffset to i64
1959 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1960 %load = load <4 x half>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <2 x float> return type.
1961 %cast.load = bitcast <4 x half> %load to <2 x float>
1962 ret <2 x float> %cast.load
; Loads a <4 x half> from %sbase + zext(%voffset) - 128 and returns its bits
; as <2 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
1965 define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1966 ; GCN-LABEL: global_load_saddr_v4f16_immneg128:
1968 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1969 ; GCN-NEXT: s_waitcnt vmcnt(0)
1970 ; GCN-NEXT: ; return to shader part epilog
1972 ; GFX11-LABEL: global_load_saddr_v4f16_immneg128:
1974 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
1975 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1976 ; GFX11-NEXT: ; return to shader part epilog
1977 %zext.offset = zext i32 %voffset to i64
1978 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1979 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1980 %load = load <4 x half>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <2 x float> return type.
1981 %cast.load = bitcast <4 x half> %load to <2 x float>
1982 ret <2 x float> %cast.load
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) and returns its
; 64-bit value reinterpreted as <2 x float>.
; The pointer load is expected to use the same single saddr-form instruction
; as a 64-bit integer load: global_load_dwordx2 (GCN) / global_load_b64 (GFX11).
1985 define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
1986 ; GCN-LABEL: global_load_saddr_p1:
1988 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1989 ; GCN-NEXT: s_waitcnt vmcnt(0)
1990 ; GCN-NEXT: ; return to shader part epilog
1992 ; GFX11-LABEL: global_load_saddr_p1:
1994 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1995 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1996 ; GFX11-NEXT: ; return to shader part epilog
1997 %zext.offset = zext i32 %voffset to i64
1998 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1999 %load = load ptr addrspace(1), ptr addrspace(1) %gep0
; A pointer cannot be bitcast to a vector, so go through i64 first.
2000 %cast.load0 = ptrtoint ptr addrspace(1) %load to i64
2001 %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
2002 ret <2 x float> %cast.load1
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) - 128 and
; returns its 64-bit value reinterpreted as <2 x float>.
; The constant -128 byte GEP is expected to appear as the instruction's
; immediate (offset:-128) on both check prefixes.
2005 define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2006 ; GCN-LABEL: global_load_saddr_p1_immneg128:
2008 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
2009 ; GCN-NEXT: s_waitcnt vmcnt(0)
2010 ; GCN-NEXT: ; return to shader part epilog
2012 ; GFX11-LABEL: global_load_saddr_p1_immneg128:
2014 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
2015 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2016 ; GFX11-NEXT: ; return to shader part epilog
2017 %zext.offset = zext i32 %voffset to i64
2018 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2019 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2020 %load = load ptr addrspace(1), ptr addrspace(1) %gep1
; A pointer cannot be bitcast to a vector, so go through i64 first.
2021 %cast.load0 = ptrtoint ptr addrspace(1) %load to i64
2022 %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
2023 ret <2 x float> %cast.load1
; Loads a <3 x float> from %sbase + zext(%voffset) and returns it unchanged.
; The 96-bit vector is expected to be fetched into v[0:2] with one saddr-form
; load: global_load_dwordx3 (GCN) / global_load_b96 (GFX11).
2026 define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2027 ; GCN-LABEL: global_load_saddr_v3f32:
2029 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
2030 ; GCN-NEXT: s_waitcnt vmcnt(0)
2031 ; GCN-NEXT: ; return to shader part epilog
2033 ; GFX11-LABEL: global_load_saddr_v3f32:
2035 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
2036 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2037 ; GFX11-NEXT: ; return to shader part epilog
2038 %zext.offset = zext i32 %voffset to i64
2039 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2040 %load = load <3 x float>, ptr addrspace(1) %gep0
2041 ret <3 x float> %load
; Loads a <3 x float> from %sbase + zext(%voffset) - 128 and returns it
; unchanged. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2044 define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2045 ; GCN-LABEL: global_load_saddr_v3f32_immneg128:
2047 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
2048 ; GCN-NEXT: s_waitcnt vmcnt(0)
2049 ; GCN-NEXT: ; return to shader part epilog
2051 ; GFX11-LABEL: global_load_saddr_v3f32_immneg128:
2053 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
2054 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2055 ; GFX11-NEXT: ; return to shader part epilog
2056 %zext.offset = zext i32 %voffset to i64
2057 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2058 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2059 %load = load <3 x float>, ptr addrspace(1) %gep1
2060 ret <3 x float> %load
; Loads a <3 x i32> from %sbase + zext(%voffset) and returns its bits as
; <3 x float>. The 96-bit vector is expected to be fetched into v[0:2] with
; one saddr-form load: global_load_dwordx3 (GCN) / global_load_b96 (GFX11).
2063 define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2064 ; GCN-LABEL: global_load_saddr_v3i32:
2066 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
2067 ; GCN-NEXT: s_waitcnt vmcnt(0)
2068 ; GCN-NEXT: ; return to shader part epilog
2070 ; GFX11-LABEL: global_load_saddr_v3i32:
2072 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
2073 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2074 ; GFX11-NEXT: ; return to shader part epilog
2075 %zext.offset = zext i32 %voffset to i64
2076 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2077 %load = load <3 x i32>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <3 x float> return type.
2078 %cast.load = bitcast <3 x i32> %load to <3 x float>
2079 ret <3 x float> %cast.load
; Loads a <3 x i32> from %sbase + zext(%voffset) - 128 and returns its bits
; as <3 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2082 define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2083 ; GCN-LABEL: global_load_saddr_v3i32_immneg128:
2085 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
2086 ; GCN-NEXT: s_waitcnt vmcnt(0)
2087 ; GCN-NEXT: ; return to shader part epilog
2089 ; GFX11-LABEL: global_load_saddr_v3i32_immneg128:
2091 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
2092 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2093 ; GFX11-NEXT: ; return to shader part epilog
2094 %zext.offset = zext i32 %voffset to i64
2095 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2096 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2097 %load = load <3 x i32>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <3 x float> return type.
2098 %cast.load = bitcast <3 x i32> %load to <3 x float>
2099 ret <3 x float> %cast.load
; Loads a <6 x half> from %sbase + zext(%voffset) and returns it unchanged.
; The 96-bit vector is expected to be fetched into v[0:2] with one saddr-form
; load: global_load_dwordx3 (GCN) / global_load_b96 (GFX11).
2102 define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2103 ; GCN-LABEL: global_load_saddr_v6f16:
2105 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
2106 ; GCN-NEXT: s_waitcnt vmcnt(0)
2107 ; GCN-NEXT: ; return to shader part epilog
2109 ; GFX11-LABEL: global_load_saddr_v6f16:
2111 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
2112 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2113 ; GFX11-NEXT: ; return to shader part epilog
2114 %zext.offset = zext i32 %voffset to i64
2115 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2116 %load = load <6 x half>, ptr addrspace(1) %gep0
2117 ret <6 x half> %load
; Loads a <6 x half> from %sbase + zext(%voffset) - 128 and returns it
; unchanged. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2120 define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2121 ; GCN-LABEL: global_load_saddr_v6f16_immneg128:
2123 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
2124 ; GCN-NEXT: s_waitcnt vmcnt(0)
2125 ; GCN-NEXT: ; return to shader part epilog
2127 ; GFX11-LABEL: global_load_saddr_v6f16_immneg128:
2129 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
2130 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2131 ; GFX11-NEXT: ; return to shader part epilog
2132 %zext.offset = zext i32 %voffset to i64
2133 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2134 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2135 %load = load <6 x half>, ptr addrspace(1) %gep1
2136 ret <6 x half> %load
; Loads a <4 x float> from %sbase + zext(%voffset) and returns it unchanged.
; The 128-bit vector is expected to be fetched into v[0:3] with one saddr-form
; load: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2139 define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2140 ; GCN-LABEL: global_load_saddr_v4f32:
2142 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2143 ; GCN-NEXT: s_waitcnt vmcnt(0)
2144 ; GCN-NEXT: ; return to shader part epilog
2146 ; GFX11-LABEL: global_load_saddr_v4f32:
2148 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2149 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2150 ; GFX11-NEXT: ; return to shader part epilog
2151 %zext.offset = zext i32 %voffset to i64
2152 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2153 %load = load <4 x float>, ptr addrspace(1) %gep0
2154 ret <4 x float> %load
; Loads a <4 x float> from %sbase + zext(%voffset) - 128 and returns it
; unchanged. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2157 define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2158 ; GCN-LABEL: global_load_saddr_v4f32_immneg128:
2160 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2161 ; GCN-NEXT: s_waitcnt vmcnt(0)
2162 ; GCN-NEXT: ; return to shader part epilog
2164 ; GFX11-LABEL: global_load_saddr_v4f32_immneg128:
2166 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2167 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2168 ; GFX11-NEXT: ; return to shader part epilog
2169 %zext.offset = zext i32 %voffset to i64
2170 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2171 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2172 %load = load <4 x float>, ptr addrspace(1) %gep1
2173 ret <4 x float> %load
; Loads a <4 x i32> from %sbase + zext(%voffset) and returns its bits as
; <4 x float>. The 128-bit vector is expected to be fetched into v[0:3] with
; one saddr-form load: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2176 define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2177 ; GCN-LABEL: global_load_saddr_v4i32:
2179 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2180 ; GCN-NEXT: s_waitcnt vmcnt(0)
2181 ; GCN-NEXT: ; return to shader part epilog
2183 ; GFX11-LABEL: global_load_saddr_v4i32:
2185 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2186 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2187 ; GFX11-NEXT: ; return to shader part epilog
2188 %zext.offset = zext i32 %voffset to i64
2189 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2190 %load = load <4 x i32>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <4 x float> return type.
2191 %cast.load = bitcast <4 x i32> %load to <4 x float>
2192 ret <4 x float> %cast.load
; Loads a <4 x i32> from %sbase + zext(%voffset) - 128 and returns its bits
; as <4 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2195 define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2196 ; GCN-LABEL: global_load_saddr_v4i32_immneg128:
2198 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2199 ; GCN-NEXT: s_waitcnt vmcnt(0)
2200 ; GCN-NEXT: ; return to shader part epilog
2202 ; GFX11-LABEL: global_load_saddr_v4i32_immneg128:
2204 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2205 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2206 ; GFX11-NEXT: ; return to shader part epilog
2207 %zext.offset = zext i32 %voffset to i64
2208 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2209 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2210 %load = load <4 x i32>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <4 x float> return type.
2211 %cast.load = bitcast <4 x i32> %load to <4 x float>
2212 ret <4 x float> %cast.load
; Loads a <2 x i64> from %sbase + zext(%voffset) and returns its bits as
; <4 x float>. The 128-bit vector is expected to be fetched into v[0:3] with
; one saddr-form load: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2215 define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2216 ; GCN-LABEL: global_load_saddr_v2i64:
2218 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2219 ; GCN-NEXT: s_waitcnt vmcnt(0)
2220 ; GCN-NEXT: ; return to shader part epilog
2222 ; GFX11-LABEL: global_load_saddr_v2i64:
2224 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2225 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2226 ; GFX11-NEXT: ; return to shader part epilog
2227 %zext.offset = zext i32 %voffset to i64
2228 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2229 %load = load <2 x i64>, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <4 x float> return type.
2230 %cast.load = bitcast <2 x i64> %load to <4 x float>
2231 ret <4 x float> %cast.load
; Loads a <2 x i64> from %sbase + zext(%voffset) - 128 and returns its bits
; as <4 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2234 define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2235 ; GCN-LABEL: global_load_saddr_v2i64_immneg128:
2237 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2238 ; GCN-NEXT: s_waitcnt vmcnt(0)
2239 ; GCN-NEXT: ; return to shader part epilog
2241 ; GFX11-LABEL: global_load_saddr_v2i64_immneg128:
2243 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2244 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2245 ; GFX11-NEXT: ; return to shader part epilog
2246 %zext.offset = zext i32 %voffset to i64
2247 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2248 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2249 %load = load <2 x i64>, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <4 x float> return type.
2250 %cast.load = bitcast <2 x i64> %load to <4 x float>
2251 ret <4 x float> %cast.load
; Loads an i128 from %sbase + zext(%voffset) and returns its bits as
; <4 x float>. The 128-bit scalar is expected to be fetched into v[0:3] with
; one saddr-form load: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2254 define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2255 ; GCN-LABEL: global_load_saddr_i128:
2257 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2258 ; GCN-NEXT: s_waitcnt vmcnt(0)
2259 ; GCN-NEXT: ; return to shader part epilog
2261 ; GFX11-LABEL: global_load_saddr_i128:
2263 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2264 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2265 ; GFX11-NEXT: ; return to shader part epilog
2266 %zext.offset = zext i32 %voffset to i64
2267 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2268 %load = load i128, ptr addrspace(1) %gep0
; Reinterpret the loaded bits so they match the <4 x float> return type.
2269 %cast.load = bitcast i128 %load to <4 x float>
2270 ret <4 x float> %cast.load
; Loads an i128 from %sbase + zext(%voffset) - 128 and returns its bits as
; <4 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2273 define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2274 ; GCN-LABEL: global_load_saddr_i128_immneg128:
2276 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2277 ; GCN-NEXT: s_waitcnt vmcnt(0)
2278 ; GCN-NEXT: ; return to shader part epilog
2280 ; GFX11-LABEL: global_load_saddr_i128_immneg128:
2282 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2283 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2284 ; GFX11-NEXT: ; return to shader part epilog
2285 %zext.offset = zext i32 %voffset to i64
2286 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2287 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2288 %load = load i128, ptr addrspace(1) %gep1
; Reinterpret the loaded bits so they match the <4 x float> return type.
2289 %cast.load = bitcast i128 %load to <4 x float>
2290 ret <4 x float> %cast.load
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset)
; and returns the 128 bits reinterpreted as <4 x float>.
; The pointer-vector load is expected to use one saddr-form instruction
; writing v[0:3]: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2293 define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2294 ; GCN-LABEL: global_load_saddr_v2p1:
2296 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2297 ; GCN-NEXT: s_waitcnt vmcnt(0)
2298 ; GCN-NEXT: ; return to shader part epilog
2300 ; GFX11-LABEL: global_load_saddr_v2p1:
2302 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2303 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2304 ; GFX11-NEXT: ; return to shader part epilog
2305 %zext.offset = zext i32 %voffset to i64
2306 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2307 %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0
; Convert the pointers to integers first, then reinterpret as floats.
2308 %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
2309 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
2310 ret <4 x float> %cast.load1
; Loads a vector of two addrspace(1) pointers from
; %sbase + zext(%voffset) - 128 and returns the 128 bits reinterpreted as
; <4 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2313 define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2314 ; GCN-LABEL: global_load_saddr_v2p1_immneg128:
2316 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2317 ; GCN-NEXT: s_waitcnt vmcnt(0)
2318 ; GCN-NEXT: ; return to shader part epilog
2320 ; GFX11-LABEL: global_load_saddr_v2p1_immneg128:
2322 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2323 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2324 ; GFX11-NEXT: ; return to shader part epilog
2325 %zext.offset = zext i32 %voffset to i64
2326 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2327 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2328 %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep1
; Convert the pointers to integers first, then reinterpret as floats.
2329 %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
2330 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
2331 ret <4 x float> %cast.load1
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset)
; and returns the 128 bits reinterpreted as <4 x float>.
; The pointer-vector load is expected to use one saddr-form instruction
; writing v[0:3]: global_load_dwordx4 (GCN) / global_load_b128 (GFX11).
2334 define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2335 ; GCN-LABEL: global_load_saddr_v4p3:
2337 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
2338 ; GCN-NEXT: s_waitcnt vmcnt(0)
2339 ; GCN-NEXT: ; return to shader part epilog
2341 ; GFX11-LABEL: global_load_saddr_v4p3:
2343 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
2344 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2345 ; GFX11-NEXT: ; return to shader part epilog
2346 %zext.offset = zext i32 %voffset to i64
2347 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2348 %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0
; Convert the pointers to integers first, then reinterpret as floats.
2349 %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
2350 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
2351 ret <4 x float> %cast.load1
; Loads a vector of four addrspace(3) pointers from
; %sbase + zext(%voffset) - 128 and returns the 128 bits reinterpreted as
; <4 x float>. The constant -128 byte GEP is expected to appear as the
; instruction's immediate (offset:-128) on both check prefixes.
2354 define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2355 ; GCN-LABEL: global_load_saddr_v4p3_immneg128:
2357 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2358 ; GCN-NEXT: s_waitcnt vmcnt(0)
2359 ; GCN-NEXT: ; return to shader part epilog
2361 ; GFX11-LABEL: global_load_saddr_v4p3_immneg128:
2363 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
2364 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2365 ; GFX11-NEXT: ; return to shader part epilog
2366 %zext.offset = zext i32 %voffset to i64
2367 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2368 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2369 %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep1
; Convert the pointers to integers first, then reinterpret as floats.
2370 %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
2371 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
2372 ret <4 x float> %cast.load1
2375 ; --------------------------------------------------------------------------------
2377 ; --------------------------------------------------------------------------------
; Sign-extending i8 load from %sbase + zext(%voffset); the i32 result is
; returned as float bits.
; The checks expect the extension to be folded into a single signed-byte load
; (global_load_sbyte on gfx9/gfx10, global_load_i8 on gfx11).
2379 define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2380 ; GCN-LABEL: global_sextload_saddr_i8:
2382 ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3]
2383 ; GCN-NEXT: s_waitcnt vmcnt(0)
2384 ; GCN-NEXT: ; return to shader part epilog
2386 ; GFX11-LABEL: global_sextload_saddr_i8:
2388 ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3]
2389 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2390 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2391 %zext.offset = zext i32 %voffset to i64
2392 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2393 %load = load i8, ptr addrspace(1) %gep0
2394 %sextload = sext i8 %load to i32
2395 %cast.load = bitcast i32 %sextload to float
2396 ret float %cast.load
; Sign-extending i8 load from %sbase + zext(%voffset) - 128, returned as float
; bits.
; The checks expect one signed-byte load with the -128 in its immediate offset
; field.
2399 define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2400 ; GCN-LABEL: global_sextload_saddr_i8_immneg128:
2402 ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128
2403 ; GCN-NEXT: s_waitcnt vmcnt(0)
2404 ; GCN-NEXT: ; return to shader part epilog
2406 ; GFX11-LABEL: global_sextload_saddr_i8_immneg128:
2408 ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
2409 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2410 ; GFX11-NEXT: ; return to shader part epilog
2411 %zext.offset = zext i32 %voffset to i64
2412 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2413 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2414 %load = load i8, ptr addrspace(1) %gep1
2415 %sextload = sext i8 %load to i32
2416 %cast.load = bitcast i32 %sextload to float
2417 ret float %cast.load
; Sign-extending i16 load from %sbase + zext(%voffset); the i32 result is
; returned as float bits.
; The checks expect the extension to be folded into a single signed-short load
; (global_load_sshort on gfx9/gfx10, global_load_i16 on gfx11).
2420 define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2421 ; GCN-LABEL: global_sextload_saddr_i16:
2423 ; GCN-NEXT: global_load_sshort v0, v0, s[2:3]
2424 ; GCN-NEXT: s_waitcnt vmcnt(0)
2425 ; GCN-NEXT: ; return to shader part epilog
2427 ; GFX11-LABEL: global_sextload_saddr_i16:
2429 ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3]
2430 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2431 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2432 %zext.offset = zext i32 %voffset to i64
2433 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2434 %load = load i16, ptr addrspace(1) %gep0
2435 %sextload = sext i16 %load to i32
2436 %cast.load = bitcast i32 %sextload to float
2437 ret float %cast.load
; Sign-extending i16 load from %sbase + zext(%voffset) - 128, returned as float
; bits.
; The checks expect one signed-short load with the -128 in its immediate offset
; field.
2440 define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2441 ; GCN-LABEL: global_sextload_saddr_i16_immneg128:
2443 ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128
2444 ; GCN-NEXT: s_waitcnt vmcnt(0)
2445 ; GCN-NEXT: ; return to shader part epilog
2447 ; GFX11-LABEL: global_sextload_saddr_i16_immneg128:
2449 ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128
2450 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2451 ; GFX11-NEXT: ; return to shader part epilog
2452 %zext.offset = zext i32 %voffset to i64
2453 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2454 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2455 %load = load i16, ptr addrspace(1) %gep1
2456 %sextload = sext i16 %load to i32
2457 %cast.load = bitcast i32 %sextload to float
2458 ret float %cast.load
; Zero-extending i8 load from %sbase + zext(%voffset); the i32 result is
; returned as float bits.
; The checks expect the extension to be folded into a single unsigned-byte load
; (global_load_ubyte on gfx9/gfx10, global_load_u8 on gfx11).
2461 define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2462 ; GCN-LABEL: global_zextload_saddr_i8:
2464 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
2465 ; GCN-NEXT: s_waitcnt vmcnt(0)
2466 ; GCN-NEXT: ; return to shader part epilog
2468 ; GFX11-LABEL: global_zextload_saddr_i8:
2470 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
2471 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2472 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2473 %zext.offset = zext i32 %voffset to i64
2474 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2475 %load = load i8, ptr addrspace(1) %gep0
2476 %zextload = zext i8 %load to i32
2477 %cast.load = bitcast i32 %zextload to float
2478 ret float %cast.load
; Zero-extending i8 load from %sbase + zext(%voffset) - 128, returned as float
; bits.
; The checks expect one unsigned-byte load with the -128 in its immediate offset
; field.
2481 define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2482 ; GCN-LABEL: global_zextload_saddr_i8_immneg128:
2484 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128
2485 ; GCN-NEXT: s_waitcnt vmcnt(0)
2486 ; GCN-NEXT: ; return to shader part epilog
2488 ; GFX11-LABEL: global_zextload_saddr_i8_immneg128:
2490 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
2491 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2492 ; GFX11-NEXT: ; return to shader part epilog
2493 %zext.offset = zext i32 %voffset to i64
2494 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2495 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2496 %load = load i8, ptr addrspace(1) %gep1
2497 %zextload = zext i8 %load to i32
2498 %cast.load = bitcast i32 %zextload to float
2499 ret float %cast.load
; Zero-extending i16 load from %sbase + zext(%voffset); the i32 result is
; returned as float bits.
; The checks expect the extension to be folded into a single unsigned-short load
; (global_load_ushort on gfx9/gfx10, global_load_u16 on gfx11).
2502 define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2503 ; GCN-LABEL: global_zextload_saddr_i16:
2505 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
2506 ; GCN-NEXT: s_waitcnt vmcnt(0)
2507 ; GCN-NEXT: ; return to shader part epilog
2509 ; GFX11-LABEL: global_zextload_saddr_i16:
2511 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
2512 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2513 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2514 %zext.offset = zext i32 %voffset to i64
2515 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2516 %load = load i16, ptr addrspace(1) %gep0
2517 %zextload = zext i16 %load to i32
2518 %cast.load = bitcast i32 %zextload to float
2519 ret float %cast.load
; Zero-extending i16 load from %sbase + zext(%voffset) - 128, returned as float
; bits.
; The checks expect one unsigned-short load with the -128 in its immediate
; offset field.
2522 define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2523 ; GCN-LABEL: global_zextload_saddr_i16_immneg128:
2525 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
2526 ; GCN-NEXT: s_waitcnt vmcnt(0)
2527 ; GCN-NEXT: ; return to shader part epilog
2529 ; GFX11-LABEL: global_zextload_saddr_i16_immneg128:
2531 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
2532 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2533 ; GFX11-NEXT: ; return to shader part epilog
2534 %zext.offset = zext i32 %voffset to i64
2535 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2536 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2537 %load = load i16, ptr addrspace(1) %gep1
2538 %zextload = zext i16 %load to i32
2539 %cast.load = bitcast i32 %zextload to float
2540 ret float %cast.load
2543 ; --------------------------------------------------------------------------------
2545 ; --------------------------------------------------------------------------------
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset), returned as
; float bits.
; The checks expect the load to keep the v0 + s[2:3] address operands, to carry
; glc (glc dlc on gfx10), to be preceded by outstanding-memory waits, and to be
; followed by a cache invalidate (buffer_wbinvl1 on gfx9; buffer_gl0_inv plus
; buffer_gl1_inv on gfx10 and gfx11).
2547 define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2548 ; GFX9-LABEL: atomic_global_load_saddr_i32:
2550 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2551 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
2552 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2553 ; GFX9-NEXT: buffer_wbinvl1
2554 ; GFX9-NEXT: ; return to shader part epilog
2556 ; GFX10-LABEL: atomic_global_load_saddr_i32:
2558 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2559 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2560 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
2561 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2562 ; GFX10-NEXT: buffer_gl0_inv
2563 ; GFX10-NEXT: buffer_gl1_inv
2564 ; GFX10-NEXT: ; return to shader part epilog
2566 ; GFX11-LABEL: atomic_global_load_saddr_i32:
2568 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2569 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2570 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc
2571 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2572 ; GFX11-NEXT: buffer_gl0_inv
2573 ; GFX11-NEXT: buffer_gl1_inv
2574 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2575 %zext.offset = zext i32 %voffset to i64
2576 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2577 %load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4
2578 %cast.load = bitcast i32 %load to float
2579 ret float %cast.load
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset) - 128, returned
; as float bits.
; The checks expect the same wait / load / invalidate sequence as the
; zero-displacement atomic i32 test, with offset:-128 on the load itself.
2582 define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2583 ; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
2585 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2586 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc
2587 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2588 ; GFX9-NEXT: buffer_wbinvl1
2589 ; GFX9-NEXT: ; return to shader part epilog
2591 ; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
2593 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2594 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2595 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
2596 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2597 ; GFX10-NEXT: buffer_gl0_inv
2598 ; GFX10-NEXT: buffer_gl1_inv
2599 ; GFX10-NEXT: ; return to shader part epilog
2601 ; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128:
2603 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2604 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2605 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 glc
2606 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2607 ; GFX11-NEXT: buffer_gl0_inv
2608 ; GFX11-NEXT: buffer_gl1_inv
2609 ; GFX11-NEXT: ; return to shader part epilog
2610 %zext.offset = zext i32 %voffset to i64
2611 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2612 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2613 %load = load atomic i32, ptr addrspace(1) %gep1 seq_cst, align 4
2614 %cast.load = bitcast i32 %load to float
2615 ret float %cast.load
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset), returned as
; <2 x float>.
; The checks expect a 64-bit load into v[0:1] that keeps the v0 + s[2:3] address
; operands, carries glc (glc dlc on gfx10), is preceded by outstanding-memory
; waits, and is followed by a cache invalidate (buffer_wbinvl1 on gfx9;
; buffer_gl0_inv plus buffer_gl1_inv on gfx10 and gfx11).
2618 define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2619 ; GFX9-LABEL: atomic_global_load_saddr_i64:
2621 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2622 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc
2623 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2624 ; GFX9-NEXT: buffer_wbinvl1
2625 ; GFX9-NEXT: ; return to shader part epilog
2627 ; GFX10-LABEL: atomic_global_load_saddr_i64:
2629 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2630 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2631 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
2632 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2633 ; GFX10-NEXT: buffer_gl0_inv
2634 ; GFX10-NEXT: buffer_gl1_inv
2635 ; GFX10-NEXT: ; return to shader part epilog
2637 ; GFX11-LABEL: atomic_global_load_saddr_i64:
2639 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2640 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2641 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc
2642 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2643 ; GFX11-NEXT: buffer_gl0_inv
2644 ; GFX11-NEXT: buffer_gl1_inv
2645 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2646 %zext.offset = zext i32 %voffset to i64
2647 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2648 %load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
2649 %cast.load = bitcast i64 %load to <2 x float>
2650 ret <2 x float> %cast.load
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset) - 128, returned
; as <2 x float>.
; The checks expect the same wait / load / invalidate sequence as the
; zero-displacement atomic i64 test, with offset:-128 on the load itself.
2653 define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2654 ; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
2656 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2657 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
2658 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2659 ; GFX9-NEXT: buffer_wbinvl1
2660 ; GFX9-NEXT: ; return to shader part epilog
2662 ; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
2664 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2665 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2666 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
2667 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2668 ; GFX10-NEXT: buffer_gl0_inv
2669 ; GFX10-NEXT: buffer_gl1_inv
2670 ; GFX10-NEXT: ; return to shader part epilog
2672 ; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
2674 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2675 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2676 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
2677 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2678 ; GFX11-NEXT: buffer_gl0_inv
2679 ; GFX11-NEXT: buffer_gl1_inv
2680 ; GFX11-NEXT: ; return to shader part epilog
2681 %zext.offset = zext i32 %voffset to i64
2682 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2683 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2684 %load = load atomic i64, ptr addrspace(1) %gep1 seq_cst, align 8
2685 %cast.load = bitcast i64 %load to <2 x float>
2686 ret <2 x float> %cast.load
2689 ; --------------------------------------------------------------------------------
2691 ; --------------------------------------------------------------------------------
; i16 load from %sbase + zext(%voffset) inserted into element 0 of a <2 x i16>
; whose other element is left unspecified; the vector is returned as <2 x half>.
; The unspecified element is written as poison rather than undef, since undef is
; being phased out of LLVM tests. The function name keeps its historical
; "undef" spelling so the check labels stay stable.
; The checks expect a single d16 load straight into v0, with no setup or copy.
2693 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2694 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2696 ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3]
2697 ; GCN-NEXT: s_waitcnt vmcnt(0)
2698 ; GCN-NEXT: ; return to shader part epilog
2700 ; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2702 ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3]
2703 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2704 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2705 %zext.offset = zext i32 %voffset to i64
2706 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2707 %load = load i16, ptr addrspace(1) %gep0
2708 %build = insertelement <2 x i16> poison, i16 %load, i32 0
2709 %cast = bitcast <2 x i16> %build to <2 x half>
2710 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 inserted into element 0 of a
; <2 x i16> whose other element is left unspecified; returned as <2 x half>.
; The unspecified element is written as poison rather than undef, since undef is
; being phased out of LLVM tests. The function name keeps its historical
; "undef" spelling so the check labels stay stable.
; The checks expect a single d16 load straight into v0 with offset:-128.
2713 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2714 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2716 ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128
2717 ; GCN-NEXT: s_waitcnt vmcnt(0)
2718 ; GCN-NEXT: ; return to shader part epilog
2720 ; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2722 ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
2723 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2724 ; GFX11-NEXT: ; return to shader part epilog
2725 %zext.offset = zext i32 %voffset to i64
2726 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2727 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2728 %load = load i16, ptr addrspace(1) %gep1
2729 %build = insertelement <2 x i16> poison, i16 %load, i32 0
2730 %cast = bitcast <2 x i16> %build to <2 x half>
2731 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) inserted into element 0 of an all-zero
; <2 x i16>; the vector is returned as <2 x half>.
; The checks expect v1 to be zeroed first, the d16 load to write into v1, and
; the result to be copied to v0 after the wait.
2734 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2735 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2737 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2738 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
2739 ; GCN-NEXT: s_waitcnt vmcnt(0)
2740 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2741 ; GCN-NEXT: ; return to shader part epilog
2743 ; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2745 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2746 ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
2747 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2748 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2749 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2750 %zext.offset = zext i32 %voffset to i64
2751 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2752 %load = load i16, ptr addrspace(1) %gep0
2753 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2754 %cast = bitcast <2 x i16> %build to <2 x half>
2755 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 inserted into element 0 of an
; all-zero <2 x i16>; the vector is returned as <2 x half>.
; The checks expect v1 to be zeroed, a d16 load into v1 with offset:-128, and a
; copy of v1 to v0 after the wait.
2758 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2759 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2761 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2762 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
2763 ; GCN-NEXT: s_waitcnt vmcnt(0)
2764 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2765 ; GCN-NEXT: ; return to shader part epilog
2767 ; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2769 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2770 ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
2771 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2772 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2773 ; GFX11-NEXT: ; return to shader part epilog
2774 %zext.offset = zext i32 %voffset to i64
2775 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2776 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2777 %load = load i16, ptr addrspace(1) %gep1
2778 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2779 %cast = bitcast <2 x i16> %build to <2 x half>
2780 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) replaces element 0 of the incoming
; <2 x i16> %reg; element 1 of %reg is kept. Returned as <2 x half>.
; The checks expect the d16 load to write into v1 and the result to be copied
; to v0 after the wait.
2783 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2784 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2786 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
2787 ; GCN-NEXT: s_waitcnt vmcnt(0)
2788 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2789 ; GCN-NEXT: ; return to shader part epilog
2791 ; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2793 ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
2794 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2795 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2796 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2797 %zext.offset = zext i32 %voffset to i64
2798 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2799 %load = load i16, ptr addrspace(1) %gep0
2800 %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2801 %cast = bitcast <2 x i16> %build to <2 x half>
2802 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 replaces element 0 of the incoming
; <2 x i16> %reg; element 1 of %reg is kept. Returned as <2 x half>.
; The checks expect a d16 load into v1 with offset:-128, then a copy of v1 to
; v0 after the wait.
2805 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2806 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2808 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
2809 ; GCN-NEXT: s_waitcnt vmcnt(0)
2810 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2811 ; GCN-NEXT: ; return to shader part epilog
2813 ; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2815 ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
2816 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2817 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2818 ; GFX11-NEXT: ; return to shader part epilog
2819 %zext.offset = zext i32 %voffset to i64
2820 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2821 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2822 %load = load i16, ptr addrspace(1) %gep1
2823 %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2824 %cast = bitcast <2 x i16> %build to <2 x half>
2825 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset), zero-extended to i16, replaces element 0
; of the incoming <2 x i16> %reg. Returned as <2 x half>.
; The checks expect the extension to be folded into an unsigned-byte d16 load
; into v1, followed by a copy of v1 to v0.
2828 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2829 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2831 ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3]
2832 ; GCN-NEXT: s_waitcnt vmcnt(0)
2833 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2834 ; GCN-NEXT: ; return to shader part epilog
2836 ; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2838 ; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3]
2839 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2840 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2841 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2842 %zext.offset = zext i32 %voffset to i64
2843 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2844 %load = load i8, ptr addrspace(1) %gep0
2845 %zext.load = zext i8 %load to i16
2846 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2847 %cast = bitcast <2 x i16> %build to <2 x half>
2848 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset) - 128, zero-extended to i16, replaces
; element 0 of the incoming <2 x i16> %reg. Returned as <2 x half>.
; The checks expect an unsigned-byte d16 load into v1 with offset:-128, followed
; by a copy of v1 to v0.
2851 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2852 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2854 ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
2855 ; GCN-NEXT: s_waitcnt vmcnt(0)
2856 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2857 ; GCN-NEXT: ; return to shader part epilog
2859 ; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2861 ; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
2862 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2863 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2864 ; GFX11-NEXT: ; return to shader part epilog
2865 %zext.offset = zext i32 %voffset to i64
2866 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2867 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2868 %load = load i8, ptr addrspace(1) %gep1
2869 %zext.load = zext i8 %load to i16
2870 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2871 %cast = bitcast <2 x i16> %build to <2 x half>
2872 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset), sign-extended to i16, replaces element 0
; of the incoming <2 x i16> %reg. Returned as <2 x half>.
; The checks expect the extension to be folded into a signed-byte d16 load into
; v1, followed by a copy of v1 to v0.
2875 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2876 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2878 ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3]
2879 ; GCN-NEXT: s_waitcnt vmcnt(0)
2880 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2881 ; GCN-NEXT: ; return to shader part epilog
2883 ; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2885 ; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3]
2886 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2887 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2888 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2889 %zext.offset = zext i32 %voffset to i64
2890 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2891 %load = load i8, ptr addrspace(1) %gep0
2892 %sext.load = sext i8 %load to i16
2893 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2894 %cast = bitcast <2 x i16> %build to <2 x half>
2895 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset) - 128, sign-extended to i16, replaces
; element 0 of the incoming <2 x i16> %reg. Returned as <2 x half>.
; The checks expect a signed-byte d16 load into v1 with offset:-128, followed by
; a copy of v1 to v0.
2898 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2899 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2901 ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
2902 ; GCN-NEXT: s_waitcnt vmcnt(0)
2903 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2904 ; GCN-NEXT: ; return to shader part epilog
2906 ; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2908 ; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
2909 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2910 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2911 ; GFX11-NEXT: ; return to shader part epilog
2912 %zext.offset = zext i32 %voffset to i64
2913 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2914 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2915 %load = load i8, ptr addrspace(1) %gep1
2916 %sext.load = sext i8 %load to i16
2917 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2918 %cast = bitcast <2 x i16> %build to <2 x half>
2919 ret <2 x half> %cast
2922 ; --------------------------------------------------------------------------------
2923 ; D16 hi load (hi16)
2924 ; --------------------------------------------------------------------------------
; i16 load from %sbase + zext(%voffset) inserted into element 1 of a <2 x i16>
; whose other element is left unspecified; the vector is returned as <2 x half>.
; Note that, despite the "_undef_hi" suffix, the loaded value is the one placed
; in element 1 here; element 0 is the unspecified one.
; The unspecified element is written as poison rather than undef, since undef is
; being phased out of LLVM tests. The function name is unchanged so the check
; labels stay stable.
; The checks expect a single d16_hi load straight into v0.
2926 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2927 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2929 ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3]
2930 ; GCN-NEXT: s_waitcnt vmcnt(0)
2931 ; GCN-NEXT: ; return to shader part epilog
2933 ; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2935 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
2936 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2937 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2938 %zext.offset = zext i32 %voffset to i64
2939 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2940 %load = load i16, ptr addrspace(1) %gep0
2941 %build = insertelement <2 x i16> poison, i16 %load, i32 1
2942 %cast = bitcast <2 x i16> %build to <2 x half>
2943 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 inserted into element 1 of a
; <2 x i16> whose other element is left unspecified; returned as <2 x half>.
; Note that, despite the "_undef_hi" suffix, the loaded value is the one placed
; in element 1 here; element 0 is the unspecified one.
; The unspecified element is written as poison rather than undef, since undef is
; being phased out of LLVM tests. The function name is unchanged so the check
; labels stay stable.
; The checks expect a single d16_hi load straight into v0 with offset:-128.
2946 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2947 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
2949 ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128
2950 ; GCN-NEXT: s_waitcnt vmcnt(0)
2951 ; GCN-NEXT: ; return to shader part epilog
2953 ; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
2955 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
2956 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2957 ; GFX11-NEXT: ; return to shader part epilog
2958 %zext.offset = zext i32 %voffset to i64
2959 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
2960 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2961 %load = load i16, ptr addrspace(1) %gep1
2962 %build = insertelement <2 x i16> poison, i16 %load, i32 1
2963 %cast = bitcast <2 x i16> %build to <2 x half>
2964 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) inserted into element 1 of an all-zero
; <2 x i16>; the vector is returned as <2 x half>.
; Note that, despite the "_zero_hi" suffix, the loaded value goes to element 1
; here and it is element 0 that stays zero.
; The checks expect v1 to be zeroed first, the d16_hi load to write into v1, and
; the result to be copied to v0 after the wait.
2967 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2968 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
2970 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2971 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
2972 ; GCN-NEXT: s_waitcnt vmcnt(0)
2973 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2974 ; GCN-NEXT: ; return to shader part epilog
2976 ; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
2978 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2979 ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
2980 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2981 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
2982 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
2983 %zext.offset = zext i32 %voffset to i64
2984 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2985 %load = load i16, ptr addrspace(1) %gep0
2986 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
2987 %cast = bitcast <2 x i16> %build to <2 x half>
2988 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 inserted into element 1 of an
; all-zero <2 x i16>; the vector is returned as <2 x half>.
; Note that, despite the "_zero_hi" suffix, the loaded value goes to element 1
; here and it is element 0 that stays zero.
; The checks expect v1 to be zeroed, a d16_hi load into v1 with offset:-128, and
; a copy of v1 to v0 after the wait.
2991 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
2992 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
2994 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2995 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
2996 ; GCN-NEXT: s_waitcnt vmcnt(0)
2997 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2998 ; GCN-NEXT: ; return to shader part epilog
3000 ; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
3002 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
3003 ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
3004 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3005 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3006 ; GFX11-NEXT: ; return to shader part epilog
3007 %zext.offset = zext i32 %voffset to i64
3008 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
3009 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3010 %load = load i16, ptr addrspace(1) %gep1
3011 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
3012 %cast = bitcast <2 x i16> %build to <2 x half>
3013 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) replaces element 1 of the incoming
; <2 x i16> %reg; element 0 of %reg is kept. Returned as <2 x half>.
; Note that, despite the "_reg_hi" suffix, it is element 0 that comes from %reg
; here.
; The checks expect the d16_hi load to write into v1 and the result to be copied
; to v0 after the wait.
3016 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3017 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
3019 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
3020 ; GCN-NEXT: s_waitcnt vmcnt(0)
3021 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3022 ; GCN-NEXT: ; return to shader part epilog
3024 ; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi:
3026 ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
3027 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3028 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3029 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
3030 %zext.offset = zext i32 %voffset to i64
3031 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3032 %load = load i16, ptr addrspace(1) %gep0
3033 %build = insertelement <2 x i16> %reg, i16 %load, i32 1
3034 %cast = bitcast <2 x i16> %build to <2 x half>
3035 ret <2 x half> %cast
; i16 load from %sbase + zext(%voffset) - 128 replaces element 1 of the incoming
; <2 x i16> %reg; element 0 of %reg is kept. Returned as <2 x half>.
; Note that, despite the "_reg_hi" suffix, it is element 0 that comes from %reg
; here.
; The checks expect a d16_hi load into v1 with offset:-128, then a copy of v1 to
; v0 after the wait.
3038 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3039 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
3041 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
3042 ; GCN-NEXT: s_waitcnt vmcnt(0)
3043 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3044 ; GCN-NEXT: ; return to shader part epilog
3046 ; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
3048 ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
3049 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3050 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3051 ; GFX11-NEXT: ; return to shader part epilog
3052 %zext.offset = zext i32 %voffset to i64
3053 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
3054 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3055 %load = load i16, ptr addrspace(1) %gep1
3056 %build = insertelement <2 x i16> %reg, i16 %load, i32 1
3057 %cast = bitcast <2 x i16> %build to <2 x half>
3058 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset), zero-extended to i16, replaces element 1
; of the incoming <2 x i16> %reg; element 0 of %reg is kept. Returned as
; <2 x half>.
; The checks expect the extension to be folded into an unsigned-byte d16_hi load
; into v1, followed by a copy of v1 to v0.
3061 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3062 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
3064 ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3]
3065 ; GCN-NEXT: s_waitcnt vmcnt(0)
3066 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3067 ; GCN-NEXT: ; return to shader part epilog
3069 ; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
3071 ; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
3072 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3073 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3074 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
3075 %zext.offset = zext i32 %voffset to i64
3076 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3077 %load = load i8, ptr addrspace(1) %gep0
3078 %zext.load = zext i8 %load to i16
3079 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
3080 %cast = bitcast <2 x i16> %build to <2 x half>
3081 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset) - 128, zero-extended to i16, replaces
; element 1 of the incoming <2 x i16> %reg; element 0 of %reg is kept. Returned
; as <2 x half>.
; The checks expect an unsigned-byte d16_hi load into v1 with offset:-128,
; followed by a copy of v1 to v0.
3084 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3085 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
3087 ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
3088 ; GCN-NEXT: s_waitcnt vmcnt(0)
3089 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3090 ; GCN-NEXT: ; return to shader part epilog
3092 ; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
3094 ; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
3095 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3096 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3097 ; GFX11-NEXT: ; return to shader part epilog
3098 %zext.offset = zext i32 %voffset to i64
3099 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant byte displacement applied on top of the variable offset.
3100 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3101 %load = load i8, ptr addrspace(1) %gep1
3102 %zext.load = zext i8 %load to i16
3103 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
3104 %cast = bitcast <2 x i16> %build to <2 x half>
3105 ret <2 x half> %cast
; i8 load from %sbase + zext(%voffset), sign-extended to i16, replaces element 1
; of the incoming <2 x i16> %reg; element 0 of %reg is kept. Returned as
; <2 x half>.
; The checks expect the extension to be folded into a signed-byte d16_hi load
; into v1, followed by a copy of v1 to v0.
3108 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3109 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
3111 ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3]
3112 ; GCN-NEXT: s_waitcnt vmcnt(0)
3113 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3114 ; GCN-NEXT: ; return to shader part epilog
3116 ; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
3118 ; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
3119 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3120 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3121 ; GFX11-NEXT: ; return to shader part epilog
; The 32-bit offset is widened as an unsigned value.
3122 %zext.offset = zext i32 %voffset to i64
3123 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3124 %load = load i8, ptr addrspace(1) %gep0
3125 %sext.load = sext i8 %load to i16
3126 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
3127 %cast = bitcast <2 x i16> %build to <2 x half>
3128 ret <2 x half> %cast
; Sign-extended i8 load inserted into element 1 (the high half) of %reg. The
; address is the inreg %sbase plus the zero-extended i32 %voffset, minus 128
; bytes. The assertions expect one signed-byte d16_hi load that uses the SGPR
; pair as base, v0 as offset and an immediate offset of -128, followed by a
; copy of the result register into v0.
3131 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3132 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
3134 ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
3135 ; GCN-NEXT: s_waitcnt vmcnt(0)
3136 ; GCN-NEXT: v_mov_b32_e32 v0, v1
3137 ; GCN-NEXT: ; return to shader part epilog
3139 ; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
3141 ; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
3142 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3143 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
3144 ; GFX11-NEXT: ; return to shader part epilog
3145 %zext.offset = zext i32 %voffset to i64
3146 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Constant -128 byte adjustment applied on top of the variable offset.
3147 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3148 %load = load i8, ptr addrspace(1) %gep1
3149 %sext.load = sext i8 %load to i16
3150 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
3151 %cast = bitcast <2 x i16> %build to <2 x half>
3152 ret <2 x half> %cast
3155 ; --------------------------------------------------------------------------------
3156 ; or-with-constant as add
3157 ; --------------------------------------------------------------------------------
3159 ; Check add-as-or with split 64-bit or.
; The address is built purely from integers: zext(%idx) OR 16, converted to a
; global pointer with inttoptr. The inreg %sbase argument is not referenced by
; the body. The assertions expect a 32-bit OR on the low half, a zero high
; half in v1, and a byte load addressed by the v[0:1] pair with "off" in place
; of an SGPR base and no immediate offset.
3160 define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) {
3161 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
3163 ; GCN-NEXT: v_or_b32_e32 v0, 16, v0
3164 ; GCN-NEXT: v_mov_b32_e32 v1, 0
3165 ; GCN-NEXT: global_load_ubyte v0, v[0:1], off
3166 ; GCN-NEXT: s_waitcnt vmcnt(0)
3167 ; GCN-NEXT: ; return to shader part epilog
3169 ; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
3171 ; GFX11-NEXT: v_or_b32_e32 v0, 16, v0
3172 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
3173 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
3174 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3175 ; GFX11-NEXT: ; return to shader part epilog
3176 %zext.idx = zext i32 %idx to i64
; 64-bit OR with a small constant; only the low 32 bits can be affected.
3177 %or = or i64 %zext.idx, 16
3178 %addr = inttoptr i64 %or to ptr addrspace(1)
3179 %load = load i8, ptr addrspace(1) %addr
3180 %zext = zext i8 %load to i32
3181 %to.vgpr = bitcast i32 %zext to float
; Same shape as the offset_16 variant, but the OR constant is 4160 (0x1040).
; The address is zext(%idx) OR 4160 converted with inttoptr; the inreg %sbase
; argument is not referenced by the body. The assertions expect the constant
; to stay in the 32-bit OR (printed as 0x1040) and the byte load to use the
; v[0:1] pair with "off" and no immediate offset.
3185 define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
3186 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
3188 ; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0
3189 ; GCN-NEXT: v_mov_b32_e32 v1, 0
3190 ; GCN-NEXT: global_load_ubyte v0, v[0:1], off
3191 ; GCN-NEXT: s_waitcnt vmcnt(0)
3192 ; GCN-NEXT: ; return to shader part epilog
3194 ; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
3196 ; GFX11-NEXT: v_or_b32_e32 v0, 0x1040, v0
3197 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
3198 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
3199 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3200 ; GFX11-NEXT: ; return to shader part epilog
3201 %zext.idx = zext i32 %idx to i64
; 4160 == 0x1040, the literal that appears in the expected v_or_b32.
3202 %or = or i64 %zext.idx, 4160
3203 %addr = inttoptr i64 %or to ptr addrspace(1)
3204 %load = load i8, ptr addrspace(1) %addr
3205 %zext = zext i8 %load to i32
3206 %to.vgpr = bitcast i32 %zext to float
3210 ; --------------------------------------------------------------------------------
3211 ; Full 64-bit scalar add.
3212 ; --------------------------------------------------------------------------------
; Loop that performs one volatile float load per iteration from
; %arg[zext(%i)], with %i starting at 0, stepping by 1 and leaving the loop
; when the incremented value equals 256. The assertions expect the address to
; be formed with a full 64-bit scalar add (s_add_u32 / s_addc_u32) of a byte
; counter that steps by 4 and is compared against 0x400 (256 * 4), and the
; load to use that SGPR pair as base with a VGPR offset that is set to zero
; once before the loop.
; NOTE(review): only the loop block %bb3 is visible in this part of the file;
; the entry and exit blocks are assumed to be a plain branch in and a return.
3214 define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
3215 ; GFX9-LABEL: global_addr_64bit_lsr_iv:
3216 ; GFX9: ; %bb.0: ; %bb
3217 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3218 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3219 ; GFX9-NEXT: .LBB128_1: ; %bb3
3220 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3221 ; GFX9-NEXT: s_add_u32 s4, s2, s0
3222 ; GFX9-NEXT: s_addc_u32 s5, s3, s1
3223 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
3224 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3225 ; GFX9-NEXT: s_add_u32 s0, s0, 4
3226 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
3227 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
3228 ; GFX9-NEXT: s_cbranch_scc0 .LBB128_1
3229 ; GFX9-NEXT: ; %bb.2: ; %bb2
3230 ; GFX9-NEXT: s_endpgm
3232 ; GFX10-LABEL: global_addr_64bit_lsr_iv:
3233 ; GFX10: ; %bb.0: ; %bb
3234 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3235 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3236 ; GFX10-NEXT: .LBB128_1: ; %bb3
3237 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3238 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3239 ; GFX10-NEXT: s_add_u32 s4, s2, s0
3240 ; GFX10-NEXT: s_addc_u32 s5, s3, s1
3241 ; GFX10-NEXT: s_add_u32 s0, s0, 4
3242 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
3243 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3244 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
3245 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
3246 ; GFX10-NEXT: s_cbranch_scc0 .LBB128_1
3247 ; GFX10-NEXT: ; %bb.2: ; %bb2
3248 ; GFX10-NEXT: s_endpgm
3250 ; GFX11-LABEL: global_addr_64bit_lsr_iv:
3251 ; GFX11: ; %bb.0: ; %bb
3252 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3253 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3254 ; GFX11-NEXT: .LBB128_1: ; %bb3
3255 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3256 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3257 ; GFX11-NEXT: s_add_u32 s4, s2, s0
3258 ; GFX11-NEXT: s_addc_u32 s5, s3, s1
3259 ; GFX11-NEXT: s_add_u32 s0, s0, 4
3260 ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
3261 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3262 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
3263 ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
3264 ; GFX11-NEXT: s_cbranch_scc0 .LBB128_1
3265 ; GFX11-NEXT: ; %bb.2: ; %bb2
3266 ; GFX11-NEXT: s_endpgm
3273 bb3: ; preds = %bb3, %bb
3274 %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
3275 %i4 = zext i32 %i to i64
3276 %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
; The load is volatile; its result %i6 is not used by any visible instruction.
3277 %i6 = load volatile float, ptr addrspace(1) %i5, align 4
3278 %i8 = add nuw nsw i32 %i, 1
3279 %i9 = icmp eq i32 %i8, 256
3280 br i1 %i9, label %bb2, label %bb3
3283 ; Make sure we only have a single zero vaddr initialization.
; Variant of global_addr_64bit_lsr_iv with two volatile float loads per
; iteration. The loop counter starts at 0, steps by 1 and exits when the
; incremented value equals 256. The assertions expect a single zeroing of the
; VGPR offset (v0) before the loop, shared by both loads, and both loads to
; use the same SGPR pair s[4:5] as base.
; NOTE(review): only the loop block %bb3 is visible in this part of the file;
; the entry and exit blocks are assumed to be a plain branch in and a return.
3285 define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
3286 ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
3287 ; GFX9: ; %bb.0: ; %bb
3288 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3289 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3290 ; GFX9-NEXT: .LBB129_1: ; %bb3
3291 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3292 ; GFX9-NEXT: s_add_u32 s4, s2, s0
3293 ; GFX9-NEXT: s_addc_u32 s5, s3, s1
3294 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
3295 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3296 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
3297 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3298 ; GFX9-NEXT: s_add_u32 s0, s0, 4
3299 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
3300 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
3301 ; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
3302 ; GFX9-NEXT: s_cbranch_scc0 .LBB129_1
3303 ; GFX9-NEXT: ; %bb.2: ; %bb2
3304 ; GFX9-NEXT: s_endpgm
3306 ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
3307 ; GFX10: ; %bb.0: ; %bb
3308 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3309 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3310 ; GFX10-NEXT: .LBB129_1: ; %bb3
3311 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3312 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3313 ; GFX10-NEXT: s_add_u32 s4, s2, s0
3314 ; GFX10-NEXT: s_addc_u32 s5, s3, s1
3315 ; GFX10-NEXT: s_add_u32 s0, s0, 4
3316 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
3317 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3318 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
3319 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3320 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
3321 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
3322 ; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
3323 ; GFX10-NEXT: s_cbranch_scc0 .LBB129_1
3324 ; GFX10-NEXT: ; %bb.2: ; %bb2
3325 ; GFX10-NEXT: s_endpgm
3327 ; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
3328 ; GFX11: ; %bb.0: ; %bb
3329 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3330 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3331 ; GFX11-NEXT: .LBB129_1: ; %bb3
3332 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3333 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3334 ; GFX11-NEXT: s_add_u32 s4, s2, s0
3335 ; GFX11-NEXT: s_addc_u32 s5, s3, s1
3336 ; GFX11-NEXT: s_add_u32 s0, s0, 4
3337 ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
3338 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3339 ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
3340 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3341 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
3342 ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
3343 ; GFX11-NEXT: s_cbranch_scc0 .LBB129_1
3344 ; GFX11-NEXT: ; %bb.2: ; %bb2
3345 ; GFX11-NEXT: s_endpgm
3352 bb3: ; preds = %bb3, %bb
3353 %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
3354 %i4 = zext i32 %i to i64
3355 %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
3356 %i6 = load volatile float, ptr addrspace(1) %i5, align 4
3357 %i5.1 = getelementptr inbounds float, ptr addrspace(1) %arg.1, i64 %i4
; NOTE(review): this second load reads %i5 again, so %i5.1 (and with it
; %arg.1) is never used and both loads hit the same address. It looks like
; %i5.1 was intended here - confirm. Changing the operand alters the generated
; code, so the assertions above would have to be regenerated with
; utils/update_llc_test_checks.py.
3358 %i6.1 = load volatile float, ptr addrspace(1) %i5, align 4
3359 %i8 = add nuw nsw i32 %i, 1
3360 %i9 = icmp eq i32 %i8, 256
3361 br i1 %i9, label %bb2, label %bb3
3364 !0 = !{i32 0, i32 1073741824} ; (1 << 30)
3365 !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1