1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
5 ; Test using saddr addressing mode of global_*load_* flat instructions.
7 ; --------------------------------------------------------------------------------
8 ; No vgpr offset, constants
9 ; --------------------------------------------------------------------------------
; Zero offset from an SGPR base. Expected on both targets: the VGPR offset is
; zeroed (v0 = 0) and the byte is loaded through s[2:3] with no immediate
; offset.
12 define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
13 ; GCN-LABEL: global_load_saddr_i8_offset_0:
15 ; GCN-NEXT: v_mov_b32_e32 v0, 0
16 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
17 ; GCN-NEXT: s_waitcnt vmcnt(0)
18 ; GCN-NEXT: ; return to shader part epilog
19 %load = load i8, i8 addrspace(1)* %sbase
20 %zext = zext i8 %load to i32
21 %to.vgpr = bitcast i32 %zext to float
25 ; SGPR base with maximum gfx9 immediate offset
; Expected: gfx9 encodes 4095 directly as the instruction offset; gfx10 splits
; it into 0x800 in the VGPR offset plus offset:2047.
26 define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
27 ; GFX9-LABEL: global_load_saddr_i8_offset_4095:
29 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
30 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
31 ; GFX9-NEXT: s_waitcnt vmcnt(0)
32 ; GFX9-NEXT: ; return to shader part epilog
34 ; GFX10-LABEL: global_load_saddr_i8_offset_4095:
36 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
37 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
38 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39 ; GFX10-NEXT: ; return to shader part epilog
40 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
41 %load = load i8, i8 addrspace(1)* %gep0
42 %zext = zext i8 %load to i32
43 %to.vgpr = bitcast i32 %zext to float
47 ; SGPR base with maximum gfx9 immediate offset + 1
; Expected on both targets: the whole 0x1000 is placed in the VGPR offset and
; the load carries no immediate offset.
48 define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) {
49 ; GCN-LABEL: global_load_saddr_i8_offset_4096:
51 ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
52 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
53 ; GCN-NEXT: s_waitcnt vmcnt(0)
54 ; GCN-NEXT: ; return to shader part epilog
55 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096
56 %load = load i8, i8 addrspace(1)* %gep0
57 %zext = zext i8 %load to i32
58 %to.vgpr = bitcast i32 %zext to float
62 ; SGPR base with maximum gfx9 immediate offset + 2
; Expected on both targets: 0x1000 in the VGPR offset plus offset:1 on the
; load.
63 define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) {
64 ; GCN-LABEL: global_load_saddr_i8_offset_4097:
66 ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
67 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
68 ; GCN-NEXT: s_waitcnt vmcnt(0)
69 ; GCN-NEXT: ; return to shader part epilog
70 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097
71 %load = load i8, i8 addrspace(1)* %gep0
72 %zext = zext i8 %load to i32
73 %to.vgpr = bitcast i32 %zext to float
77 ; SGPR base with maximum negative gfx9 immediate offset
; Expected: gfx9 encodes -4096 as the instruction offset. gfx10 instead builds
; a 64-bit address in v[0:1] (low dword + 0xfffff000, high dword + -1 with
; carry) and loads from it with `off`.
78 define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
79 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
81 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
82 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
83 ; GFX9-NEXT: s_waitcnt vmcnt(0)
84 ; GFX9-NEXT: ; return to shader part epilog
86 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
88 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
89 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
90 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
91 ; GFX10-NEXT: s_waitcnt vmcnt(0)
92 ; GFX10-NEXT: ; return to shader part epilog
93 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096
94 %load = load i8, i8 addrspace(1)* %gep0
95 %zext = zext i8 %load to i32
96 %to.vgpr = bitcast i32 %zext to float
100 ; SGPR base with maximum negative gfx9 immediate offset -1
; Expected: gfx9 adds the full constant (0xffffefff, high dword -1) to the
; base with scalar adds into s[0:1] and loads with a zero VGPR offset. gfx10
; builds a 64-bit VGPR address (base + 0xfffff000, high dword -1) and uses
; offset:-1 for the remainder.
101 define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) {
102 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
104 ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff
105 ; GFX9-NEXT: s_addc_u32 s1, s3, -1
106 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
107 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
108 ; GFX9-NEXT: s_waitcnt vmcnt(0)
109 ; GFX9-NEXT: ; return to shader part epilog
111 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
113 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
114 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
115 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
116 ; GFX10-NEXT: s_waitcnt vmcnt(0)
117 ; GFX10-NEXT: ; return to shader part epilog
118 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097
119 %load = load i8, i8 addrspace(1)* %gep0
120 %zext = zext i8 %load to i32
121 %to.vgpr = bitcast i32 %zext to float
125 ; SGPR base with maximum negative gfx9 immediate offset -2
; Expected: gfx9 adds the full constant (0xffffeffe, high dword -1) to the
; base with scalar adds into s[0:1] and loads with a zero VGPR offset. gfx10
; builds a 64-bit VGPR address (base + 0xfffff000, high dword -1) and uses
; offset:-2 for the remainder.
126 define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) {
127 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
129 ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe
130 ; GFX9-NEXT: s_addc_u32 s1, s3, -1
131 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
132 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
134 ; GFX9-NEXT: ; return to shader part epilog
136 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
138 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
139 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
140 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
141 ; GFX10-NEXT: s_waitcnt vmcnt(0)
142 ; GFX10-NEXT: ; return to shader part epilog
143 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098
144 %load = load i8, i8 addrspace(1)* %gep0
145 %zext = zext i8 %load to i32
146 %to.vgpr = bitcast i32 %zext to float
150 ; SGPR base with maximum gfx10 immediate offset + 1
; Expected: gfx9 encodes 2048 as the instruction offset; gfx10 places 0x800
; in the VGPR offset and uses no immediate offset.
151 define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
152 ; GFX9-LABEL: global_load_saddr_i8_offset_2048:
154 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
155 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
156 ; GFX9-NEXT: s_waitcnt vmcnt(0)
157 ; GFX9-NEXT: ; return to shader part epilog
159 ; GFX10-LABEL: global_load_saddr_i8_offset_2048:
161 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
162 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
163 ; GFX10-NEXT: s_waitcnt vmcnt(0)
164 ; GFX10-NEXT: ; return to shader part epilog
165 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048
166 %load = load i8, i8 addrspace(1)* %gep0
167 %zext = zext i8 %load to i32
168 %to.vgpr = bitcast i32 %zext to float
172 ; SGPR base with maximum gfx10 immediate offset + 2
; Expected: gfx9 encodes 2049 as the instruction offset; gfx10 places 0x800
; in the VGPR offset and uses offset:1 for the remainder.
173 define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
174 ; GFX9-LABEL: global_load_saddr_i8_offset_2049:
176 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
177 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049
178 ; GFX9-NEXT: s_waitcnt vmcnt(0)
179 ; GFX9-NEXT: ; return to shader part epilog
181 ; GFX10-LABEL: global_load_saddr_i8_offset_2049:
183 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
184 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
185 ; GFX10-NEXT: s_waitcnt vmcnt(0)
186 ; GFX10-NEXT: ; return to shader part epilog
187 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049
188 %load = load i8, i8 addrspace(1)* %gep0
189 %zext = zext i8 %load to i32
190 %to.vgpr = bitcast i32 %zext to float
194 ; SGPR base with maximum gfx10 immediate offset + 3
; Expected: gfx9 encodes 2050 as the instruction offset; gfx10 places 0x800
; in the VGPR offset and uses offset:2 for the remainder.
195 define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
196 ; GFX9-LABEL: global_load_saddr_i8_offset_2050:
198 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
199 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050
200 ; GFX9-NEXT: s_waitcnt vmcnt(0)
201 ; GFX9-NEXT: ; return to shader part epilog
203 ; GFX10-LABEL: global_load_saddr_i8_offset_2050:
205 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
206 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
207 ; GFX10-NEXT: s_waitcnt vmcnt(0)
208 ; GFX10-NEXT: ; return to shader part epilog
209 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050
210 %load = load i8, i8 addrspace(1)* %gep0
211 %zext = zext i8 %load to i32
212 %to.vgpr = bitcast i32 %zext to float
216 ; SGPR base with maximum negative gfx10 immediate offset
; Expected on both targets: zero VGPR offset and offset:-2048 on the load.
217 define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
218 ; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
220 ; GCN-NEXT: v_mov_b32_e32 v0, 0
221 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
222 ; GCN-NEXT: s_waitcnt vmcnt(0)
223 ; GCN-NEXT: ; return to shader part epilog
224 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
225 %load = load i8, i8 addrspace(1)* %gep0
226 %zext = zext i8 %load to i32
227 %to.vgpr = bitcast i32 %zext to float
231 ; SGPR base with maximum negative gfx10 immediate offset - 1
; Expected: gfx9 encodes -2049 as the instruction offset. gfx10 builds a
; 64-bit VGPR address (base + 0xfffff800, high dword -1) and uses offset:-1
; for the remainder.
232 define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
233 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
235 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
236 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
238 ; GFX9-NEXT: ; return to shader part epilog
240 ; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
242 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
243 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
244 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
245 ; GFX10-NEXT: s_waitcnt vmcnt(0)
246 ; GFX10-NEXT: ; return to shader part epilog
247 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049
248 %load = load i8, i8 addrspace(1)* %gep0
249 %zext = zext i8 %load to i32
250 %to.vgpr = bitcast i32 %zext to float
254 ; SGPR base with maximum negative gfx10 immediate offset - 2
; Expected: gfx9 encodes -2050 as the instruction offset. gfx10 builds a
; 64-bit VGPR address (base + 0xfffff800, high dword -1) and uses offset:-2
; for the remainder.
255 define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
256 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
258 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
259 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050
260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
261 ; GFX9-NEXT: ; return to shader part epilog
263 ; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
265 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
266 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
267 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
268 ; GFX10-NEXT: s_waitcnt vmcnt(0)
269 ; GFX10-NEXT: ; return to shader part epilog
270 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050
271 %load = load i8, i8 addrspace(1)* %gep0
272 %zext = zext i8 %load to i32
273 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset 4294967295 (0xffffffff).
; Expected: the load keeps the SGPR base s[2:3]; the constant is split between
; the VGPR offset and the immediate: 0xfffff000 + 4095 on gfx9, 0xfffff800 +
; 2047 on gfx10.
277 define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) {
278 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
280 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000
281 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
283 ; GFX9-NEXT: ; return to shader part epilog
285 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
287 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800
288 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
289 ; GFX10-NEXT: s_waitcnt vmcnt(0)
290 ; GFX10-NEXT: ; return to shader part epilog
291 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295
292 %load = load i8, i8 addrspace(1)* %gep0
293 %zext = zext i8 %load to i32
294 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset 4294967296 (2^32).
; Expected on both targets: a 64-bit address is built in v[0:1] (low dword
; + 0, high dword + 1 with carry) and the load uses it with `off` and no
; immediate offset.
298 define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) {
299 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
302 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
304 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
306 ; GFX9-NEXT: ; return to shader part epilog
308 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
310 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
311 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
312 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
313 ; GFX10-NEXT: s_waitcnt vmcnt(0)
314 ; GFX10-NEXT: ; return to shader part epilog
315 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296
316 %load = load i8, i8 addrspace(1)* %gep0
317 %zext = zext i8 %load to i32
318 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset 4294967297 (2^32 + 1).
; Expected on both targets: a 64-bit address is built in v[0:1] (low dword
; + 0, high dword + 1 with carry) and the remaining 1 becomes offset:1.
322 define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) {
323 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
325 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
326 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
327 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
328 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
329 ; GFX9-NEXT: s_waitcnt vmcnt(0)
330 ; GFX9-NEXT: ; return to shader part epilog
332 ; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
334 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
335 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
336 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
337 ; GFX10-NEXT: s_waitcnt vmcnt(0)
338 ; GFX10-NEXT: ; return to shader part epilog
339 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297
340 %load = load i8, i8 addrspace(1)* %gep0
341 %zext = zext i8 %load to i32
342 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset 4294971391 (2^32 + 4095).
; Expected: gfx9 adds the whole constant to the base with scalar adds (low
; dword + 0xfff, high dword + 1) into s[0:1] and loads with a zero VGPR
; offset. gfx10 builds a 64-bit VGPR address (low dword + 0x800, high dword
; + 1) and uses offset:2047 for the remainder.
346 define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) {
347 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
349 ; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
350 ; GFX9-NEXT: s_addc_u32 s1, s3, 1
351 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
352 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
353 ; GFX9-NEXT: s_waitcnt vmcnt(0)
354 ; GFX9-NEXT: ; return to shader part epilog
356 ; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
358 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
359 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
360 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
361 ; GFX10-NEXT: s_waitcnt vmcnt(0)
362 ; GFX10-NEXT: ; return to shader part epilog
363 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391
364 %load = load i8, i8 addrspace(1)* %gep0
365 %zext = zext i8 %load to i32
366 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset 4294971392 (2^32 + 4096).
; Expected: gfx9 adds the whole constant to the base with scalar adds (low
; dword + 0x1000, high dword + 1) into s[0:1] and loads with a zero VGPR
; offset. gfx10 builds a 64-bit VGPR address with the same addends and loads
; with `off` and no immediate offset.
370 define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) {
371 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
373 ; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
374 ; GFX9-NEXT: s_addc_u32 s1, s3, 1
375 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
376 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
377 ; GFX9-NEXT: s_waitcnt vmcnt(0)
378 ; GFX9-NEXT: ; return to shader part epilog
380 ; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
382 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
383 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
384 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
385 ; GFX10-NEXT: s_waitcnt vmcnt(0)
386 ; GFX10-NEXT: ; return to shader part epilog
387 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392
388 %load = load i8, i8 addrspace(1)* %gep0
389 %zext = zext i8 %load to i32
390 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset -4294967295 (-(2^32 - 1)).
; Expected on both targets: a 64-bit VGPR address with high dword + -1, and
; the low part split so the immediate takes the largest negative remainder:
; + 0x1000 with offset:-4095 on gfx9, + 0x800 with offset:-2047 on gfx10.
394 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) {
395 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
397 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
398 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
399 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
400 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
401 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
402 ; GFX9-NEXT: s_waitcnt vmcnt(0)
403 ; GFX9-NEXT: ; return to shader part epilog
405 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
407 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
408 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
409 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047
410 ; GFX10-NEXT: s_waitcnt vmcnt(0)
411 ; GFX10-NEXT: ; return to shader part epilog
412 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295
413 %load = load i8, i8 addrspace(1)* %gep0
414 %zext = zext i8 %load to i32
415 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset -4294967296 (-(2^32)).
; Expected on both targets: a 64-bit VGPR address (low dword + 0, high dword
; + -1 with carry) and a load with `off` and no immediate offset.
419 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) {
420 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
422 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
423 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
424 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
425 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
426 ; GFX9-NEXT: s_waitcnt vmcnt(0)
427 ; GFX9-NEXT: ; return to shader part epilog
429 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
431 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
432 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
433 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
434 ; GFX10-NEXT: s_waitcnt vmcnt(0)
435 ; GFX10-NEXT: ; return to shader part epilog
436 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296
437 %load = load i8, i8 addrspace(1)* %gep0
438 %zext = zext i8 %load to i32
439 %to.vgpr = bitcast i32 %zext to float
; SGPR base with constant offset -4294967297 (-(2^32 + 1)).
; Expected on both targets: a 64-bit VGPR address (low dword + 0, high dword
; + -1 with carry) and offset:-1 for the remainder.
443 define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) {
444 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
446 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
447 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
448 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
449 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
450 ; GFX9-NEXT: s_waitcnt vmcnt(0)
451 ; GFX9-NEXT: ; return to shader part epilog
453 ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
455 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
456 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
457 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
458 ; GFX10-NEXT: s_waitcnt vmcnt(0)
459 ; GFX10-NEXT: ; return to shader part epilog
460 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297
461 %load = load i8, i8 addrspace(1)* %gep0
462 %zext = zext i8 %load to i32
463 %to.vgpr = bitcast i32 %zext to float
467 ; --------------------------------------------------------------------------------
468 ; Basic addressing patterns
469 ; --------------------------------------------------------------------------------
471 ; Basic pattern, no immediate offset.
; The address is %sbase plus the zero-extended i32 %voffset. Expected on both
; targets: a single load using v0 as the offset and s[2:3] as the base, with
; no extra address arithmetic.
472 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
473 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
475 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
476 ; GCN-NEXT: s_waitcnt vmcnt(0)
477 ; GCN-NEXT: ; return to shader part epilog
478 %zext.offset = zext i32 %voffset to i64
479 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
480 %load = load i8, i8 addrspace(1)* %gep0
481 %zext = zext i8 %load to i32
482 %to.vgpr = bitcast i32 %zext to float
486 ; Maximum positive offset on gfx9
; Expected: gfx9 keeps the SGPR base and VGPR offset and encodes offset:4095.
; gfx10 materializes the 64-bit address in v[0:1] (base + voffset, then
; + 0x800) and uses offset:2047 for the remainder.
487 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
488 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
490 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
492 ; GFX9-NEXT: ; return to shader part epilog
494 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
496 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
497 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
498 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
499 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
500 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
501 ; GFX10-NEXT: s_waitcnt vmcnt(0)
502 ; GFX10-NEXT: ; return to shader part epilog
503 %zext.offset = zext i32 %voffset to i64
504 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
505 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
506 %load = load i8, i8 addrspace(1)* %gep1
507 %zext = zext i8 %load to i32
508 %to.vgpr = bitcast i32 %zext to float
512 ; Maximum positive offset on gfx9 + 1
; Expected on both targets: the 64-bit address is materialized in v[0:1]
; (base + voffset, then + 0x1000) and the load uses `off` with no immediate
; offset.
513 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
514 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
516 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
517 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
518 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
519 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
520 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
521 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
522 ; GFX9-NEXT: s_waitcnt vmcnt(0)
523 ; GFX9-NEXT: ; return to shader part epilog
525 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
527 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
528 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
529 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
530 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
531 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
532 ; GFX10-NEXT: s_waitcnt vmcnt(0)
533 ; GFX10-NEXT: ; return to shader part epilog
534 %zext.offset = zext i32 %voffset to i64
535 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
536 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
537 %load = load i8, i8 addrspace(1)* %gep1
538 %zext = zext i8 %load to i32
539 %to.vgpr = bitcast i32 %zext to float
543 ; Maximum negative offset on gfx9
; Expected: gfx9 keeps the SGPR base and VGPR offset and encodes offset:-4096.
; gfx10 materializes the 64-bit address in v[0:1] (base + voffset, then
; + 0xfffff000 with high dword -1) and loads with `off`.
544 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
545 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
547 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
549 ; GFX9-NEXT: ; return to shader part epilog
551 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
553 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
554 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
555 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
556 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
557 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
558 ; GFX10-NEXT: s_waitcnt vmcnt(0)
559 ; GFX10-NEXT: ; return to shader part epilog
560 %zext.offset = zext i32 %voffset to i64
561 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
562 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
563 %load = load i8, i8 addrspace(1)* %gep1
564 %zext = zext i8 %load to i32
565 %to.vgpr = bitcast i32 %zext to float
569 ; Maximum negative offset on gfx9 - 1
; Expected on both targets: the 64-bit address is materialized in v[0:1]
; (base + voffset, then + 0xfffff000 with high dword -1) and the remainder is
; encoded as offset:-1.
570 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
571 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
573 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
574 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
575 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
576 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
577 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
578 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
579 ; GFX9-NEXT: s_waitcnt vmcnt(0)
580 ; GFX9-NEXT: ; return to shader part epilog
582 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
584 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
585 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
586 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
587 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
588 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
589 ; GFX10-NEXT: s_waitcnt vmcnt(0)
590 ; GFX10-NEXT: ; return to shader part epilog
591 %zext.offset = zext i32 %voffset to i64
592 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
593 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
594 %load = load i8, i8 addrspace(1)* %gep1
595 %zext = zext i8 %load to i32
596 %to.vgpr = bitcast i32 %zext to float
600 ; Maximum positive offset on gfx10
; Expected on both targets: a single load with the SGPR base, v0 as the
; offset, and offset:2047.
601 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
602 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
604 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
605 ; GCN-NEXT: s_waitcnt vmcnt(0)
606 ; GCN-NEXT: ; return to shader part epilog
607 %zext.offset = zext i32 %voffset to i64
608 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
609 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
610 %load = load i8, i8 addrspace(1)* %gep1
611 %zext = zext i8 %load to i32
612 %to.vgpr = bitcast i32 %zext to float
616 ; Maximum positive offset on gfx10 + 1
; Expected: gfx9 keeps the SGPR base and VGPR offset and encodes offset:2048.
; gfx10 materializes the 64-bit address in v[0:1] (base + voffset, then
; + 0x800) and loads with `off` and no immediate offset.
617 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
618 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
620 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
621 ; GFX9-NEXT: s_waitcnt vmcnt(0)
622 ; GFX9-NEXT: ; return to shader part epilog
624 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
626 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
627 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
628 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
629 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
630 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
631 ; GFX10-NEXT: s_waitcnt vmcnt(0)
632 ; GFX10-NEXT: ; return to shader part epilog
633 %zext.offset = zext i32 %voffset to i64
634 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
635 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
636 %load = load i8, i8 addrspace(1)* %gep1
637 %zext = zext i8 %load to i32
638 %to.vgpr = bitcast i32 %zext to float
642 ; Maximum negative offset on gfx10
; Expected on both targets: a single load with the SGPR base, v0 as the
; offset, and offset:-2048.
643 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
644 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
646 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
647 ; GCN-NEXT: s_waitcnt vmcnt(0)
648 ; GCN-NEXT: ; return to shader part epilog
649 %zext.offset = zext i32 %voffset to i64
650 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
651 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
652 %load = load i8, i8 addrspace(1)* %gep1
653 %zext = zext i8 %load to i32
654 %to.vgpr = bitcast i32 %zext to float
658 ; Maximum negative offset on gfx10 - 1
; Expected: gfx9 keeps the SGPR base and VGPR offset and encodes offset:-2049.
; gfx10 materializes the 64-bit address in v[0:1] (base + voffset, then
; + 0xfffff800 with high dword -1) and uses offset:-1 for the remainder.
659 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
660 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
662 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
663 ; GFX9-NEXT: s_waitcnt vmcnt(0)
664 ; GFX9-NEXT: ; return to shader part epilog
666 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
668 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
669 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
670 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0
671 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
672 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
673 ; GFX10-NEXT: s_waitcnt vmcnt(0)
674 ; GFX10-NEXT: ; return to shader part epilog
675 %zext.offset = zext i32 %voffset to i64
676 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
677 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
678 %load = load i8, i8 addrspace(1)* %gep1
679 %zext = zext i8 %load to i32
680 %to.vgpr = bitcast i32 %zext to float
684 ; Maximum positive offset on gfx9, and immediate needs to be moved lower.
; Here the constant 4095 is applied to %sbase first and the variable offset
; second. The expected instructions are the same as for
; global_load_saddr_i8_zext_vgpr_offset_4095, where the geps are in the
; opposite order.
685 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
686 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
688 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
689 ; GFX9-NEXT: s_waitcnt vmcnt(0)
690 ; GFX9-NEXT: ; return to shader part epilog
692 ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
694 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
695 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
696 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
697 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
698 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
699 ; GFX10-NEXT: s_waitcnt vmcnt(0)
700 ; GFX10-NEXT: ; return to shader part epilog
701 %zext.offset = zext i32 %voffset to i64
702 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
703 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
704 %load = load i8, i8 addrspace(1)* %gep1
705 %zext = zext i8 %load to i32
706 %to.vgpr = bitcast i32 %zext to float
710 ; pointer addressing done in integers
; The address is formed with ptrtoint / add / inttoptr instead of a gep.
; Expected on both targets: the same single load with SGPR base and VGPR
; offset as the gep-based basic pattern.
711 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
712 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
714 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
715 ; GCN-NEXT: s_waitcnt vmcnt(0)
716 ; GCN-NEXT: ; return to shader part epilog
717 %zext.offset = zext i32 %voffset to i64
718 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
719 %add = add i64 %sbase.as.int, %zext.offset
720 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
721 %load = load i8, i8 addrspace(1)* %dirty.gep
722 %zext = zext i8 %load to i32
723 %to.vgpr = bitcast i32 %zext to float
727 ; zext forced to LHS of addressing expression
; Same integer address computation, but the zero-extended offset is the first
; operand of the add. Expected on both targets: still a single load with SGPR
; base and VGPR offset.
728 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
729 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
731 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
732 ; GCN-NEXT: s_waitcnt vmcnt(0)
733 ; GCN-NEXT: ; return to shader part epilog
734 %zext.offset = zext i32 %voffset to i64
735 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
736 %add = add i64 %zext.offset, %sbase.as.int
737 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
738 %load = load i8, i8 addrspace(1)* %dirty.gep
739 %zext = zext i8 %load to i32
740 %to.vgpr = bitcast i32 %zext to float
744 ; zext forced to LHS of addressing expression, with immediate offset
; The constant 128 is added after the (offset + base) sum. Expected on both
; targets: a single load with SGPR base, VGPR offset and offset:128.
745 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
746 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
748 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
749 ; GCN-NEXT: s_waitcnt vmcnt(0)
750 ; GCN-NEXT: ; return to shader part epilog
751 %zext.offset = zext i32 %voffset to i64
752 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
753 %add = add i64 %zext.offset, %sbase.as.int
754 %add.immoffset = add i64 %add, 128
755 %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
756 %load = load i8, i8 addrspace(1)* %dirty.gep
757 %zext = zext i8 %load to i32
758 %to.vgpr = bitcast i32 %zext to float
762 ; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
; The constant 128 is added to the base before the zero-extended offset is
; added. Expected on both targets: the same single load with offset:128 as
; when the constant is added last.
763 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
764 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
766 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
767 ; GCN-NEXT: s_waitcnt vmcnt(0)
768 ; GCN-NEXT: ; return to shader part epilog
769 %zext.offset = zext i32 %voffset to i64
770 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
771 %add.immoffset = add i64 %sbase.as.int, 128
772 %add = add i64 %zext.offset, %add.immoffset
773 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
774 %load = load i8, i8 addrspace(1)* %dirty.gep
775 %zext = zext i8 %load to i32
776 %to.vgpr = bitcast i32 %zext to float
780 ; --------------------------------------------------------------------------------
781 ; Uniformity edge cases
782 ; --------------------------------------------------------------------------------
; Internal addrspace(3) global holding an addrspace(1) i8 pointer (initialized
; to undef). Tests load their base pointer from it so that the pointer arrives
; in VGPRs (via ds_read_b64 in the expected output) rather than in SGPRs.
784 @ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
786 ; Base pointer is uniform, but also in VGPRs
; The base is loaded from @ptr.in.lds; the checks show it arriving in v[1:2]
; via ds_read_b64. Both targets are expected to copy it into s[0:1] with two
; v_readfirstlane_b32 and then issue a saddr-form load with v0 as the offset.
; The gfx9 and gfx10 expectations are listed separately.
787 define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
788 ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
790 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
791 ; GFX9-NEXT: ds_read_b64 v[1:2], v1
792 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
793 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
794 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
796 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
797 ; GFX9-NEXT: s_waitcnt vmcnt(0)
798 ; GFX9-NEXT: ; return to shader part epilog
800 ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
802 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
803 ; GFX10-NEXT: ds_read_b64 v[1:2], v1
804 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
805 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
806 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
807 ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
808 ; GFX10-NEXT: s_waitcnt vmcnt(0)
809 ; GFX10-NEXT: ; return to shader part epilog
810 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
811 %zext.offset = zext i32 %voffset to i64
812 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
813 %load = load i8, i8 addrspace(1)* %gep0
814 %zext = zext i8 %load to i32
815 %to.vgpr = bitcast i32 %zext to float
819 ; Base pointer is uniform, but also in VGPRs, with imm offset
; The base is loaded from @ptr.in.lds, indexed by zext(%voffset), then
; advanced by a constant 42 bytes. Both targets are expected to move the base
; into s[0:1] with v_readfirstlane_b32 and carry the constant as offset:42 on
; the saddr-form load.
820 define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
821 ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
823 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
824 ; GFX9-NEXT: ds_read_b64 v[1:2], v1
825 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
826 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
827 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
829 ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
830 ; GFX9-NEXT: s_waitcnt vmcnt(0)
831 ; GFX9-NEXT: ; return to shader part epilog
833 ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
835 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
836 ; GFX10-NEXT: ds_read_b64 v[1:2], v1
837 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
838 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
839 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
840 ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
841 ; GFX10-NEXT: s_waitcnt vmcnt(0)
842 ; GFX10-NEXT: ; return to shader part epilog
843 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
844 %zext.offset = zext i32 %voffset to i64
845 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
846 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
847 %load = load i8, i8 addrspace(1)* %gep1
848 %zext = zext i8 %load to i32
849 %to.vgpr = bitcast i32 %zext to float
853 ; Both 64-bit base and 32-bit offset are scalar
; Both arguments are inreg. The shared checks expect the offset (s4) to be
; copied into v0 with v_mov_b32 so that the saddr form can be used with
; s[2:3] as base.
854 define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
855 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
857 ; GCN-NEXT: v_mov_b32_e32 v0, s4
858 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
859 ; GCN-NEXT: s_waitcnt vmcnt(0)
860 ; GCN-NEXT: ; return to shader part epilog
861 %zext.offset = zext i32 %soffset to i64
862 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
863 %load = load i8, i8 addrspace(1)* %gep0
864 %zext = zext i8 %load to i32
865 %to.vgpr = bitcast i32 %zext to float
869 ; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
; Both arguments are inreg and a second GEP subtracts 24 bytes. The shared
; checks expect the offset (s4) copied into v0 and the constant carried as
; offset:-24 on the saddr-form load.
870 define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
871 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
873 ; GCN-NEXT: v_mov_b32_e32 v0, s4
874 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24
875 ; GCN-NEXT: s_waitcnt vmcnt(0)
876 ; GCN-NEXT: ; return to shader part epilog
877 %zext.offset = zext i32 %soffset to i64
878 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
879 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
880 %load = load i8, i8 addrspace(1)* %gep1
881 %zext = zext i8 %load to i32
882 %to.vgpr = bitcast i32 %zext to float
886 ; Both components uniform, zext forced to LHS of addressing expression
; The address is zext(%soffset) + ptrtoint(%sbase), converted back with
; inttoptr; both arguments are inreg. The shared checks expect the offset
; (s4) copied into v0 and a saddr-form load from s[2:3].
887 define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
888 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
890 ; GCN-NEXT: v_mov_b32_e32 v0, s4
891 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
892 ; GCN-NEXT: s_waitcnt vmcnt(0)
893 ; GCN-NEXT: ; return to shader part epilog
894 %zext.offset = zext i32 %soffset to i64
895 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
896 %add = add i64 %zext.offset, %sbase.as.int
897 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
898 %load = load i8, i8 addrspace(1)* %dirty.gep
899 %zext = zext i8 %load to i32
900 %to.vgpr = bitcast i32 %zext to float
904 ; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
; The address is (zext(%soffset) + ptrtoint(%sbase)) + 128, converted back
; with inttoptr; both arguments are inreg. The shared checks expect the
; offset (s4) copied into v0 and the constant carried as offset:128.
905 define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
906 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
908 ; GCN-NEXT: v_mov_b32_e32 v0, s4
909 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
910 ; GCN-NEXT: s_waitcnt vmcnt(0)
911 ; GCN-NEXT: ; return to shader part epilog
912 %zext.offset = zext i32 %soffset to i64
913 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
914 %add = add i64 %zext.offset, %sbase.as.int
915 %add.immoffset = add i64 %add, 128
916 %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
917 %load = load i8, i8 addrspace(1)* %dirty.gep
918 %zext = zext i8 %load to i32
919 %to.vgpr = bitcast i32 %zext to float
923 ; divergent 64-bit base, 32-bit scalar offset.
; The base pointer is not inreg while the offset is. The checks expect no
; saddr form here: both targets add s2 into v[0:1] with an add/carry pair
; and load through the 64-bit VGPR address with "off".
924 define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
925 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
927 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
928 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
929 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
930 ; GFX9-NEXT: s_waitcnt vmcnt(0)
931 ; GFX9-NEXT: ; return to shader part epilog
933 ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
935 ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
936 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
937 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
938 ; GFX10-NEXT: s_waitcnt vmcnt(0)
939 ; GFX10-NEXT: ; return to shader part epilog
940 %zext.offset = zext i32 %soffset to i64
941 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
942 %load = load i8, i8 addrspace(1)* %gep0
943 %zext = zext i8 %load to i32
944 %to.vgpr = bitcast i32 %zext to float
948 ; divergent 64-bit base, 32-bit scalar offset, with imm offset
; VGPR base plus inreg offset plus a constant 4095 bytes. The gfx9 checks
; expect the whole constant on the load as offset:4095. The gfx10 checks
; expect it split into an extra 64-bit add of 0x800 and offset:2047 on the
; load (0x800 + 2047 = 4095).
949 define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
950 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
952 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
953 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
954 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
955 ; GFX9-NEXT: s_waitcnt vmcnt(0)
956 ; GFX9-NEXT: ; return to shader part epilog
958 ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
960 ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
961 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
962 ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
963 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
964 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
965 ; GFX10-NEXT: s_waitcnt vmcnt(0)
966 ; GFX10-NEXT: ; return to shader part epilog
967 %zext.offset = zext i32 %soffset to i64
968 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
969 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
970 %load = load i8, i8 addrspace(1)* %gep1
971 %zext = zext i8 %load to i32
972 %to.vgpr = bitcast i32 %zext to float
976 ; --------------------------------------------------------------------------------
977 ; Natural addressing shifts with restricted range
978 ; --------------------------------------------------------------------------------
980 ; Cannot push the shift into 32-bits, and cannot match.
; The index is an i32 loaded from %voffset.ptr (no !range attached),
; zero-extended and used to index a float pointer. The checks expect the
; scaling to be a 64-bit shift (v_lshlrev_b64 by 2) followed by a 64-bit add
; of the base, and the final load to use a VGPR address with "off" rather
; than the saddr form.
981 define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
982 ; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
984 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
985 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
986 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
987 ; GFX9-NEXT: s_waitcnt vmcnt(0)
988 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
989 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
990 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
991 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
992 ; GFX9-NEXT: s_waitcnt vmcnt(0)
993 ; GFX9-NEXT: ; return to shader part epilog
995 ; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
997 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
998 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
999 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1000 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1001 ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
1002 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1003 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1004 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1005 ; GFX10-NEXT: ; return to shader part epilog
1006 %voffset = load i32, i32 addrspace(1)* %voffset.ptr
1007 %zext.offset = zext i32 %voffset to i64
1008 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1009 %load = load float, float addrspace(1)* %gep
1013 ; Cannot push the shift into 32-bits, with an immediate offset.
; The loaded i32 index is zero-extended and applied to an i8 pointer, then a
; constant 128 bytes is added and the result is cast to a float pointer.
; The shared checks expect a saddr-form dword load with offset:128.
; NOTE(review): the IR here is byte-indexed, so no shift appears in the
; expected output; the header line above may be carried over from the
; preceding test - confirm the intended wording.
1014 define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1015 ; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
1017 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1018 ; GCN-NEXT: s_waitcnt vmcnt(0)
1019 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128
1020 ; GCN-NEXT: s_waitcnt vmcnt(0)
1021 ; GCN-NEXT: ; return to shader part epilog
1022 %voffset = load i32, i32 addrspace(1)* %voffset.ptr
1023 %zext.offset = zext i32 %voffset to i64
1024 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1025 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
1026 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
1027 %load = load float, float addrspace(1)* %gep1.cast
1031 ; Range is sufficiently restricted to push the shift into 32-bits.
; The index load carries !range !0 (the metadata node itself is defined
; outside this part of the file). The shared checks expect the scaling to be
; a 32-bit shift (v_lshlrev_b32 by 2) and the float load to use the saddr
; form with s[2:3] as base.
1032 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1033 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
1035 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1036 ; GCN-NEXT: s_waitcnt vmcnt(0)
1037 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1038 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1039 ; GCN-NEXT: s_waitcnt vmcnt(0)
1040 ; GCN-NEXT: ; return to shader part epilog
1041 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
1042 %zext.offset = zext i32 %voffset to i64
1043 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1044 %load = load float, float addrspace(1)* %gep
1048 ; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
; The index load carries !range !0 (defined outside this part of the file)
; and a second GEP advances by 100 floats. The shared checks expect a 32-bit
; shift by 2 and the constant carried in bytes as offset:400 (100 * 4) on
; the saddr-form load.
1049 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1050 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
1052 ; GCN-NEXT: global_load_dword v0, v[0:1], off
1053 ; GCN-NEXT: s_waitcnt vmcnt(0)
1054 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1055 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400
1056 ; GCN-NEXT: s_waitcnt vmcnt(0)
1057 ; GCN-NEXT: ; return to shader part epilog
1058 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
1059 %zext.offset = zext i32 %voffset to i64
1060 %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1061 %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
1062 %load = load float, float addrspace(1)* %gep1
1066 ; Range is 1 beyond the limit where we can move the shift into 32-bits.
; The index load carries !range !1 (defined outside this part of the file).
; The checks expect a 64-bit shift (v_lshlrev_b64 by 2), a 64-bit add of the
; base, and a final load through a VGPR address with "off" instead of the
; saddr form.
1067 define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1068 ; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1070 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1071 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1072 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
1073 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1074 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1075 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
1076 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1077 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1078 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1079 ; GFX9-NEXT: ; return to shader part epilog
1081 ; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1083 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1084 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1085 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1086 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
1087 ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
1088 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1089 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1090 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1091 ; GFX10-NEXT: ; return to shader part epilog
1092 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
1093 %zext.offset = zext i32 %voffset to i64
1094 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1095 %load = load float, float addrspace(1)* %gep
1099 ; --------------------------------------------------------------------------------
1100 ; Stress various type loads
1101 ; --------------------------------------------------------------------------------
; Loads an i16 from %sbase + zext(%voffset) and bitcasts it to half.
; Expected on both targets: one saddr-form global_load_ushort with v0 as the
; offset and s[2:3] as base.
1103 define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1104 ; GCN-LABEL: global_load_saddr_i16:
1106 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
1107 ; GCN-NEXT: s_waitcnt vmcnt(0)
1108 ; GCN-NEXT: ; return to shader part epilog
1109 %zext.offset = zext i32 %voffset to i64
1110 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1111 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1112 %load = load i16, i16 addrspace(1)* %gep0.cast
1113 %cast.load = bitcast i16 %load to half
; Loads an i16 from %sbase + zext(%voffset) - 128 and bitcasts it to half.
; Expected on both targets: one saddr-form global_load_ushort with the
; constant carried as offset:-128.
1117 define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1118 ; GCN-LABEL: global_load_saddr_i16_immneg128:
1120 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
1121 ; GCN-NEXT: s_waitcnt vmcnt(0)
1122 ; GCN-NEXT: ; return to shader part epilog
1123 %zext.offset = zext i32 %voffset to i64
1124 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1125 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1126 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1127 %load = load i16, i16 addrspace(1)* %gep1.cast
1128 %cast.load = bitcast i16 %load to half
; Loads a half directly from %sbase + zext(%voffset).
; Expected on both targets: one saddr-form global_load_ushort with v0 as the
; offset and s[2:3] as base.
1132 define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1133 ; GCN-LABEL: global_load_saddr_f16:
1135 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
1136 ; GCN-NEXT: s_waitcnt vmcnt(0)
1137 ; GCN-NEXT: ; return to shader part epilog
1138 %zext.offset = zext i32 %voffset to i64
1139 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1140 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
1141 %load = load half, half addrspace(1)* %gep0.cast
; Loads a half from %sbase + zext(%voffset) - 128.
; Expected on both targets: one saddr-form global_load_ushort with the
; constant carried as offset:-128.
1145 define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1146 ; GCN-LABEL: global_load_saddr_f16_immneg128:
1148 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
1149 ; GCN-NEXT: s_waitcnt vmcnt(0)
1150 ; GCN-NEXT: ; return to shader part epilog
1151 %zext.offset = zext i32 %voffset to i64
1152 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1153 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1154 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
1155 %load = load half, half addrspace(1)* %gep1.cast
; Loads an i32 from %sbase + zext(%voffset) and returns it bitcast to float.
; Expected on both targets: one saddr-form global_load_dword with v0 as the
; offset and s[2:3] as base.
1159 define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1160 ; GCN-LABEL: global_load_saddr_i32:
1162 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1163 ; GCN-NEXT: s_waitcnt vmcnt(0)
1164 ; GCN-NEXT: ; return to shader part epilog
1165 %zext.offset = zext i32 %voffset to i64
1166 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1167 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1168 %load = load i32, i32 addrspace(1)* %gep0.cast
1169 %cast.load = bitcast i32 %load to float
1170 ret float %cast.load
; Loads an i32 from %sbase + zext(%voffset) - 128 and returns it bitcast to
; float. Expected on both targets: one saddr-form global_load_dword with the
; constant carried as offset:-128.
1173 define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1174 ; GCN-LABEL: global_load_saddr_i32_immneg128:
1176 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1177 ; GCN-NEXT: s_waitcnt vmcnt(0)
1178 ; GCN-NEXT: ; return to shader part epilog
1179 %zext.offset = zext i32 %voffset to i64
1180 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1181 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1182 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1183 %load = load i32, i32 addrspace(1)* %gep1.cast
1184 %cast.load = bitcast i32 %load to float
1185 ret float %cast.load
; Loads a float directly from %sbase + zext(%voffset).
; Expected on both targets: one saddr-form global_load_dword with v0 as the
; offset and s[2:3] as base.
1188 define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1189 ; GCN-LABEL: global_load_saddr_f32:
1191 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1192 ; GCN-NEXT: s_waitcnt vmcnt(0)
1193 ; GCN-NEXT: ; return to shader part epilog
1194 %zext.offset = zext i32 %voffset to i64
1195 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1196 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
1197 %load = load float, float addrspace(1)* %gep0.cast
; Loads a float from %sbase + zext(%voffset) - 128.
; Expected on both targets: one saddr-form global_load_dword with the
; constant carried as offset:-128.
1201 define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1202 ; GCN-LABEL: global_load_saddr_f32_immneg128:
1204 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1205 ; GCN-NEXT: s_waitcnt vmcnt(0)
1206 ; GCN-NEXT: ; return to shader part epilog
1207 %zext.offset = zext i32 %voffset to i64
1208 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1209 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1210 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
1211 %load = load float, float addrspace(1)* %gep1.cast
; Loads a <2 x i16> from %sbase + zext(%voffset) and returns it bitcast to
; <2 x half>. Expected on both targets: one saddr-form global_load_dword.
1215 define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1216 ; GCN-LABEL: global_load_saddr_v2i16:
1218 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1219 ; GCN-NEXT: s_waitcnt vmcnt(0)
1220 ; GCN-NEXT: ; return to shader part epilog
1221 %zext.offset = zext i32 %voffset to i64
1222 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1223 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
1224 %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
1225 %cast.load = bitcast <2 x i16> %load to <2 x half>
1226 ret <2 x half> %cast.load
; Loads a <2 x i16> from %sbase + zext(%voffset) - 128 and returns it
; bitcast to <2 x half>. Expected on both targets: one saddr-form
; global_load_dword with the constant carried as offset:-128.
1229 define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1230 ; GCN-LABEL: global_load_saddr_v2i16_immneg128:
1232 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1233 ; GCN-NEXT: s_waitcnt vmcnt(0)
1234 ; GCN-NEXT: ; return to shader part epilog
1235 %zext.offset = zext i32 %voffset to i64
1236 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1237 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1238 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
1239 %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
1240 %cast.load = bitcast <2 x i16> %load to <2 x half>
1241 ret <2 x half> %cast.load
; Loads a <2 x half> directly from %sbase + zext(%voffset).
; Expected on both targets: one saddr-form global_load_dword.
1244 define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1245 ; GCN-LABEL: global_load_saddr_v2f16:
1247 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1248 ; GCN-NEXT: s_waitcnt vmcnt(0)
1249 ; GCN-NEXT: ; return to shader part epilog
1250 %zext.offset = zext i32 %voffset to i64
1251 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1252 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
1253 %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
1254 ret <2 x half> %load
; Loads a <2 x half> from %sbase + zext(%voffset) - 128.
; Expected on both targets: one saddr-form global_load_dword with the
; constant carried as offset:-128.
1257 define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1258 ; GCN-LABEL: global_load_saddr_v2f16_immneg128:
1260 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1261 ; GCN-NEXT: s_waitcnt vmcnt(0)
1262 ; GCN-NEXT: ; return to shader part epilog
1263 %zext.offset = zext i32 %voffset to i64
1264 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1265 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1266 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
1267 %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
1268 ret <2 x half> %load
; Loads an addrspace(3) pointer from %sbase + zext(%voffset). The pointer is
; converted to i32 with ptrtoint and bitcast to <2 x half> for the return.
; Expected on both targets: one saddr-form global_load_dword.
1271 define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1272 ; GCN-LABEL: global_load_saddr_p3:
1274 ; GCN-NEXT: global_load_dword v0, v0, s[2:3]
1275 ; GCN-NEXT: s_waitcnt vmcnt(0)
1276 ; GCN-NEXT: ; return to shader part epilog
1277 %zext.offset = zext i32 %voffset to i64
1278 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1279 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
1280 %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
1281 %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
1282 %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1283 ret <2 x half> %cast.load1
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) - 128. The
; pointer is converted to i32 with ptrtoint and bitcast to <2 x half> for
; the return. Expected on both targets: one saddr-form global_load_dword
; with the constant carried as offset:-128.
1286 define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1287 ; GCN-LABEL: global_load_saddr_p3_immneg128:
1289 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1290 ; GCN-NEXT: s_waitcnt vmcnt(0)
1291 ; GCN-NEXT: ; return to shader part epilog
1292 %zext.offset = zext i32 %voffset to i64
1293 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1294 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1295 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
1296 %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
1297 %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
1298 %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1299 ret <2 x half> %cast.load1
; Loads a double from %sbase + zext(%voffset) and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; writing v[0:1].
1302 define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1303 ; GCN-LABEL: global_load_saddr_f64:
1305 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1306 ; GCN-NEXT: s_waitcnt vmcnt(0)
1307 ; GCN-NEXT: ; return to shader part epilog
1308 %zext.offset = zext i32 %voffset to i64
1309 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1310 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
1311 %load = load double, double addrspace(1)* %gep0.cast
1312 %cast.load = bitcast double %load to <2 x float>
1313 ret <2 x float> %cast.load
; Loads a double from %sbase + zext(%voffset) - 128 and returns it bitcast
; to <2 x float>. Expected on both targets: one saddr-form
; global_load_dwordx2 with the constant carried as offset:-128.
1316 define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1317 ; GCN-LABEL: global_load_saddr_f64_immneg128:
1319 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1320 ; GCN-NEXT: s_waitcnt vmcnt(0)
1321 ; GCN-NEXT: ; return to shader part epilog
1322 %zext.offset = zext i32 %voffset to i64
1323 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1324 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1325 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
1326 %load = load double, double addrspace(1)* %gep1.cast
1327 %cast.load = bitcast double %load to <2 x float>
1328 ret <2 x float> %cast.load
; Loads an i64 from %sbase + zext(%voffset) and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; writing v[0:1].
1331 define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1332 ; GCN-LABEL: global_load_saddr_i64:
1334 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1335 ; GCN-NEXT: s_waitcnt vmcnt(0)
1336 ; GCN-NEXT: ; return to shader part epilog
1337 %zext.offset = zext i32 %voffset to i64
1338 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1339 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1340 %load = load i64, i64 addrspace(1)* %gep0.cast
1341 %cast.load = bitcast i64 %load to <2 x float>
1342 ret <2 x float> %cast.load
; Loads an i64 from %sbase + zext(%voffset) - 128 and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; with the constant carried as offset:-128.
1345 define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1346 ; GCN-LABEL: global_load_saddr_i64_immneg128:
1348 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1349 ; GCN-NEXT: s_waitcnt vmcnt(0)
1350 ; GCN-NEXT: ; return to shader part epilog
1351 %zext.offset = zext i32 %voffset to i64
1352 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1353 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1354 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1355 %load = load i64, i64 addrspace(1)* %gep1.cast
1356 %cast.load = bitcast i64 %load to <2 x float>
1357 ret <2 x float> %cast.load
; Loads a <2 x float> directly from %sbase + zext(%voffset).
; Expected on both targets: one saddr-form global_load_dwordx2 writing
; v[0:1].
1360 define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1361 ; GCN-LABEL: global_load_saddr_v2f32:
1363 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1364 ; GCN-NEXT: s_waitcnt vmcnt(0)
1365 ; GCN-NEXT: ; return to shader part epilog
1366 %zext.offset = zext i32 %voffset to i64
1367 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1368 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
1369 %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
1370 ret <2 x float> %load
; Loads a <2 x float> from %sbase + zext(%voffset) - 128.
; Expected on both targets: one saddr-form global_load_dwordx2 with the
; constant carried as offset:-128.
1373 define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1374 ; GCN-LABEL: global_load_saddr_v2f32_immneg128:
1376 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1377 ; GCN-NEXT: s_waitcnt vmcnt(0)
1378 ; GCN-NEXT: ; return to shader part epilog
1379 %zext.offset = zext i32 %voffset to i64
1380 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1381 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1382 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
1383 %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
1384 ret <2 x float> %load
; Loads a <2 x i32> from %sbase + zext(%voffset) and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; writing v[0:1].
1387 define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1388 ; GCN-LABEL: global_load_saddr_v2i32:
1390 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1391 ; GCN-NEXT: s_waitcnt vmcnt(0)
1392 ; GCN-NEXT: ; return to shader part epilog
1393 %zext.offset = zext i32 %voffset to i64
1394 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1395 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
1396 %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
1397 %cast.load = bitcast <2 x i32> %load to <2 x float>
1398 ret <2 x float> %cast.load
; Loads a <2 x i32> from %sbase + zext(%voffset) - 128 and returns it
; bitcast to <2 x float>. Expected on both targets: one saddr-form
; global_load_dwordx2 with the constant carried as offset:-128.
1401 define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1402 ; GCN-LABEL: global_load_saddr_v2i32_immneg128:
1404 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1405 ; GCN-NEXT: s_waitcnt vmcnt(0)
1406 ; GCN-NEXT: ; return to shader part epilog
1407 %zext.offset = zext i32 %voffset to i64
1408 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1409 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1410 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
1411 %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
1412 %cast.load = bitcast <2 x i32> %load to <2 x float>
1413 ret <2 x float> %cast.load
; Loads a <4 x i16> from %sbase + zext(%voffset) and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; writing v[0:1].
1416 define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1417 ; GCN-LABEL: global_load_saddr_v4i16:
1419 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1420 ; GCN-NEXT: s_waitcnt vmcnt(0)
1421 ; GCN-NEXT: ; return to shader part epilog
1422 %zext.offset = zext i32 %voffset to i64
1423 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1424 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
1425 %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
1426 %cast.load = bitcast <4 x i16> %load to <2 x float>
1427 ret <2 x float> %cast.load
; Loads a <4 x i16> from %sbase + zext(%voffset) - 128 and returns it
; bitcast to <2 x float>. Expected on both targets: one saddr-form
; global_load_dwordx2 with the constant carried as offset:-128.
1430 define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1431 ; GCN-LABEL: global_load_saddr_v4i16_immneg128:
1433 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1434 ; GCN-NEXT: s_waitcnt vmcnt(0)
1435 ; GCN-NEXT: ; return to shader part epilog
1436 %zext.offset = zext i32 %voffset to i64
1437 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1438 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1439 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
1440 %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
1441 %cast.load = bitcast <4 x i16> %load to <2 x float>
1442 ret <2 x float> %cast.load
; Loads a <4 x half> from %sbase + zext(%voffset) and returns it bitcast to
; <2 x float>. Expected on both targets: one saddr-form global_load_dwordx2
; writing v[0:1].
1445 define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1446 ; GCN-LABEL: global_load_saddr_v4f16:
1448 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1449 ; GCN-NEXT: s_waitcnt vmcnt(0)
1450 ; GCN-NEXT: ; return to shader part epilog
1451 %zext.offset = zext i32 %voffset to i64
1452 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1453 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
1454 %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
1455 %cast.load = bitcast <4 x half> %load to <2 x float>
1456 ret <2 x float> %cast.load
; Loads a <4 x half> from %sbase + zext(%voffset) - 128 and returns it
; bitcast to <2 x float>. Expected on both targets: one saddr-form
; global_load_dwordx2 with the constant carried as offset:-128.
1459 define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1460 ; GCN-LABEL: global_load_saddr_v4f16_immneg128:
1462 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1463 ; GCN-NEXT: s_waitcnt vmcnt(0)
1464 ; GCN-NEXT: ; return to shader part epilog
1465 %zext.offset = zext i32 %voffset to i64
1466 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1467 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1468 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
1469 %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
1470 %cast.load = bitcast <4 x half> %load to <2 x float>
1471 ret <2 x float> %cast.load
; Loads an addrspace(1) pointer from %sbase + zext(%voffset). The pointer is
; converted to i64 with ptrtoint and bitcast to <2 x float> for the return.
; Expected on both targets: one saddr-form global_load_dwordx2.
1474 define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1475 ; GCN-LABEL: global_load_saddr_p1:
1477 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1478 ; GCN-NEXT: s_waitcnt vmcnt(0)
1479 ; GCN-NEXT: ; return to shader part epilog
1480 %zext.offset = zext i32 %voffset to i64
1481 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1482 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
1483 %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
1484 %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1485 %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1486 ret <2 x float> %cast.load1
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) - 128. The
; pointer is converted to i64 with ptrtoint and bitcast to <2 x float> for
; the return. Expected on both targets: one saddr-form global_load_dwordx2
; with the constant carried as offset:-128.
1489 define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1490 ; GCN-LABEL: global_load_saddr_p1_immneg128:
1492 ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1493 ; GCN-NEXT: s_waitcnt vmcnt(0)
1494 ; GCN-NEXT: ; return to shader part epilog
1495 %zext.offset = zext i32 %voffset to i64
1496 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1497 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1498 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
1499 %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
1500 %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1501 %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1502 ret <2 x float> %cast.load1
; Loads <3 x float> from %sbase + zext(%voffset) and returns it directly.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0), no immediate offset.
1505 define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1506 ; GCN-LABEL: global_load_saddr_v3f32:
1508 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
1509 ; GCN-NEXT: s_waitcnt vmcnt(0)
1510 ; GCN-NEXT: ; return to shader part epilog
1511 %zext.offset = zext i32 %voffset to i64
1512 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1513 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
1514 %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
1515 ret <3 x float> %load
; Loads <3 x float> from %sbase + zext(%voffset) - 128 and returns it directly.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0) with offset:-128.
1518 define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1519 ; GCN-LABEL: global_load_saddr_v3f32_immneg128:
1521 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1522 ; GCN-NEXT: s_waitcnt vmcnt(0)
1523 ; GCN-NEXT: ; return to shader part epilog
1524 %zext.offset = zext i32 %voffset to i64
1525 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1526 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1527 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
1528 %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
1529 ret <3 x float> %load
; Loads <3 x i32> from %sbase + zext(%voffset) and returns it bitcast to <3 x float>.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0), no immediate offset.
1532 define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1533 ; GCN-LABEL: global_load_saddr_v3i32:
1535 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
1536 ; GCN-NEXT: s_waitcnt vmcnt(0)
1537 ; GCN-NEXT: ; return to shader part epilog
1538 %zext.offset = zext i32 %voffset to i64
1539 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1540 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
1541 %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
1542 %cast.load = bitcast <3 x i32> %load to <3 x float>
1543 ret <3 x float> %cast.load
; Loads <3 x i32> from %sbase + zext(%voffset) - 128 and returns it bitcast to <3 x float>.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0) with offset:-128.
1546 define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1547 ; GCN-LABEL: global_load_saddr_v3i32_immneg128:
1549 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1550 ; GCN-NEXT: s_waitcnt vmcnt(0)
1551 ; GCN-NEXT: ; return to shader part epilog
1552 %zext.offset = zext i32 %voffset to i64
1553 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1554 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1555 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
1556 %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
1557 %cast.load = bitcast <3 x i32> %load to <3 x float>
1558 ret <3 x float> %cast.load
; Loads <6 x half> from %sbase + zext(%voffset) and returns it directly.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0), no immediate offset.
1561 define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1562 ; GCN-LABEL: global_load_saddr_v6f16:
1564 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
1565 ; GCN-NEXT: s_waitcnt vmcnt(0)
1566 ; GCN-NEXT: ; return to shader part epilog
1567 %zext.offset = zext i32 %voffset to i64
1568 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1569 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
1570 %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
1571 ret <6 x half> %load
; Loads <6 x half> from %sbase + zext(%voffset) - 128 and returns it directly.
; Checks expect one global_load_dwordx3 (saddr s[2:3], voffset v0) with offset:-128.
1574 define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1575 ; GCN-LABEL: global_load_saddr_v6f16_immneg128:
1577 ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1578 ; GCN-NEXT: s_waitcnt vmcnt(0)
1579 ; GCN-NEXT: ; return to shader part epilog
1580 %zext.offset = zext i32 %voffset to i64
1581 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1582 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1583 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
1584 %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
1585 ret <6 x half> %load
; Loads <4 x float> from %sbase + zext(%voffset) and returns it directly.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1588 define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1589 ; GCN-LABEL: global_load_saddr_v4f32:
1591 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1592 ; GCN-NEXT: s_waitcnt vmcnt(0)
1593 ; GCN-NEXT: ; return to shader part epilog
1594 %zext.offset = zext i32 %voffset to i64
1595 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1596 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
1597 %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
1598 ret <4 x float> %load
; Loads <4 x float> from %sbase + zext(%voffset) - 128 and returns it directly.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1601 define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1602 ; GCN-LABEL: global_load_saddr_v4f32_immneg128:
1604 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1605 ; GCN-NEXT: s_waitcnt vmcnt(0)
1606 ; GCN-NEXT: ; return to shader part epilog
1607 %zext.offset = zext i32 %voffset to i64
1608 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1609 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1610 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
1611 %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
1612 ret <4 x float> %load
; Loads <4 x i32> from %sbase + zext(%voffset) and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1615 define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1616 ; GCN-LABEL: global_load_saddr_v4i32:
1618 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1619 ; GCN-NEXT: s_waitcnt vmcnt(0)
1620 ; GCN-NEXT: ; return to shader part epilog
1621 %zext.offset = zext i32 %voffset to i64
1622 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1623 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
1624 %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
1625 %cast.load = bitcast <4 x i32> %load to <4 x float>
1626 ret <4 x float> %cast.load
; Loads <4 x i32> from %sbase + zext(%voffset) - 128 and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1629 define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1630 ; GCN-LABEL: global_load_saddr_v4i32_immneg128:
1632 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1633 ; GCN-NEXT: s_waitcnt vmcnt(0)
1634 ; GCN-NEXT: ; return to shader part epilog
1635 %zext.offset = zext i32 %voffset to i64
1636 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1637 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1638 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
1639 %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
1640 %cast.load = bitcast <4 x i32> %load to <4 x float>
1641 ret <4 x float> %cast.load
; Loads <2 x i64> from %sbase + zext(%voffset) and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1644 define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1645 ; GCN-LABEL: global_load_saddr_v2i64:
1647 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1648 ; GCN-NEXT: s_waitcnt vmcnt(0)
1649 ; GCN-NEXT: ; return to shader part epilog
1650 %zext.offset = zext i32 %voffset to i64
1651 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1652 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
1653 %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
1654 %cast.load = bitcast <2 x i64> %load to <4 x float>
1655 ret <4 x float> %cast.load
; Loads <2 x i64> from %sbase + zext(%voffset) - 128 and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1658 define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1659 ; GCN-LABEL: global_load_saddr_v2i64_immneg128:
1661 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1662 ; GCN-NEXT: s_waitcnt vmcnt(0)
1663 ; GCN-NEXT: ; return to shader part epilog
1664 %zext.offset = zext i32 %voffset to i64
1665 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1666 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1667 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
1668 %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
1669 %cast.load = bitcast <2 x i64> %load to <4 x float>
1670 ret <4 x float> %cast.load
; Loads a scalar i128 from %sbase + zext(%voffset) and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1673 define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1674 ; GCN-LABEL: global_load_saddr_i128:
1676 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1677 ; GCN-NEXT: s_waitcnt vmcnt(0)
1678 ; GCN-NEXT: ; return to shader part epilog
1679 %zext.offset = zext i32 %voffset to i64
1680 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1681 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
1682 %load = load i128, i128 addrspace(1)* %gep0.cast
1683 %cast.load = bitcast i128 %load to <4 x float>
1684 ret <4 x float> %cast.load
; Loads a scalar i128 from %sbase + zext(%voffset) - 128 and returns it bitcast to <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1687 define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1688 ; GCN-LABEL: global_load_saddr_i128_immneg128:
1690 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1691 ; GCN-NEXT: s_waitcnt vmcnt(0)
1692 ; GCN-NEXT: ; return to shader part epilog
1693 %zext.offset = zext i32 %voffset to i64
1694 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1695 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1696 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
1697 %load = load i128, i128 addrspace(1)* %gep1.cast
1698 %cast.load = bitcast i128 %load to <4 x float>
1699 ret <4 x float> %cast.load
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset); the
; result is converted with ptrtoint to <2 x i64> and returned as <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1702 define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1703 ; GCN-LABEL: global_load_saddr_v2p1:
1705 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1706 ; GCN-NEXT: s_waitcnt vmcnt(0)
1707 ; GCN-NEXT: ; return to shader part epilog
1708 %zext.offset = zext i32 %voffset to i64
1709 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1710 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
1711 %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
1712 %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
1713 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
1714 ret <4 x float> %cast.load1
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) - 128; the
; result is converted with ptrtoint to <2 x i64> and returned as <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1717 define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1718 ; GCN-LABEL: global_load_saddr_v2p1_immneg128:
1720 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1721 ; GCN-NEXT: s_waitcnt vmcnt(0)
1722 ; GCN-NEXT: ; return to shader part epilog
1723 %zext.offset = zext i32 %voffset to i64
1724 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1725 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1726 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
1727 %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
1728 %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
1729 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
1730 ret <4 x float> %cast.load1
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset); the
; result is converted with ptrtoint to <4 x i32> and returned as <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0), no immediate offset.
1733 define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1734 ; GCN-LABEL: global_load_saddr_v4p3:
1736 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
1737 ; GCN-NEXT: s_waitcnt vmcnt(0)
1738 ; GCN-NEXT: ; return to shader part epilog
1739 %zext.offset = zext i32 %voffset to i64
1740 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1741 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
1742 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
1743 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
1744 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
1745 ret <4 x float> %cast.load1
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) - 128; the
; result is converted with ptrtoint to <4 x i32> and returned as <4 x float>.
; Checks expect one global_load_dwordx4 (saddr s[2:3], voffset v0) with offset:-128.
1748 define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1749 ; GCN-LABEL: global_load_saddr_v4p3_immneg128:
1751 ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1752 ; GCN-NEXT: s_waitcnt vmcnt(0)
1753 ; GCN-NEXT: ; return to shader part epilog
1754 %zext.offset = zext i32 %voffset to i64
1755 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1756 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1757 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
1758 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
1759 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
1760 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
1761 ret <4 x float> %cast.load1
1764 ; --------------------------------------------------------------------------------
; Sign- and zero-extending loads
1766 ; --------------------------------------------------------------------------------
; Loads i8 from %sbase + zext(%voffset), sign-extends it to i32 and returns the bits as float.
; Checks expect the extension folded into one global_load_sbyte (saddr s[2:3], voffset v0).
1768 define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1769 ; GCN-LABEL: global_sextload_saddr_i8:
1771 ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3]
1772 ; GCN-NEXT: s_waitcnt vmcnt(0)
1773 ; GCN-NEXT: ; return to shader part epilog
1774 %zext.offset = zext i32 %voffset to i64
1775 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1776 %load = load i8, i8 addrspace(1)* %gep0
1777 %sextload = sext i8 %load to i32
1778 %cast.load = bitcast i32 %sextload to float
1779 ret float %cast.load
; Loads i8 from %sbase + zext(%voffset) - 128, sign-extends it to i32 and returns the bits as float.
; Checks expect one global_load_sbyte (saddr s[2:3], voffset v0) with offset:-128.
1782 define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1783 ; GCN-LABEL: global_sextload_saddr_i8_immneg128:
1785 ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128
1786 ; GCN-NEXT: s_waitcnt vmcnt(0)
1787 ; GCN-NEXT: ; return to shader part epilog
1788 %zext.offset = zext i32 %voffset to i64
1789 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1790 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1791 %load = load i8, i8 addrspace(1)* %gep1
1792 %sextload = sext i8 %load to i32
1793 %cast.load = bitcast i32 %sextload to float
1794 ret float %cast.load
; Loads i16 from %sbase + zext(%voffset), sign-extends it to i32 and returns the bits as float.
; Checks expect the extension folded into one global_load_sshort (saddr s[2:3], voffset v0).
1797 define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1798 ; GCN-LABEL: global_sextload_saddr_i16:
1800 ; GCN-NEXT: global_load_sshort v0, v0, s[2:3]
1801 ; GCN-NEXT: s_waitcnt vmcnt(0)
1802 ; GCN-NEXT: ; return to shader part epilog
1803 %zext.offset = zext i32 %voffset to i64
1804 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1805 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1806 %load = load i16, i16 addrspace(1)* %gep0.cast
1807 %sextload = sext i16 %load to i32
1808 %cast.load = bitcast i32 %sextload to float
1809 ret float %cast.load
; Loads i16 from %sbase + zext(%voffset) - 128, sign-extends it to i32 and returns the bits as float.
; Checks expect one global_load_sshort (saddr s[2:3], voffset v0) with offset:-128.
1812 define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1813 ; GCN-LABEL: global_sextload_saddr_i16_immneg128:
1815 ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128
1816 ; GCN-NEXT: s_waitcnt vmcnt(0)
1817 ; GCN-NEXT: ; return to shader part epilog
1818 %zext.offset = zext i32 %voffset to i64
1819 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1820 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1821 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1822 %load = load i16, i16 addrspace(1)* %gep1.cast
1823 %sextload = sext i16 %load to i32
1824 %cast.load = bitcast i32 %sextload to float
1825 ret float %cast.load
; Loads i8 from %sbase + zext(%voffset), zero-extends it to i32 and returns the bits as float.
; Checks expect the extension folded into one global_load_ubyte (saddr s[2:3], voffset v0).
1828 define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1829 ; GCN-LABEL: global_zextload_saddr_i8:
1831 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
1832 ; GCN-NEXT: s_waitcnt vmcnt(0)
1833 ; GCN-NEXT: ; return to shader part epilog
1834 %zext.offset = zext i32 %voffset to i64
1835 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1836 %load = load i8, i8 addrspace(1)* %gep0
1837 %zextload = zext i8 %load to i32
1838 %cast.load = bitcast i32 %zextload to float
1839 ret float %cast.load
; Loads i8 from %sbase + zext(%voffset) - 128, zero-extends it to i32 and returns the bits as float.
; Checks expect one global_load_ubyte (saddr s[2:3], voffset v0) with offset:-128.
1842 define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1843 ; GCN-LABEL: global_zextload_saddr_i8_immneg128:
1845 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128
1846 ; GCN-NEXT: s_waitcnt vmcnt(0)
1847 ; GCN-NEXT: ; return to shader part epilog
1848 %zext.offset = zext i32 %voffset to i64
1849 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1850 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1851 %load = load i8, i8 addrspace(1)* %gep1
1852 %zextload = zext i8 %load to i32
1853 %cast.load = bitcast i32 %zextload to float
1854 ret float %cast.load
; Loads i16 from %sbase + zext(%voffset), zero-extends it to i32 and returns the bits as float.
; Checks expect the extension folded into one global_load_ushort (saddr s[2:3], voffset v0).
1857 define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1858 ; GCN-LABEL: global_zextload_saddr_i16:
1860 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
1861 ; GCN-NEXT: s_waitcnt vmcnt(0)
1862 ; GCN-NEXT: ; return to shader part epilog
1863 %zext.offset = zext i32 %voffset to i64
1864 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1865 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1866 %load = load i16, i16 addrspace(1)* %gep0.cast
1867 %zextload = zext i16 %load to i32
1868 %cast.load = bitcast i32 %zextload to float
1869 ret float %cast.load
; Loads i16 from %sbase + zext(%voffset) - 128, zero-extends it to i32 and returns the bits as float.
; Checks expect one global_load_ushort (saddr s[2:3], voffset v0) with offset:-128.
1872 define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1873 ; GCN-LABEL: global_zextload_saddr_i16_immneg128:
1875 ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
1876 ; GCN-NEXT: s_waitcnt vmcnt(0)
1877 ; GCN-NEXT: ; return to shader part epilog
1878 %zext.offset = zext i32 %voffset to i64
1879 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1880 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1881 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1882 %load = load i16, i16 addrspace(1)* %gep1.cast
1883 %zextload = zext i16 %load to i32
1884 %cast.load = bitcast i32 %zextload to float
1885 ret float %cast.load
1888 ; --------------------------------------------------------------------------------
; Atomic loads
1890 ; --------------------------------------------------------------------------------
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset), returned as float.
; GFX9 and GFX10 are checked separately because the emitted waits, cache-policy bits
; and invalidates differ: GFX9 expects `glc` followed by buffer_wbinvl1; GFX10 expects an
; extra s_waitcnt_vscnt, `glc dlc`, and buffer_gl0_inv + buffer_gl1_inv.
1892 define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1893 ; GFX9-LABEL: atomic_global_load_saddr_i32:
1895 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1896 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
1897 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1898 ; GFX9-NEXT: buffer_wbinvl1
1899 ; GFX9-NEXT: ; return to shader part epilog
1901 ; GFX10-LABEL: atomic_global_load_saddr_i32:
1903 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1904 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1905 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
1906 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1907 ; GFX10-NEXT: buffer_gl0_inv
1908 ; GFX10-NEXT: buffer_gl1_inv
1909 ; GFX10-NEXT: ; return to shader part epilog
1910 %zext.offset = zext i32 %voffset to i64
1911 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1912 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1913 %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
1914 %cast.load = bitcast i32 %load to float
1915 ret float %cast.load
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset) - 128, returned as float.
; Checks expect the -128 folded into offset:-128 on the saddr load for both targets.
; GFX9 expects `glc` followed by buffer_wbinvl1; GFX10 expects an extra s_waitcnt_vscnt,
; `glc dlc`, and buffer_gl0_inv + buffer_gl1_inv.
1918 define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1919 ; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
1921 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1922 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc
1923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1924 ; GFX9-NEXT: buffer_wbinvl1
1925 ; GFX9-NEXT: ; return to shader part epilog
1927 ; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
1929 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1930 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1931 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
1932 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1933 ; GFX10-NEXT: buffer_gl0_inv
1934 ; GFX10-NEXT: buffer_gl1_inv
1935 ; GFX10-NEXT: ; return to shader part epilog
1936 %zext.offset = zext i32 %voffset to i64
1937 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1938 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1939 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1940 %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
1941 %cast.load = bitcast i32 %load to float
1942 ret float %cast.load
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset), returned as <2 x float>.
; Checks expect a saddr global_load_dwordx2. GFX9 expects `glc` followed by
; buffer_wbinvl1; GFX10 expects an extra s_waitcnt_vscnt, `glc dlc`, and
; buffer_gl0_inv + buffer_gl1_inv.
1945 define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1946 ; GFX9-LABEL: atomic_global_load_saddr_i64:
1948 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1949 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc
1950 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1951 ; GFX9-NEXT: buffer_wbinvl1
1952 ; GFX9-NEXT: ; return to shader part epilog
1954 ; GFX10-LABEL: atomic_global_load_saddr_i64:
1956 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1957 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1958 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
1959 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1960 ; GFX10-NEXT: buffer_gl0_inv
1961 ; GFX10-NEXT: buffer_gl1_inv
1962 ; GFX10-NEXT: ; return to shader part epilog
1963 %zext.offset = zext i32 %voffset to i64
1964 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1965 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1966 %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
1967 %cast.load = bitcast i64 %load to <2 x float>
1968 ret <2 x float> %cast.load
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset) - 128, returned as <2 x float>.
; Checks expect a saddr global_load_dwordx2 with offset:-128 for both targets.
; GFX9 expects `glc` followed by buffer_wbinvl1; GFX10 expects an extra s_waitcnt_vscnt,
; `glc dlc`, and buffer_gl0_inv + buffer_gl1_inv.
1971 define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1972 ; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
1974 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1975 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
1976 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1977 ; GFX9-NEXT: buffer_wbinvl1
1978 ; GFX9-NEXT: ; return to shader part epilog
1980 ; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
1982 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1983 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1984 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
1985 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1986 ; GFX10-NEXT: buffer_gl0_inv
1987 ; GFX10-NEXT: buffer_gl1_inv
1988 ; GFX10-NEXT: ; return to shader part epilog
1989 %zext.offset = zext i32 %voffset to i64
1990 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1991 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1992 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1993 %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
1994 %cast.load = bitcast i64 %load to <2 x float>
1995 ret <2 x float> %cast.load
1998 ; --------------------------------------------------------------------------------
; D16 lo load (lo16)
2000 ; --------------------------------------------------------------------------------
; Loads i16 from %sbase + zext(%voffset) and inserts it into element 0 of an undef
; <2 x i16> (element 1 stays undef); the vector is returned as <2 x half>.
; Checks expect one global_load_short_d16 writing v0 directly, with no extra moves.
2002 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2003 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2005 ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3]
2006 ; GCN-NEXT: s_waitcnt vmcnt(0)
2007 ; GCN-NEXT: ; return to shader part epilog
2008 %zext.offset = zext i32 %voffset to i64
2009 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2010 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2011 %load = load i16, i16 addrspace(1)* %gep0.cast
2012 %build = insertelement <2 x i16> undef, i16 %load, i32 0
2013 %cast = bitcast <2 x i16> %build to <2 x half>
2014 ret <2 x half> %cast
; Loads i16 from %sbase + zext(%voffset) - 128 and inserts it into element 0 of an
; undef <2 x i16> (element 1 stays undef); the vector is returned as <2 x half>.
; Checks expect one global_load_short_d16 writing v0 directly, with offset:-128.
2017 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2018 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2020 ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128
2021 ; GCN-NEXT: s_waitcnt vmcnt(0)
2022 ; GCN-NEXT: ; return to shader part epilog
2023 %zext.offset = zext i32 %voffset to i64
2024 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2025 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2026 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2027 %load = load i16, i16 addrspace(1)* %gep1.cast
2028 %build = insertelement <2 x i16> undef, i16 %load, i32 0
2029 %cast = bitcast <2 x i16> %build to <2 x half>
2030 ret <2 x half> %cast
; Loads i16 from %sbase + zext(%voffset) and inserts it into element 0 of a zero
; <2 x i16> (element 1 stays 0); the vector is returned as <2 x half>.
; Checks expect v1 to be set to 0 first, then global_load_short_d16 into v1, then a
; copy of v1 to the return register v0.
2033 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2034 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2036 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2037 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
2038 ; GCN-NEXT: s_waitcnt vmcnt(0)
2039 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2040 ; GCN-NEXT: ; return to shader part epilog
2041 %zext.offset = zext i32 %voffset to i64
2042 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2043 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2044 %load = load i16, i16 addrspace(1)* %gep0.cast
2045 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2046 %cast = bitcast <2 x i16> %build to <2 x half>
2047 ret <2 x half> %cast
; Loads i16 from %sbase + zext(%voffset) - 128 and inserts it into element 0 of a
; zero <2 x i16> (element 1 stays 0); the vector is returned as <2 x half>.
; Checks expect v1 to be set to 0 first, then global_load_short_d16 into v1 with
; offset:-128, then a copy of v1 to the return register v0.
2050 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2051 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2053 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2054 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
2055 ; GCN-NEXT: s_waitcnt vmcnt(0)
2056 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2057 ; GCN-NEXT: ; return to shader part epilog
2058 %zext.offset = zext i32 %voffset to i64
2059 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2060 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2061 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2062 %load = load i16, i16 addrspace(1)* %gep1.cast
2063 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2064 %cast = bitcast <2 x i16> %build to <2 x half>
2065 ret <2 x half> %cast
; Loads i16 from %sbase + zext(%voffset) and inserts it into element 0 of the
; incoming %reg vector (element 1 of %reg is kept); returned as <2 x half>.
; Checks expect global_load_short_d16 to write into v1 and the result to be copied to v0.
2068 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2069 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2071 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
2072 ; GCN-NEXT: s_waitcnt vmcnt(0)
2073 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2074 ; GCN-NEXT: ; return to shader part epilog
2075 %zext.offset = zext i32 %voffset to i64
2076 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2077 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2078 %load = load i16, i16 addrspace(1)* %gep0.cast
2079 %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2080 %cast = bitcast <2 x i16> %build to <2 x half>
2081 ret <2 x half> %cast
; Loads i16 from %sbase + zext(%voffset) - 128 and inserts it into element 0 of the
; incoming %reg vector (element 1 of %reg is kept); returned as <2 x half>.
; Checks expect global_load_short_d16 into v1 with offset:-128, then a copy to v0.
2084 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2085 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2087 ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
2088 ; GCN-NEXT: s_waitcnt vmcnt(0)
2089 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2090 ; GCN-NEXT: ; return to shader part epilog
2091 %zext.offset = zext i32 %voffset to i64
2092 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2093 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2094 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2095 %load = load i16, i16 addrspace(1)* %gep1.cast
2096 %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2097 %cast = bitcast <2 x i16> %build to <2 x half>
2098 ret <2 x half> %cast
; Loads i8 from %sbase + zext(%voffset), zero-extends it to i16 and inserts it into
; element 0 of the incoming %reg vector; returned as <2 x half>.
; Checks expect global_load_ubyte_d16 to write into v1 and the result to be copied to v0.
; NOTE(review): %gep0.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*),
; i.e. a no-op; the load could use %gep0 directly.
2101 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2102 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2104 ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3]
2105 ; GCN-NEXT: s_waitcnt vmcnt(0)
2106 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2107 ; GCN-NEXT: ; return to shader part epilog
2108 %zext.offset = zext i32 %voffset to i64
2109 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2110 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2111 %load = load i8, i8 addrspace(1)* %gep0.cast
2112 %zext.load = zext i8 %load to i16
2113 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2114 %cast = bitcast <2 x i16> %build to <2 x half>
2115 ret <2 x half> %cast
; Loads i8 from %sbase + zext(%voffset) - 128, zero-extends it to i16 and inserts it
; into element 0 of the incoming %reg vector; returned as <2 x half>.
; Checks expect global_load_ubyte_d16 into v1 with offset:-128, then a copy to v0.
; NOTE(review): %gep1.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*),
; i.e. a no-op; the load could use %gep1 directly.
2118 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2119 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2121 ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
2122 ; GCN-NEXT: s_waitcnt vmcnt(0)
2123 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2124 ; GCN-NEXT: ; return to shader part epilog
2125 %zext.offset = zext i32 %voffset to i64
2126 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2127 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2128 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2129 %load = load i8, i8 addrspace(1)* %gep1.cast
2130 %zext.load = zext i8 %load to i16
2131 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2132 %cast = bitcast <2 x i16> %build to <2 x half>
2133 ret <2 x half> %cast
; Loads i8 from %sbase + zext(%voffset), sign-extends it to i16 and inserts it into
; element 0 of the incoming %reg vector; returned as <2 x half>.
; Checks expect global_load_sbyte_d16 to write into v1 and the result to be copied to v0.
; NOTE(review): %gep0.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*),
; i.e. a no-op; the load could use %gep0 directly.
2136 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2137 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2139 ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3]
2140 ; GCN-NEXT: s_waitcnt vmcnt(0)
2141 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2142 ; GCN-NEXT: ; return to shader part epilog
2143 %zext.offset = zext i32 %voffset to i64
2144 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2145 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2146 %load = load i8, i8 addrspace(1)* %gep0.cast
2147 %sext.load = sext i8 %load to i16
2148 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2149 %cast = bitcast <2 x i16> %build to <2 x half>
2150 ret <2 x half> %cast
; Loads i8 from %sbase + zext(%voffset) - 128, sign-extends it to i16 and inserts it
; into element 0 of the incoming %reg vector; returned as <2 x half>.
; Checks expect global_load_sbyte_d16 into v1 with offset:-128, then a copy to v0.
; NOTE(review): %gep1.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*),
; i.e. a no-op; the load could use %gep1 directly.
2153 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2154 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2156 ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
2157 ; GCN-NEXT: s_waitcnt vmcnt(0)
2158 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2159 ; GCN-NEXT: ; return to shader part epilog
2160 %zext.offset = zext i32 %voffset to i64
2161 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2162 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2163 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2164 %load = load i8, i8 addrspace(1)* %gep1.cast
2165 %sext.load = sext i8 %load to i16
2166 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2167 %cast = bitcast <2 x i16> %build to <2 x half>
2168 ret <2 x half> %cast
2171 ; --------------------------------------------------------------------------------
2172 ; D16 hi load (hi16)
2173 ; --------------------------------------------------------------------------------
; D16 hi load of an i16 into an otherwise-undef vector, no immediate offset.
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of an
; undef <2 x i16>. The checks expect global_load_short_d16_hi writing v0
; directly (v0 is both the offset and the destination), with no extra copy.
2175 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2176 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2178 ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3]
2179 ; GCN-NEXT: s_waitcnt vmcnt(0)
2180 ; GCN-NEXT: ; return to shader part epilog
2181 %zext.offset = zext i32 %voffset to i64
2182 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2183 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2184 %load = load i16, i16 addrspace(1)* %gep0.cast
2185 %build = insertelement <2 x i16> undef, i16 %load, i32 1
2186 %cast = bitcast <2 x i16> %build to <2 x half>
2187 ret <2 x half> %cast
; D16 hi load of an i16 into an otherwise-undef vector, constant -128 offset.
; Address is %sbase + zext(%voffset) - 128; the loaded i16 goes into element 1
; of an undef <2 x i16>. The checks expect global_load_short_d16_hi with
; offset:-128 and v0 as both offset and destination.
2190 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2191 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
2193 ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128
2194 ; GCN-NEXT: s_waitcnt vmcnt(0)
2195 ; GCN-NEXT: ; return to shader part epilog
2196 %zext.offset = zext i32 %voffset to i64
2197 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2198 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2199 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2200 %load = load i16, i16 addrspace(1)* %gep1.cast
2201 %build = insertelement <2 x i16> undef, i16 %load, i32 1
2202 %cast = bitcast <2 x i16> %build to <2 x half>
2203 ret <2 x half> %cast
; D16 hi load of an i16 into a zero vector, no immediate offset.
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of
; zeroinitializer. The checks expect v1 to be set to 0 first, then
; global_load_short_d16_hi into v1, then a copy of v1 to v0.
2206 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2207 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
2209 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2210 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
2211 ; GCN-NEXT: s_waitcnt vmcnt(0)
2212 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2213 ; GCN-NEXT: ; return to shader part epilog
2214 %zext.offset = zext i32 %voffset to i64
2215 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2216 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2217 %load = load i16, i16 addrspace(1)* %gep0.cast
2218 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
2219 %cast = bitcast <2 x i16> %build to <2 x half>
2220 ret <2 x half> %cast
; D16 hi load of an i16 into a zero vector, constant -128 offset.
; Address is %sbase + zext(%voffset) - 128; the loaded i16 goes into element 1
; of zeroinitializer. The checks expect v1 zeroed, then
; global_load_short_d16_hi into v1 with offset:-128, then a copy of v1 to v0.
2223 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2224 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
2226 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2227 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
2228 ; GCN-NEXT: s_waitcnt vmcnt(0)
2229 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2230 ; GCN-NEXT: ; return to shader part epilog
2231 %zext.offset = zext i32 %voffset to i64
2232 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2233 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2234 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2235 %load = load i16, i16 addrspace(1)* %gep1.cast
2236 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
2237 %cast = bitcast <2 x i16> %build to <2 x half>
2238 ret <2 x half> %cast
; D16 hi load of an i16 into an incoming register vector, no immediate offset.
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of
; the %reg argument. The checks expect global_load_short_d16_hi into v1
; (offset in v0, base in s[2:3]) and then a copy of v1 to v0.
2241 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2242 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
2244 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
2245 ; GCN-NEXT: s_waitcnt vmcnt(0)
2246 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2247 ; GCN-NEXT: ; return to shader part epilog
2248 %zext.offset = zext i32 %voffset to i64
2249 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2250 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2251 %load = load i16, i16 addrspace(1)* %gep0.cast
2252 %build = insertelement <2 x i16> %reg, i16 %load, i32 1
2253 %cast = bitcast <2 x i16> %build to <2 x half>
2254 ret <2 x half> %cast
; D16 hi load of an i16 into an incoming register vector, constant -128 offset.
; Address is %sbase + zext(%voffset) - 128; the loaded i16 replaces element 1
; of %reg. The checks expect global_load_short_d16_hi into v1 with
; offset:-128, followed by a copy of v1 to v0.
2257 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2258 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
2260 ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
2261 ; GCN-NEXT: s_waitcnt vmcnt(0)
2262 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2263 ; GCN-NEXT: ; return to shader part epilog
2264 %zext.offset = zext i32 %voffset to i64
2265 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2266 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2267 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2268 %load = load i16, i16 addrspace(1)* %gep1.cast
2269 %build = insertelement <2 x i16> %reg, i16 %load, i32 1
2270 %cast = bitcast <2 x i16> %build to <2 x half>
2271 ret <2 x half> %cast
; D16 hi load of a zero-extended byte, no immediate offset.
; Loads an i8 from %sbase + zext(%voffset), zero-extends it to i16 and inserts
; it into element 1 of %reg. The checks expect global_load_ubyte_d16_hi into
; v1 and then a copy of v1 to v0.
; %gep0.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*).
2274 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2275 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
2277 ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3]
2278 ; GCN-NEXT: s_waitcnt vmcnt(0)
2279 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2280 ; GCN-NEXT: ; return to shader part epilog
2281 %zext.offset = zext i32 %voffset to i64
2282 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2283 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2284 %load = load i8, i8 addrspace(1)* %gep0.cast
2285 %zext.load = zext i8 %load to i16
2286 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
2287 %cast = bitcast <2 x i16> %build to <2 x half>
2288 ret <2 x half> %cast
; D16 hi load of a zero-extended byte, constant -128 offset.
; Address is %sbase + zext(%voffset) - 128; the loaded i8 is zero-extended to
; i16 and inserted into element 1 of %reg. The checks expect
; global_load_ubyte_d16_hi into v1 with offset:-128, then a copy of v1 to v0.
2291 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2292 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
2294 ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
2295 ; GCN-NEXT: s_waitcnt vmcnt(0)
2296 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2297 ; GCN-NEXT: ; return to shader part epilog
2298 %zext.offset = zext i32 %voffset to i64
2299 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2300 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2301 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2302 %load = load i8, i8 addrspace(1)* %gep1.cast
2303 %zext.load = zext i8 %load to i16
2304 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
2305 %cast = bitcast <2 x i16> %build to <2 x half>
2306 ret <2 x half> %cast
; D16 hi load of a sign-extended byte, no immediate offset.
; Loads an i8 from %sbase + zext(%voffset), sign-extends it to i16 and inserts
; it into element 1 of %reg. The checks expect global_load_sbyte_d16_hi into
; v1 and then a copy of v1 to v0.
; %gep0.cast is a same-type bitcast (i8 addrspace(1)* to i8 addrspace(1)*).
2309 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2310 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
2312 ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3]
2313 ; GCN-NEXT: s_waitcnt vmcnt(0)
2314 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2315 ; GCN-NEXT: ; return to shader part epilog
2316 %zext.offset = zext i32 %voffset to i64
2317 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2318 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2319 %load = load i8, i8 addrspace(1)* %gep0.cast
2320 %sext.load = sext i8 %load to i16
2321 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
2322 %cast = bitcast <2 x i16> %build to <2 x half>
2323 ret <2 x half> %cast
; D16 hi load of a sign-extended byte, constant -128 offset.
; Address is %sbase + zext(%voffset) - 128; the loaded i8 is sign-extended to
; i16 and inserted into element 1 of %reg. The checks expect
; global_load_sbyte_d16_hi into v1 with offset:-128, then a copy of v1 to v0.
2326 define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2327 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
2329 ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
2330 ; GCN-NEXT: s_waitcnt vmcnt(0)
2331 ; GCN-NEXT: v_mov_b32_e32 v0, v1
2332 ; GCN-NEXT: ; return to shader part epilog
2333 %zext.offset = zext i32 %voffset to i64
2334 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2335 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2336 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2337 %load = load i8, i8 addrspace(1)* %gep1.cast
2338 %sext.load = sext i8 %load to i16
2339 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
2340 %cast = bitcast <2 x i16> %build to <2 x half>
2341 ret <2 x half> %cast
2344 ; --------------------------------------------------------------------------------
2345 ; or-with-constant as add
2346 ; --------------------------------------------------------------------------------
2348 ; Check add-as-or with split 64-bit or.
; The address is built as inttoptr(zext(%idx) | 16) and an i8 is loaded from it,
; then zero-extended to i32. The checks expect the 16 to stay in a v_or_b32,
; the high half of the address to be a zero v1, and the load to use the
; v[0:1] / off form rather than an SGPR base.
; NOTE(review): %sbase (an addrspace(6) pointer) is not referenced by any
; instruction visible here -- confirm that is intentional.
2349 define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
2350 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
2352 ; GCN-NEXT: v_or_b32_e32 v0, 16, v0
2353 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2354 ; GCN-NEXT: global_load_ubyte v0, v[0:1], off
2355 ; GCN-NEXT: s_waitcnt vmcnt(0)
2356 ; GCN-NEXT: ; return to shader part epilog
2357 %zext.idx = zext i32 %idx to i64
2358 %or = or i64 %zext.idx, 16
2359 %addr = inttoptr i64 %or to i8 addrspace(1)*
2360 %load = load i8, i8 addrspace(1)* %addr
2361 %zext = zext i8 %load to i32
2362 %to.vgpr = bitcast i32 %zext to float
; Same shape as the or-with-16 case but with the constant 4160 (0x1040).
; The address is inttoptr(zext(%idx) | 4160); an i8 is loaded and zero-extended
; to i32. The checks expect v_or_b32 with the literal 0x1040, a zero v1 for the
; high half, and a load using the v[0:1] / off form.
; NOTE(review): %sbase is not referenced by any instruction visible here --
; confirm that is intentional.
2366 define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
2367 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
2369 ; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0
2370 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2371 ; GCN-NEXT: global_load_ubyte v0, v[0:1], off
2372 ; GCN-NEXT: s_waitcnt vmcnt(0)
2373 ; GCN-NEXT: ; return to shader part epilog
2374 %zext.idx = zext i32 %idx to i64
2375 %or = or i64 %zext.idx, 4160
2376 %addr = inttoptr i64 %or to i8 addrspace(1)*
2377 %load = load i8, i8 addrspace(1)* %addr
2378 %zext = zext i8 %load to i32
2379 %to.vgpr = bitcast i32 %zext to float
2383 ; --------------------------------------------------------------------------------
2384 ; Full 64-bit scalar add.
2385 ; --------------------------------------------------------------------------------
; Loop that performs a volatile float load from %arg[%i] on each iteration,
; with %i starting at 0 and the loop exiting to %bb2 once %i + 1 equals 256.
; The checks expect the address to be formed with a full 64-bit scalar add
; (s_add_u32 / s_addc_u32 of s[2:3] and the running offset s[0:1]) into s[4:5],
; a load that uses a zero v0 as the vector offset, the offset advanced by 4
; per iteration, and the exit compare against 0x400. The two targets differ in
; instruction order and cache-policy bits on the load (glc vs. glc dlc), and
; the gfx10 output has an extra s_waitcnt_depctr at the top of the loop.
2387 define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
2388 ; GFX9-LABEL: global_addr_64bit_lsr_iv:
2389 ; GFX9: ; %bb.0: ; %bb
2390 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2391 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2392 ; GFX9-NEXT: BB128_1: ; %bb3
2393 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2394 ; GFX9-NEXT: s_add_u32 s4, s2, s0
2395 ; GFX9-NEXT: s_addc_u32 s5, s3, s1
2396 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
2397 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2398 ; GFX9-NEXT: s_add_u32 s0, s0, 4
2399 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
2400 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
2401 ; GFX9-NEXT: s_cbranch_scc0 BB128_1
2402 ; GFX9-NEXT: ; %bb.2: ; %bb2
2403 ; GFX9-NEXT: s_endpgm
2405 ; GFX10-LABEL: global_addr_64bit_lsr_iv:
2406 ; GFX10: ; %bb.0: ; %bb
2407 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2408 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2409 ; GFX10-NEXT: BB128_1: ; %bb3
2410 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2411 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2412 ; GFX10-NEXT: s_add_u32 s4, s2, s0
2413 ; GFX10-NEXT: s_addc_u32 s5, s3, s1
2414 ; GFX10-NEXT: s_add_u32 s0, s0, 4
2415 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
2416 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2417 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
2418 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
2419 ; GFX10-NEXT: s_cbranch_scc0 BB128_1
2420 ; GFX10-NEXT: ; %bb.2: ; %bb2
2421 ; GFX10-NEXT: s_endpgm
2428 bb3: ; preds = %bb3, %bb
2429 %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
2430 %i4 = zext i32 %i to i64
2431 %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
2432 %i6 = load volatile float, float addrspace(1)* %i5, align 4
2433 %i8 = add nuw nsw i32 %i, 1
2434 %i9 = icmp eq i32 %i8, 256
2435 br i1 %i9, label %bb2, label %bb3
2438 ; Make sure we only have a single zero vaddr initialization.
; Same loop as the single-load variant (256 iterations, index %i), but with two
; volatile float loads per iteration. The checks expect one v_mov_b32 of 0 into
; v0 ahead of the loop, shared by both global_load_dword instructions, and both
; loads addressing through s[4:5].
; NOTE(review): %i6.1 loads from %i5 (derived from %arg), while %i5.1 (derived
; from %arg.1) has no user among the instructions visible here. This looks like
; a typo for %i5.1 -- confirm. Changing it would require regenerating the
; check lines, since both expected loads currently use the same base.
2440 define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
2441 ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
2442 ; GFX9: ; %bb.0: ; %bb
2443 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2444 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2445 ; GFX9-NEXT: BB129_1: ; %bb3
2446 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2447 ; GFX9-NEXT: s_add_u32 s4, s2, s0
2448 ; GFX9-NEXT: s_addc_u32 s5, s3, s1
2449 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
2450 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2451 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
2452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2453 ; GFX9-NEXT: s_add_u32 s0, s0, 4
2454 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
2455 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
2456 ; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
2457 ; GFX9-NEXT: s_cbranch_scc0 BB129_1
2458 ; GFX9-NEXT: ; %bb.2: ; %bb2
2459 ; GFX9-NEXT: s_endpgm
2461 ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
2462 ; GFX10: ; %bb.0: ; %bb
2463 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2464 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2465 ; GFX10-NEXT: BB129_1: ; %bb3
2466 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2467 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2468 ; GFX10-NEXT: s_add_u32 s4, s2, s0
2469 ; GFX10-NEXT: s_addc_u32 s5, s3, s1
2470 ; GFX10-NEXT: s_add_u32 s0, s0, 4
2471 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
2472 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2473 ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
2474 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2475 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
2476 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
2477 ; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
2478 ; GFX10-NEXT: s_cbranch_scc0 BB129_1
2479 ; GFX10-NEXT: ; %bb.2: ; %bb2
2480 ; GFX10-NEXT: s_endpgm
2487 bb3: ; preds = %bb3, %bb
2488 %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
2489 %i4 = zext i32 %i to i64
2490 %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
2491 %i6 = load volatile float, float addrspace(1)* %i5, align 4
2492 %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4
2493 %i6.1 = load volatile float, float addrspace(1)* %i5, align 4
2494 %i8 = add nuw nsw i32 %i, 1
2495 %i9 = icmp eq i32 %i8, 256
2496 br i1 %i9, label %bb2, label %bb3
; Metadata nodes holding the i32 pairs {0, 1 << 30} and {0, (1 << 30) + 1}.
; NOTE(review): no use of !0 or !1 is visible in this part of the file;
; presumably they are attached as !range metadata elsewhere -- confirm.
2499 !0 = !{i32 0, i32 1073741824} ; (1 << 30)
2500 !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1