1 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
2 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
3 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
4 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
6 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
10 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
; Global (addrspace(1)) i32 load through a GEP with constant index 7. The GEP
; is computed before the branch, while the loaded value reaches the phi from
; %if. The bonaire opt run expects the typed GEP on %in to disappear and an i8
; GEP to be emitted; the tonga opt run expects the typed GEP on %in to remain.
12 ; OPT-LABEL: @test_sink_global_small_offset_i32(
13 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
14 ; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
16 ; OPT-CI: getelementptr i8,
18 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
19 define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
21 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
22 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
23 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
24 %tmp0 = icmp eq i32 %tid, 0
25 br i1 %tmp0, label %endif, label %if
28 %tmp1 = load i32, i32 addrspace(1)* %in.gep
32 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
33 store i32 %x, i32 addrspace(1)* %out.gep
; Sign-extended i8 global load at constant byte offset 65535. Every opt run
; expects %in.gep to keep its original form. The tahiti/bonaire/tonga llc runs
; expect a buffer_load_sbyte whose line ends at an SGPR offset operand (no
; immediate offset suffix); the gfx900 llc run expects 0xf000 to be added to
; the 64-bit address and the remaining 4095 to be encoded in the load's
; offset field (0xf000 + 4095 = 65535).
40 ; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
41 ; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
44 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
45 ; GCN: s_and_saveexec_b64
46 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
48 ; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xf000,
49 ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0,
50 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
53 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
55 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
56 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
57 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
58 %tmp0 = icmp eq i32 %tid, 0
59 br i1 %tmp0, label %endif, label %if
62 %tmp1 = load i8, i8 addrspace(1)* %in.gep
63 %tmp2 = sext i8 %tmp1 to i32
67 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
68 store i32 %x, i32 addrspace(1)* %out.gep
; Sign-extended i8 global load at constant byte offset 4095. The
; tahiti/bonaire/tonga llc runs expect the whole offset as the immediate of
; buffer_load_sbyte (soffset 0, offset 4095); the gfx900 llc run expects it as
; the immediate of global_load_sbyte.
75 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
76 ; GCN: s_and_saveexec_b64
77 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
78 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
81 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
83 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
84 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
85 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
86 %tmp0 = icmp eq i32 %tid, 0
87 br i1 %tmp0, label %endif, label %if
90 %tmp1 = load i8, i8 addrspace(1)* %in.gep
91 %tmp2 = sext i8 %tmp1 to i32
95 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
96 store i32 %x, i32 addrspace(1)* %out.gep
; Sign-extended i8 global load at constant byte offset 4096, one more than in
; the 4095 case. The tahiti/bonaire/tonga llc runs expect buffer_load_sbyte to
; end at an SGPR offset operand with no immediate offset; the gfx900 llc run
; expects global_load_sbyte with no offset suffix at all.
103 ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
104 ; GCN: s_and_saveexec_b64
105 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
106 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
109 define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
111 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
112 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
113 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
114 %tmp0 = icmp eq i32 %tid, 0
115 br i1 %tmp0, label %endif, label %if
118 %tmp1 = load i8, i8 addrspace(1)* %in.gep
119 %tmp2 = sext i8 %tmp1 to i32
123 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
124 store i32 %x, i32 addrspace(1)* %out.gep
; Volatile store/load of an addrspace(5) alloca element at constant index 1022
; (byte offset 4088 from the alloca). All opt runs expect the typed
; [512 x i32] GEP to be replaced by an i8 GEP. All llc runs expect the
; store and load to use no VGPR address ("off") and an immediate offset of
; 4092. %add.arg is not used by any instruction visible in this test.
131 ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
132 ; OPT-NOT: getelementptr [512 x i32]
134 ; OPT: getelementptr i8,
136 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
137 ; GCN: s_and_saveexec_b64
138 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
139 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
141 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
143 %alloca = alloca [512 x i32], align 4, addrspace(5)
144 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
145 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
146 %add.arg = add i32 %arg, 8
147 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
148 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
149 %tmp0 = icmp eq i32 %tid, 0
150 br i1 %tmp0, label %endif, label %if
153 store volatile i32 123, i32 addrspace(5)* %alloca.gep
154 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
158 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
159 store i32 %x, i32 addrspace(1)* %out.gep.0
160 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
161 store i32 %load, i32 addrspace(1)* %out.gep.1
; Same shape as the index-1022 scratch test, but with constant index 1023
; (byte offset 4092 from the alloca). The opt runs still expect the typed GEP
; to become an i8 GEP. The llc runs expect the base value 4 to be materialized
; in a VGPR and used as an "offen" address together with immediate offset
; 4092, once for the store and once for the load.
168 ; This ends up not fitting due to the reserved 4 bytes at offset 0
169 ; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
170 ; OPT-NOT: getelementptr [512 x i32]
172 ; OPT: getelementptr i8,
174 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
175 ; GCN: s_and_saveexec_b64
176 ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
177 ; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
178 ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
179 ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
180 ; GCN: {{^BB[0-9]+}}_2:
182 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
184 %alloca = alloca [512 x i32], align 4, addrspace(5)
185 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
186 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
187 %add.arg = add i32 %arg, 8
188 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
189 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
190 %tmp0 = icmp eq i32 %tid, 0
191 br i1 %tmp0, label %endif, label %if
194 store volatile i32 123, i32 addrspace(5)* %alloca.gep
195 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
199 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
200 store i32 %x, i32 addrspace(1)* %out.gep.0
201 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
202 store i32 %load, i32 addrspace(1)* %out.gep.1
; Negative case for scratch sinking, using constant index 1024 (byte offset
; 4096 from the alloca). The opt runs expect %alloca.gep to survive in its
; original typed form. The llc runs expect the store and load to address
; through a VGPR ("offen") with no immediate offset suffix.
209 ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
210 ; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
214 ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
215 ; GCN: s_and_saveexec_b64
216 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
217 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
218 ; GCN: {{^BB[0-9]+}}_2:
219 define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
221 %alloca = alloca [512 x i32], align 4, addrspace(5)
222 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
223 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
224 %add.arg = add i32 %arg, 8
225 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
226 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
227 %tmp0 = icmp eq i32 %tid, 0
228 br i1 %tmp0, label %endif, label %if
231 store volatile i32 123, i32 addrspace(5)* %alloca.gep
232 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
236 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
237 store i32 %x, i32 addrspace(1)* %out.gep.0
238 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
239 store i32 %load, i32 addrspace(1)* %out.gep.1
; Global i32 load whose GEP index is a kernel argument (%offset zero-extended
; to i64) rather than a constant. Only llc output is checked here: the bonaire
; run expects an addr64 buffer_load_dword with no immediate offset, and the
; tonga run expects a flat_load_dword.
246 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
247 ; GCN: s_and_saveexec_b64
248 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
249 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
250 ; GCN: {{^BB[0-9]+}}_2:
251 define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
253 %offset.ext = zext i32 %offset to i64
254 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
255 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
256 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
257 %tmp0 = icmp eq i32 %tid, 0
258 br i1 %tmp0, label %endif, label %if
261 %tmp1 = load i32, i32 addrspace(1)* %in.gep
265 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
266 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space (addrspace(4)) i32 load at constant index 7. All opt
; runs expect no typed addrspace(4) GEP to remain. The tahiti llc run expects
; the offset as the s_load_dword immediate 0x7.
273 ; OPT-LABEL: @test_sink_constant_small_offset_i32
274 ; OPT-NOT: getelementptr i32, i32 addrspace(4)*
277 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
278 ; GCN: s_and_saveexec_b64
279 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
280 ; GCN: s_or_b64 exec, exec
281 define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
283 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
284 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
285 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
286 %tmp0 = icmp eq i32 %tid, 0
287 br i1 %tmp0, label %endif, label %if
290 %tmp1 = load i32, i32 addrspace(4)* %in.gep
294 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
295 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 255. All opt runs expect
; no typed addrspace(4) GEP to remain. The tahiti llc run expects the offset
; as the s_load_dword immediate 0xff.
302 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
303 ; OPT-NOT: getelementptr i32, i32 addrspace(4)*
306 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
307 ; GCN: s_and_saveexec_b64
308 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
309 ; GCN: s_or_b64 exec, exec
310 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
312 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
313 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
314 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
315 %tmp0 = icmp eq i32 %tid, 0
316 br i1 %tmp0, label %endif, label %if
319 %tmp1 = load i32, i32 addrspace(4)* %in.gep
323 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
324 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 256 (byte offset 0x400),
; one past the 255 case. The tahiti opt run expects the typed GEP to remain,
; while the bonaire and tonga opt runs expect it to be gone. The tahiti llc
; run expects 0x400 to be materialized in an SGPR that is then used as the
; s_load_dword offset operand.
331 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
332 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
333 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
334 ; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)*
337 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
338 ; GCN: s_and_saveexec_b64
339 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
341 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
342 ; GCN: s_or_b64 exec, exec
343 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
345 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
346 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
347 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
348 %tmp0 = icmp eq i32 %tid, 0
349 br i1 %tmp0, label %endif, label %if
352 %tmp1 = load i32, i32 addrspace(4)* %in.gep
356 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
357 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 4294967295, i.e. byte
; offset 0x3_FFFF_FFFC. The tahiti opt run expects the typed GEP to remain and
; the bonaire opt run expects it to be gone. All llc runs expect the offset to
; be added to the pointer as a 64-bit scalar add (low word -4, high word 3);
; the tahiti llc run then expects s_load_dword with immediate 0x0.
364 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
365 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
366 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
369 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
370 ; GCN: s_and_saveexec_b64
371 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
372 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
373 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
374 ; GCN: s_or_b64 exec, exec
375 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
377 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
378 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
379 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
380 %tmp0 = icmp eq i32 %tid, 0
381 br i1 %tmp0, label %endif, label %if
384 %tmp1 = load i32, i32 addrspace(4)* %in.gep
388 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
389 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 17179869181. All opt runs
; expect the typed addrspace(4) GEP to remain. The tahiti llc run expects
; s_load_dword with immediate 0x0.
396 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
397 ; OPT: getelementptr i32, i32 addrspace(4)*
400 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
401 ; GCN: s_and_saveexec_b64
404 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
405 ; GCN: s_or_b64 exec, exec
406 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
408 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
409 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
410 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
411 %tmp0 = icmp eq i32 %tid, 0
412 br i1 %tmp0, label %endif, label %if
415 %tmp1 = load i32, i32 addrspace(4)* %in.gep
419 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
420 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 262143 (byte offset
; 0xffffc). Only llc output is checked. The tahiti run expects 0xffffc in an
; SGPR used as the s_load_dword offset operand; the bonaire run expects the
; immediate 0x3ffff (the index itself); the tonga run expects the immediate
; 0xffffc (the byte offset).
427 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
428 ; GCN: s_and_saveexec_b64
429 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
430 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
432 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
433 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
435 ; GCN: s_or_b64 exec, exec
436 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
438 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
439 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
440 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
441 %tmp0 = icmp eq i32 %tid, 0
442 br i1 %tmp0, label %endif, label %if
445 %tmp1 = load i32, i32 addrspace(4)* %in.gep
449 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
450 store i32 %x, i32 addrspace(1)* %out.gep
; Constant-address-space i32 load at constant index 262144 (byte offset
; 0x100000), one past the 262143 case. The tahiti and tonga opt runs expect
; the typed GEP to remain; the bonaire opt run expects it to be gone. In llc
; output, tahiti and tonga expect 0x100000 in an SGPR used as the s_load_dword
; offset operand, while bonaire expects the immediate 0x40000.
457 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
458 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
459 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
460 ; OPT-VI: getelementptr i32, i32 addrspace(4)*
463 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
464 ; GCN: s_and_saveexec_b64
465 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
466 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
468 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
470 ; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
471 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
473 ; GCN: s_or_b64 exec, exec
474 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
476 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
477 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
478 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
479 %tmp0 = icmp eq i32 %tid, 0
480 br i1 %tmp0, label %endif, label %if
483 %tmp1 = load i32, i32 addrspace(4)* %in.gep
487 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
488 store i32 %x, i32 addrspace(1)* %out.gep
; %struct.foo holds two [3 x float] arrays, so field 1 starts at byte 12.
; sink_ds_address loads elements 0 and 2 of field 1 (bytes 12 and 20, i.e.
; dwords 3 and 5) from a local (addrspace(3)) pointer and branches on their
; comparison. The opt runs expect an inbounds i8 GEP; the llc runs expect both
; loads to be covered by one ds_read2_b32 with offset0:3 offset1:5.
; The destination register-pair pattern below previously lacked the closing
; bracket of its first character class, which made the whole pattern a single
; bracket expression that matched only by accident; it now spells out
; "digits, colon, digits" explicitly.
495 %struct.foo = type { [3 x float], [3 x float] }
497 ; OPT-LABEL: @sink_ds_address(
498 ; OPT: getelementptr inbounds i8,
500 ; GCN-LABEL: {{^}}sink_ds_address:
501 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
502 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
503 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
504 define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
506 %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
507 %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
511 %a = load float, float addrspace(3)* %x, align 4
512 %b = load float, float addrspace(3)* %y, align 4
513 %cmp = fcmp one float %a, %b
514 br i1 %cmp, label %bb34, label %bb33
; Constant-address-space i8 GEP at byte offset 4095, bitcast to an i32 pointer
; and loaded with align 1. Only opt output is checked: an i8 GEP with the
; constant 4095 must be present.
523 ; Address offset is not a multiple of 4. This is a valid mubuf offset,
526 ; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
529 ; OPT: getelementptr i8, {{.*}} 4095
530 define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
532 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
533 %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
534 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
535 %tmp0 = icmp eq i32 %tid, 0
536 br i1 %tmp0, label %endif, label %if
539 %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
540 %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
544 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
545 store i32 %x, i32 addrspace(1)* %out.gep
; Local (addrspace(3)) atomicrmw add whose pointer operand is a GEP with
; constant index 7 (28 bytes). The opt runs expect the address to be rebuilt
; next to the atomic: %in bitcast to i8*, an i8 GEP named %sunkaddr with
; offset 28, a bitcast back to i32*, and the atomicrmw using that pointer.
552 ; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
553 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
554 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
555 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
556 ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
557 define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
559 %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
560 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
561 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
562 %tmp0 = icmp eq i32 %tid, 0
563 br i1 %tmp0, label %endif, label %if
566 %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
570 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
571 store i32 %x, i32 addrspace(3)* %out.gep
; Local (addrspace(3)) cmpxchg whose pointer operand is a GEP with constant
; index 7 (28 bytes). The opt runs expect the same rebuilt-address sequence as
; for atomicrmw: bitcast to i8*, %sunkaddr i8 GEP with offset 28, bitcast back,
; and the cmpxchg using that pointer.
578 ; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
579 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
580 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
581 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
582 ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
583 define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
585 %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
586 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
587 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
588 %tmp0 = icmp eq i32 %tid, 0
589 br i1 %tmp0, label %endif, label %if
592 %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
593 %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
597 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
598 store i32 %x, i32 addrspace(3)* %out.gep
; Negative case: here %in.gep is the compare-value operand of the cmpxchg, not
; its pointer operand (the pointer is undef). The opt runs expect %in.gep to
; stay in its original typed form and the cmpxchg to keep using it unchanged.
605 ; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
606 ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
608 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
609 define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
611 %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
612 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
613 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
614 %tmp0 = icmp eq i32 %tid, 0
615 br i1 %tmp0, label %endif, label %if
618 %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
619 %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
623 %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
624 store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
; Local (addrspace(3)) llvm.amdgcn.atomic.inc call whose first argument is a
; GEP with constant index 7 (28 bytes). The opt runs expect the address to be
; rebuilt as bitcast, %sunkaddr i8 GEP with offset 28, bitcast back, and the
; intrinsic call using that pointer.
631 ; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
632 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
633 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
634 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
635 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
636 define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
638 %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
639 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
640 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
641 %tmp0 = icmp eq i32 %tid, 0
642 br i1 %tmp0, label %endif, label %if
645 %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
649 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
650 store i32 %x, i32 addrspace(3)* %out.gep
; Local (addrspace(3)) llvm.amdgcn.atomic.dec call whose first argument is a
; GEP with constant index 7 (28 bytes). The opt runs expect the address to be
; rebuilt as bitcast, %sunkaddr i8 GEP with offset 28, bitcast back, and the
; intrinsic call using that pointer.
657 ; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
658 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
659 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
660 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
661 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
662 define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
664 %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
665 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
666 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
667 %tmp0 = icmp eq i32 %tid, 0
668 br i1 %tmp0, label %endif, label %if
671 %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
675 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
676 store i32 %x, i32 addrspace(3)* %out.gep
; Sign-extended i8 global load at constant byte offset -4096. The
; tahiti/bonaire/tonga opt runs expect %in.gep to stay as written and the load
; to keep using it. The gfx900 opt run expects a %sunkaddr i8 GEP with -4096
; feeding the load, and the gfx900 llc run expects the offset as the
; global_load_sbyte immediate.
683 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
684 ; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
686 ; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
689 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
690 ; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
692 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
693 ; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
694 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
696 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
697 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
698 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
699 %tmp0 = icmp eq i32 %tid, 0
700 br i1 %tmp0, label %endif, label %if
703 %tmp1 = load i8, i8 addrspace(1)* %in.gep
704 %tmp2 = sext i8 %tmp1 to i32
708 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
709 store i32 %x, i32 addrspace(1)* %out.gep
; Sign-extended i8 global load at constant byte offset -4097, one below the
; -4096 case. Every opt run expects %in.gep to stay as written and the load to
; keep using it. For llc only the function label is checked.
716 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
717 ; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
719 ; OPT: load i8, i8 addrspace(1)* %in.gep
721 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
722 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
724 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
725 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
726 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
727 %tmp0 = icmp eq i32 %tid, 0
728 br i1 %tmp0, label %endif, label %if
731 %tmp1 = load i8, i8 addrspace(1)* %in.gep
732 %tmp2 = sext i8 %tmp1 to i32
736 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
737 store i32 %x, i32 addrspace(1)* %out.gep
; Intrinsics used by the tests above. llvm.amdgcn.mbcnt.lo supplies the %tid
; value that each test compares against 0 to form its branch condition; the
; atomic inc/dec intrinsics are the addrspace(3) operations exercised by the
; local atomic tests.
; Attribute groups: #0 is attached to mbcnt.lo and its call sites, #2 to the
; atomic inc/dec declarations. #1 is not referenced in the lines visible here.
744 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
745 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
746 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
748 attributes #0 = { nounwind readnone }
749 attributes #1 = { nounwind }
750 attributes #2 = { nounwind argmemonly }