1 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
2 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
3 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
4 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; Data layout shared by every kernel in this file. The trailing "A5" selects
; address space 5 for allocas, which matches the addrspace(5) allocas used by
; the scratch tests; "p3:32:32" and "p5:32:32" give address spaces 3 and 5
; 32-bit pointers, matching the i32 GEP indices used on those pointers.
10 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
12 ; OPT-LABEL: @test_sink_global_small_offset_i32(
13 ; OPT-CI-NOT: getelementptr i32, ptr addrspace(1) %in
14 ; OPT-VI: getelementptr i32, ptr addrspace(1) %in
16 ; OPT-CI: getelementptr i8,
18 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; Test kernel. %in.gep is %in plus 7 i32 elements (28 bytes) in addrspace(1).
; The phi shows the value loaded through %in.gep arriving from %if, with 0
; arriving from %entry; the merged value is stored through %out.gep.
19 define amdgpu_kernel void @test_sink_global_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
21 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
22 %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 7
23 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
24 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
25 br i1 %tmp0, label %endif, label %if
28 %tmp1 = load i32, ptr addrspace(1) %in.gep
32 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
33 store i32 %x, ptr addrspace(1) %out.gep
40 ; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
41 ; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
44 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
45 ; GCN: s_and_saveexec_b64
46 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
48 ; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
49 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; Test kernel. %in.gep is a byte GEP at constant offset 65535 from %in in
; addrspace(1). An i8 is loaded through it, sign-extended to i32, merged with
; 0 by the phi and stored through %out.gep.
52 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
54 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
55 %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
56 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
57 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
58 br i1 %tmp0, label %endif, label %if
61 %tmp1 = load i8, ptr addrspace(1) %in.gep
62 %tmp2 = sext i8 %tmp1 to i32
66 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
67 store i32 %x, ptr addrspace(1) %out.gep
74 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
75 ; GCN: s_and_saveexec_b64
76 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
77 ; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
78 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; Test kernel. %in.gep is a byte GEP at constant offset 4095 from %in in
; addrspace(1); the checks for this kernel expect that 4095 to appear as an
; immediate "offset" on the load. The i8 loaded through it is sign-extended,
; merged with 0 by the phi and stored through %out.gep.
81 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
83 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
84 %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4095
85 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
86 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
87 br i1 %tmp0, label %endif, label %if
90 %tmp1 = load i8, ptr addrspace(1) %in.gep
91 %tmp2 = sext i8 %tmp1 to i32
95 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
96 store i32 %x, ptr addrspace(1) %out.gep
103 ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
104 ; GCN: s_and_saveexec_b64
105 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
106 ; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
107 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
; Test kernel. %in.gep is a byte GEP at constant offset 4096 (0x1000) from %in
; in addrspace(1). The i8 loaded through it is sign-extended, merged with 0 by
; the phi and stored through %out.gep.
110 define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
112 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
113 %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4096
114 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
115 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
116 br i1 %tmp0, label %endif, label %if
119 %tmp1 = load i8, ptr addrspace(1) %in.gep
120 %tmp2 = sext i8 %tmp1 to i32
124 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
125 store i32 %x, ptr addrspace(1) %out.gep
132 ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
133 ; OPT-NOT: getelementptr [512 x i32]
135 ; OPT: getelementptr i8,
137 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
138 ; GCN: s_and_saveexec_b64
139 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
140 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
; Test kernel. %alloca.gep addresses i32 index 1022 (byte offset 4088) from the
; start of a [512 x i32] alloca in addrspace(5). A volatile store of 123 and a
; volatile load go through it before the phi, and a second volatile load goes
; through it after the phi. Both loaded values are stored to %out.
142 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
144 %alloca = alloca [512 x i32], align 4, addrspace(5)
145 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
146 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
; NOTE(review): %add.arg is not used by any line visible in this excerpt.
147 %add.arg = add i32 %arg, 8
148 %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1022
149 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
150 %tmp0 = icmp eq i32 %tid, 0
; The first store/load pair is skipped (branch to %endif) when %tid is 0.
151 br i1 %tmp0, label %endif, label %if
154 store volatile i32 123, ptr addrspace(5) %alloca.gep
155 %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
159 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
160 store i32 %x, ptr addrspace(1) %out.gep.0
161 %load = load volatile i32, ptr addrspace(5) %alloca.gep
162 store i32 %load, ptr addrspace(1) %out.gep.1
169 ; This used to be a special case when the scavenge slot was
171 ; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
172 ; OPT-NOT: getelementptr [512 x i32]
174 ; OPT: getelementptr i8,
176 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
177 ; GCN: s_and_saveexec_b64
178 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
179 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
180 ; GCN: {{^.LBB[0-9]+}}_2:
; Test kernel. Same shape as a scratch access through a constant-index GEP, but
; here %alloca.gep uses i32 index 1023 (byte offset 4092) from the start of the
; [512 x i32] alloca in addrspace(5). A volatile store of 123 and a volatile
; load precede the phi; a second volatile load follows it.
182 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
184 %alloca = alloca [512 x i32], align 4, addrspace(5)
185 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
186 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
; NOTE(review): %add.arg is not used by any line visible in this excerpt.
187 %add.arg = add i32 %arg, 8
188 %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1023
189 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
190 %tmp0 = icmp eq i32 %tid, 0
; The first store/load pair is skipped (branch to %endif) when %tid is 0.
191 br i1 %tmp0, label %endif, label %if
194 store volatile i32 123, ptr addrspace(5) %alloca.gep
195 %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
199 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
200 store i32 %x, ptr addrspace(1) %out.gep.0
201 %load = load volatile i32, ptr addrspace(5) %alloca.gep
202 store i32 %load, ptr addrspace(1) %out.gep.1
209 ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
210 ; OPT: %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
214 ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
215 ; GCN: s_and_saveexec_b64
216 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
217 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
218 ; GCN: {{^.LBB[0-9]+}}_2:
; Test kernel. %alloca.gep uses i32 index 1024 (byte offset 4096) from the
; start of the [512 x i32] alloca in addrspace(5); the checks for this kernel
; expect the original GEP to remain and the scratch accesses to use "offen"
; addressing. A volatile store of 123 and a volatile load precede the phi; a
; second volatile load follows it.
219 define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
221 %alloca = alloca [512 x i32], align 4, addrspace(5)
222 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
223 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
; NOTE(review): %add.arg is not used by any line visible in this excerpt.
224 %add.arg = add i32 %arg, 8
225 %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
226 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
227 %tmp0 = icmp eq i32 %tid, 0
; The first store/load pair is skipped (branch to %endif) when %tid is 0.
228 br i1 %tmp0, label %endif, label %if
231 store volatile i32 123, ptr addrspace(5) %alloca.gep
232 %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
236 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
237 store i32 %x, ptr addrspace(1) %out.gep.0
238 %load = load volatile i32, ptr addrspace(5) %alloca.gep
239 store i32 %load, ptr addrspace(1) %out.gep.1
246 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
247 ; GCN: s_and_saveexec_b64
248 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
249 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
250 ; GCN: {{^.LBB[0-9]+}}_2:
; Test kernel. Unlike a constant-offset GEP, %in.gep here is indexed by the
; kernel argument %offset zero-extended to i64, so the address has a variable
; component. The i32 loaded through it is merged with 0 by the phi and stored
; through %out.gep.
251 define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %offset) {
253 %offset.ext = zext i32 %offset to i64
254 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
255 %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 %offset.ext
256 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
257 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
258 br i1 %tmp0, label %endif, label %if
261 %tmp1 = load i32, ptr addrspace(1) %in.gep
265 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
266 store i32 %x, ptr addrspace(1) %out.gep
273 ; OPT-LABEL: @test_sink_constant_small_offset_i32
274 ; OPT-NOT: getelementptr i32, ptr addrspace(4)
277 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
278 ; GCN: s_and_saveexec_b64
279 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
280 ; GCN: s_or_b64 exec, exec
; Test kernel. %in is an addrspace(4) pointer and %in.gep is %in plus 7 i32
; elements (28 bytes). The i32 loaded through it is merged with 0 by the phi
; and stored through %out.gep in addrspace(1).
281 define amdgpu_kernel void @test_sink_constant_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
283 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
284 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 7
285 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
286 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
287 br i1 %tmp0, label %endif, label %if
290 %tmp1 = load i32, ptr addrspace(4) %in.gep
294 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
295 store i32 %x, ptr addrspace(1) %out.gep
302 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
303 ; OPT-NOT: getelementptr i32, ptr addrspace(4)
306 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
307 ; GCN: s_and_saveexec_b64
308 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
309 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 255 (0xff) i32 elements in addrspace(4),
; i.e. the largest element index that fits in 8 bits. The i32 loaded through
; it is merged with 0 by the phi and stored through %out.gep.
310 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
312 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
313 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 255
314 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
315 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
316 br i1 %tmp0, label %endif, label %if
319 %tmp1 = load i32, ptr addrspace(4) %in.gep
323 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
324 store i32 %x, ptr addrspace(1) %out.gep
331 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
332 ; OPT-SI: getelementptr i32, ptr addrspace(4)
333 ; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
334 ; OPT-VI-NOT: getelementptr i32, ptr addrspace(4)
337 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
338 ; GCN: s_and_saveexec_b64
339 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
341 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
342 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 256 i32 elements in addrspace(4), a byte
; offset of 1024 (0x400) and the first element index that no longer fits in
; 8 bits. The i32 loaded through it is merged with 0 by the phi and stored
; through %out.gep.
343 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
345 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
346 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 256
347 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
348 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
349 br i1 %tmp0, label %endif, label %if
352 %tmp1 = load i32, ptr addrspace(4) %in.gep
356 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
357 store i32 %x, ptr addrspace(1) %out.gep
364 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
365 ; OPT-SI: getelementptr i32, ptr addrspace(4)
366 ; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
369 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
370 ; GCN: s_and_saveexec_b64
371 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
372 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
373 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
375 ; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
376 ; VI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
377 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
379 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}
381 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 4294967295 (2^32 - 1) i32 elements in
; addrspace(4), the largest element index that fits in 32 bits. The i32 loaded
; through it is merged with 0 by the phi and stored through %out.gep.
382 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
384 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
385 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 4294967295
386 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
387 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
388 br i1 %tmp0, label %endif, label %if
391 %tmp1 = load i32, ptr addrspace(4) %in.gep
395 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
396 store i32 %x, ptr addrspace(1) %out.gep
403 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
404 ; OPT: getelementptr i32, ptr addrspace(4)
407 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
408 ; GCN: s_and_saveexec_b64
411 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
412 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 17179869181 i32 elements in addrspace(4),
; an element index that needs more than 32 bits. The checks for this kernel
; expect the original i32 GEP to remain. The i32 loaded through it is merged
; with 0 by the phi and stored through %out.gep.
413 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
415 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
416 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 17179869181
417 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
418 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
419 br i1 %tmp0, label %endif, label %if
422 %tmp1 = load i32, ptr addrspace(4) %in.gep
426 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
427 store i32 %x, ptr addrspace(1) %out.gep
434 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
435 ; GCN: s_and_saveexec_b64
436 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
437 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
439 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
440 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
442 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 262143 (0x3ffff) i32 elements in
; addrspace(4), a byte offset of 1048572 (0xffffc), which still fits in 20
; bits. The i32 loaded through it is merged with 0 by the phi and stored
; through %out.gep.
443 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
445 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
446 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262143
447 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
448 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
449 br i1 %tmp0, label %endif, label %if
452 %tmp1 = load i32, ptr addrspace(4) %in.gep
456 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
457 store i32 %x, ptr addrspace(1) %out.gep
464 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
465 ; OPT-SI: getelementptr i32, ptr addrspace(4)
466 ; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
467 ; OPT-VI: getelementptr i32, ptr addrspace(4)
470 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
471 ; GCN: s_and_saveexec_b64
472 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
473 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
475 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
477 ; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
478 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
480 ; GCN: s_or_b64 exec, exec
; Test kernel. %in.gep is %in plus 262144 (0x40000) i32 elements in
; addrspace(4), a byte offset of 1048576 (0x100000), the first 4-byte-aligned
; byte offset that no longer fits in 20 bits. The i32 loaded through it is
; merged with 0 by the phi and stored through %out.gep.
481 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
483 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
484 %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262144
485 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
486 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
487 br i1 %tmp0, label %endif, label %if
490 %tmp1 = load i32, ptr addrspace(4) %in.gep
494 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
495 store i32 %x, ptr addrspace(1) %out.gep
502 %struct.foo = type { [3 x float], [3 x float] }
504 ; OPT-LABEL: @sink_ds_address(
505 ; OPT: getelementptr inbounds i8,
507 ; GCN-LABEL: {{^}}sink_ds_address:
508 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
509 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
510 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
; Test kernel. %x and %y address elements 0 and 2 of field 1 of a %struct.foo
; in addrspace(3), which are byte offsets 12 and 20 with 4-byte floats. The
; two floats are loaded and compared with "fcmp one", and the result selects
; between %bb34 and %bb33.
511 define amdgpu_kernel void @sink_ds_address(ptr addrspace(3) nocapture %ptr) nounwind {
513 %x = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 0
514 %y = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 2
518 %a = load float, ptr addrspace(3) %x, align 4
519 %b = load float, ptr addrspace(3) %y, align 4
520 %cmp = fcmp one float %a, %b
521 br i1 %cmp, label %bb34, label %bb33
530 ; Address offset is not a multiple of 4. This is a valid mubuf offset,
533 ; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
536 ; OPT: getelementptr i8, {{.*}} 4095
; Test kernel. %in.gep is a byte GEP at constant offset 4095 from %in in
; addrspace(4), so the address is not a multiple of 4 relative to %in. An i32
; is loaded through it with "align 1", merged with 0 by the phi and stored
; through %out.gep.
537 define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(ptr addrspace(1) %out, ptr addrspace(4) %in) {
539 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
540 %in.gep = getelementptr i8, ptr addrspace(4) %in, i64 4095
541 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
542 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
543 br i1 %tmp0, label %endif, label %if
546 %tmp1 = load i32, ptr addrspace(4) %in.gep, align 1
550 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
551 store i32 %x, ptr addrspace(1) %out.gep
558 ; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
559 ; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
560 ; OPT: %tmp1 = atomicrmw add ptr addrspace(3) %sunkaddr, i32 2 seq_cst
; Test kernel. %in.gep is %in plus 7 i32 elements (28 bytes) in addrspace(3)
; and is the pointer operand of a seq_cst "atomicrmw add" of 2. The value the
; atomic returns is merged with 0 by the phi and stored through %out.gep.
561 define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
563 %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
564 %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
565 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
566 %tmp0 = icmp eq i32 %tid, 0
; The atomic is skipped (branch straight to %endif) when %tid is 0.
567 br i1 %tmp0, label %endif, label %if
570 %tmp1 = atomicrmw add ptr addrspace(3) %in.gep, i32 2 seq_cst
574 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
575 store i32 %x, ptr addrspace(3) %out.gep
582 ; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
583 ; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
584 ; OPT: %tmp1.struct = cmpxchg ptr addrspace(3) %sunkaddr, i32 undef, i32 2 seq_cst monotonic
; Test kernel. %in.gep is %in plus 7 i32 elements (28 bytes) in addrspace(3)
; and is the pointer operand of the cmpxchg. Element 0 of the cmpxchg result
; is extracted, merged with 0 by the phi and stored through %out.gep.
585 define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
587 %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
588 %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
589 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
590 %tmp0 = icmp eq i32 %tid, 0
; The cmpxchg is skipped (branch straight to %endif) when %tid is 0.
591 br i1 %tmp0, label %endif, label %if
594 %tmp1.struct = cmpxchg ptr addrspace(3) %in.gep, i32 undef, i32 2 seq_cst monotonic
595 %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
599 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
600 store i32 %x, ptr addrspace(3) %out.gep
607 ; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
608 ; OPT: %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
610 ; OPT: cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
; Test kernel. Here %in.gep is the second (compare-value) operand of the
; cmpxchg, not the address being accessed; the pointer operand is undef. The
; checks for this kernel expect the original %in.gep GEP to remain unchanged.
611 define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
613 %out.gep = getelementptr ptr addrspace(3), ptr addrspace(3) %out, i32 999999
614 %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
615 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
616 %tmp0 = icmp eq i32 %tid, 0
; The cmpxchg is skipped (branch straight to %endif) when %tid is 0.
617 br i1 %tmp0, label %endif, label %if
620 %tmp1.struct = cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
621 %tmp1 = extractvalue { ptr addrspace(3), i1 } %tmp1.struct, 0
625 %x = phi ptr addrspace(3) [ %tmp1, %if ], [ null, %entry ]
626 store ptr addrspace(3) %x, ptr addrspace(3) %out.gep
633 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
634 ; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
636 ; OPT-SICIVI: %tmp1 = load i8, ptr addrspace(1) %in.gep
639 ; OPT-GFX9: %sunkaddr = getelementptr i8, ptr addrspace(1) %in, i64 -4096
640 ; OPT-GFX9: load i8, ptr addrspace(1) %sunkaddr
642 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
643 ; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
644 ; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
; Test kernel. %in.gep is a byte GEP at constant offset -4096 from %in in
; addrspace(1). The i8 loaded through it is sign-extended, merged with 0 by
; the phi and stored through %out.gep.
645 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
647 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
648 %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
649 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
650 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
651 br i1 %tmp0, label %endif, label %if
654 %tmp1 = load i8, ptr addrspace(1) %in.gep
655 %tmp2 = sext i8 %tmp1 to i32
659 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
660 store i32 %x, ptr addrspace(1) %out.gep
667 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
668 ; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
670 ; OPT: load i8, ptr addrspace(1) %in.gep
672 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
; Test kernel. %in.gep is a byte GEP at constant offset -4097 from %in in
; addrspace(1); the checks for this kernel expect the load to keep using the
; original %in.gep. The i8 loaded through it is sign-extended, merged with 0
; by the phi and stored through %out.gep.
673 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
675 %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
676 %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
677 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
678 %tmp0 = icmp eq i32 %tid, 0
; The load is skipped (branch straight to %endif) when %tid is 0.
679 br i1 %tmp0, label %endif, label %if
682 %tmp1 = load i8, ptr addrspace(1) %in.gep
683 %tmp2 = sext i8 %tmp1 to i32
687 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
688 store i32 %x, ptr addrspace(1) %out.gep
695 ; OPT-LABEL: @test_sink_small_offset_ds_append(
696 ; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
697 ; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %sunkaddr, i1 false)
; Test kernel. %in.gep is %in plus 7 i32 elements (28 bytes) in addrspace(3)
; and is passed as the pointer argument of llvm.amdgcn.ds.append.p3. The
; intrinsic's i32 result is merged with 0 by the phi and stored through
; %out.gep.
698 define amdgpu_kernel void @test_sink_small_offset_ds_append(ptr addrspace(3) %out, ptr addrspace(3) %in) {
700 %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
701 %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
702 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
703 %tmp0 = icmp eq i32 %tid, 0
; The intrinsic call is skipped (branch straight to %endif) when %tid is 0.
704 br i1 %tmp0, label %endif, label %if
707 %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %in.gep, i1 false)
711 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
712 store i32 %x, ptr addrspace(3) %out.gep
719 ; OPT-LABEL: @test_sink_small_offset_ds_consume(
720 ; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
721 ; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %sunkaddr, i1 false)
; Test kernel. %in.gep is %in plus 7 i32 elements (28 bytes) in addrspace(3)
; and is passed as the pointer argument of llvm.amdgcn.ds.consume.p3. The
; intrinsic's i32 result is merged with 0 by the phi and stored through
; %out.gep.
722 define amdgpu_kernel void @test_sink_small_offset_ds_consume(ptr addrspace(3) %out, ptr addrspace(3) %in) {
724 %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
725 %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
726 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
727 %tmp0 = icmp eq i32 %tid, 0
; The intrinsic call is skipped (branch straight to %endif) when %tid is 0.
728 br i1 %tmp0, label %endif, label %if
731 %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %in.gep, i1 false)
735 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
736 store i32 %x, ptr addrspace(3) %out.gep
; Intrinsic declarations used by the kernels above. mbcnt.lo supplies the
; %tid value each kernel compares against 0; ds.append / ds.consume are the
; addrspace(3) intrinsics exercised by the last two kernels.
743 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
744 declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3
745 declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3
; Attribute groups. #0 is attached to mbcnt.lo and its call sites, #3 to the
; ds.append / ds.consume declarations.
; NOTE(review): #1 and #2 are not referenced by any line visible in this
; excerpt; confirm against the full file before removing them.
747 attributes #0 = { nounwind readnone }
748 attributes #1 = { nounwind }
749 attributes #2 = { nounwind argmemonly }
750 attributes #3 = { argmemonly convergent nounwind willreturn }