1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
; Baseline case: a plain (non-volatile, non-nontemporal) i32 load from %in
; tagged with !amdgpu.last.use, whose result is stored to %out.
; The expected output has the flat load carrying th:TH_LOAD_LU.
5 define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
6 ; GFX12-LABEL: flat_last_use_load_0:
7 ; GFX12: ; %bb.0: ; %entry
8 ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
9 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
10 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
11 ; GFX12-NEXT: s_wait_kmcnt 0x0
12 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
13 ; GFX12-NEXT: v_mov_b32_e32 v1, s3
14 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
15 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
16 ; GFX12-NEXT: v_mov_b32_e32 v1, s1
17 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
18 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
19 ; GFX12-NEXT: s_endpgm
21 %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
22 store i32 %val, ptr %out
; Same last-use load as the baseline case, but the address is computed with
; a getelementptr on %in indexed by the result of llvm.amdgcn.workitem.id.x.
; The expected output still has the flat load carrying th:TH_LOAD_LU after
; the 64-bit address add (v_add_co_u32 / v_add_co_ci_u32_e64).
26 define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
27 ; GFX12-LABEL: flat_last_use_load_1:
28 ; GFX12: ; %bb.0: ; %entry
29 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
30 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
31 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff
32 ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2
33 ; GFX12-NEXT: s_mov_b32 s2, 2
34 ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
35 ; GFX12-NEXT: s_mov_b32 s2, 0
36 ; GFX12-NEXT: ; implicit-def: $sgpr2
37 ; GFX12-NEXT: v_mov_b32_e32 v0, 0
38 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
39 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
40 ; GFX12-NEXT: s_wait_kmcnt 0x0
41 ; GFX12-NEXT: s_mov_b32 s3, s4
42 ; GFX12-NEXT: v_mov_b32_e32 v0, v1
43 ; GFX12-NEXT: s_mov_b32 s2, s5
44 ; GFX12-NEXT: v_mov_b32_e32 v1, v2
45 ; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0
46 ; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
47 ; GFX12-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
48 ; GFX12-NEXT: v_mov_b32_e32 v1, v2
49 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
50 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
51 ; GFX12-NEXT: v_mov_b32_e32 v1, s1
52 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
53 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
54 ; GFX12-NEXT: s_endpgm
56 %tid = call i32 @llvm.amdgcn.workitem.id.x()
57 %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
58 %val = load i32, ptr %val.gep, align 4, !amdgpu.last.use !{}
59 store i32 %val, ptr %out
; Interaction case: the load is both volatile and tagged !amdgpu.last.use.
; The expected output does NOT use th:TH_LOAD_LU here; the flat load is
; emitted with th:TH_LOAD_BYPASS scope:SCOPE_SYS and is followed by separate
; s_wait_bvhcnt / s_wait_samplecnt / s_wait_loadcnt instructions, with
; s_wait_dscnt issued just before the store.
63 define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
64 ; GFX12-LABEL: flat_last_use_and_volatile_load:
65 ; GFX12: ; %bb.0: ; %entry
66 ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
67 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
68 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
69 ; GFX12-NEXT: s_wait_kmcnt 0x0
70 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
71 ; GFX12-NEXT: v_mov_b32_e32 v1, s3
72 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
73 ; GFX12-NEXT: s_wait_bvhcnt 0x0
74 ; GFX12-NEXT: s_wait_samplecnt 0x0
75 ; GFX12-NEXT: s_wait_loadcnt 0x0
76 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
77 ; GFX12-NEXT: v_mov_b32_e32 v1, s1
78 ; GFX12-NEXT: s_wait_dscnt 0x0
79 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
80 ; GFX12-NEXT: s_endpgm
82 %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
83 store i32 %val, ptr %out
; Interaction case: the load carries both !amdgpu.last.use and !nontemporal.
; The expected output is the same instruction sequence as the plain last-use
; case: the flat load is emitted with th:TH_LOAD_LU.
; NOTE(review): the !0 metadata node referenced by !nontemporal is not
; visible in this excerpt - confirm it is defined elsewhere in the file.
87 define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
88 ; GFX12-LABEL: flat_last_use_and_nontemporal_load:
89 ; GFX12: ; %bb.0: ; %entry
90 ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
91 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
92 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
93 ; GFX12-NEXT: s_wait_kmcnt 0x0
94 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
95 ; GFX12-NEXT: v_mov_b32_e32 v1, s3
96 ; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
97 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
98 ; GFX12-NEXT: v_mov_b32_e32 v1, s1
99 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
100 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
101 ; GFX12-NEXT: s_endpgm
103 %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
104 store i32 %val, ptr %out
; Declaration of the intrinsic called by @flat_last_use_load_1 to produce the
; i32 index used in its getelementptr.
109 declare i32 @llvm.amdgcn.workitem.id.x()