; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Load argument depends on waitcnt which should be skipped.
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    s_endpgm
  %vgpr = load volatile i32, i32 addrspace(3)* %ptr
  call void @func(i32 %vgpr)
  ret void
}
; Memory waitcnt with no register dependence on the call
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT:    s_endpgm
  store i32 0, i32 addrspace(1)* %ptr
  call void @func(i32 0)
  ret void
}
; Should not wait after the call before memory
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    v_mov_b32_e32 v40, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
; GCN-NEXT:    s_endpgm
  call void @func(i32 0)
  store i32 0, i32 addrspace(1)* %ptr
  ret void
}
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    v_mov_b32_e32 v40, 0
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    global_store_dword v40, v0, s[34:35]
; GCN-NEXT:    s_endpgm
  %rv = call i32 @func.return(i32 0)
  store i32 %rv, i32 addrspace(1)* %ptr
  ret void
}
; Need to wait for the address dependency
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT:    s_endpgm
  call void @got.func(i32 0)
  ret void
}
; Need to wait for the address dependency
define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: tailcall_got_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[4:5]
  tail call void @got.func(i32 0)
  ret void
}
; No need to wait for the load.
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: tail_call_memory_arg_load:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT:    s_setpc_b64 s[4:5]
  %vgpr = load volatile i32, i32 addrspace(3)* %ptr
  tail call void @func(i32 %vgpr)
  ret void
}
; Callees used by the tests above. @got.func has default (non-hidden)
; visibility, so calls to it go through the GOT in the checks above.
declare hidden void @func(i32) #0
declare hidden i32 @func.return(i32) #0
declare void @got.func(i32) #0

attributes #0 = { nounwind }