1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
4 ; Load argument depends on waitcnt which should be skipped.
; NOTE(review): the GCN-NEXT lines below are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them with that script rather than hand-editing.
; This kernel loads %ptr from the kernarg segment, performs a volatile LDS
; read (ds_read_b32), and passes the loaded value to @func. The checks show
; the s_waitcnt lgkmcnt(0) placed before the v_mov that consumes the kernarg
; load, with no extra wait inserted for the call itself.
5 define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
6 ; GCN-LABEL: call_memory_arg_load:
8 ; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
9 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
10 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
11 ; GCN-NEXT: s_add_u32 s0, s0, s11
12 ; GCN-NEXT: s_addc_u32 s1, s1, 0
13 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
14 ; GCN-NEXT: v_mov_b32_e32 v0, s6
15 ; GCN-NEXT: ds_read_b32 v0, v0
16 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
17 ; GCN-NEXT: s_mov_b32 s32, 0
18 ; GCN-NEXT: s_getpc_b64 s[8:9]
19 ; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
20 ; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
21 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
23 %vgpr = load volatile i32, ptr addrspace(3) %ptr
24 call void @func(i32 %vgpr)
28 ; Memory waitcnt with no register dependence on the call
; This kernel stores 0 to a global pointer and then calls @func(0). The
; checks show the global_store_dword issued before the call sequence; the
; only s_waitcnt present is the lgkmcnt(0) for the kernarg address load,
; i.e. no wait is inserted for the store on behalf of the call.
29 define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
30 ; GCN-LABEL: call_memory_no_dep:
32 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
33 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
34 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
35 ; GCN-NEXT: s_add_u32 s0, s0, s11
36 ; GCN-NEXT: v_mov_b32_e32 v0, 0
37 ; GCN-NEXT: s_addc_u32 s1, s1, 0
38 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
39 ; GCN-NEXT: global_store_dword v0, v0, s[6:7]
40 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
41 ; GCN-NEXT: v_mov_b32_e32 v0, 0
42 ; GCN-NEXT: s_mov_b32 s32, 0
43 ; GCN-NEXT: s_getpc_b64 s[8:9]
44 ; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
45 ; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
46 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
48 store i32 0, ptr addrspace(1) %ptr
49 call void @func(i32 0)
53 ; Should not wait after the call before memory
; Here the store follows the call. The checks show global_store_dword
; immediately after s_swappc_b64 with no s_waitcnt in between — the store
; address was preloaded into callee-preserved s[34:35] and the store data
; into v40 before the call, so nothing needs to be waited on afterwards.
54 define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
55 ; GCN-LABEL: call_no_wait_after_call:
57 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
58 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
59 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
60 ; GCN-NEXT: s_add_u32 s0, s0, s11
61 ; GCN-NEXT: s_addc_u32 s1, s1, 0
62 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
63 ; GCN-NEXT: v_mov_b32_e32 v0, 0
64 ; GCN-NEXT: s_mov_b32 s32, 0
65 ; GCN-NEXT: s_getpc_b64 s[8:9]
66 ; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
67 ; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
68 ; GCN-NEXT: v_mov_b32_e32 v40, 0
69 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
70 ; GCN-NEXT: global_store_dword v40, v40, s[34:35]
72 call void @func(i32 0)
73 store i32 0, ptr addrspace(1) %ptr
; Same shape as call_no_wait_after_call, but the stored value is the call's
; return value: the checks store v0 (the i32 returned by @func.return in the
; return register) rather than a preloaded constant, again with no s_waitcnt
; between s_swappc_b64 and the global_store_dword.
77 define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
78 ; GCN-LABEL: call_no_wait_after_call_return_val:
80 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
81 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
82 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
83 ; GCN-NEXT: s_add_u32 s0, s0, s11
84 ; GCN-NEXT: s_addc_u32 s1, s1, 0
85 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
86 ; GCN-NEXT: v_mov_b32_e32 v0, 0
87 ; GCN-NEXT: s_mov_b32 s32, 0
88 ; GCN-NEXT: s_getpc_b64 s[8:9]
89 ; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
90 ; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
91 ; GCN-NEXT: v_mov_b32_e32 v40, 0
92 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
93 ; GCN-NEXT: global_store_dword v40, v0, s[34:35]
95 %rv = call i32 @func.return(i32 0)
96 store i32 %rv, ptr addrspace(1) %ptr
100 ; Need to wait for the address dependency
; @got.func is declared without 'hidden' visibility, so its address is
; loaded indirectly through the GOT (gotpcrel32 relocations + s_load_dwordx2).
; The checks require an s_waitcnt lgkmcnt(0) before s_swappc_b64 because the
; branch target s[8:9] comes from that in-flight scalar load.
101 define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
102 ; GCN-LABEL: call_got_load:
104 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
105 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
106 ; GCN-NEXT: s_add_u32 s0, s0, s11
107 ; GCN-NEXT: s_addc_u32 s1, s1, 0
108 ; GCN-NEXT: s_getpc_b64 s[6:7]
109 ; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
110 ; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
111 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
112 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
113 ; GCN-NEXT: v_mov_b32_e32 v0, 0
114 ; GCN-NEXT: s_mov_b32 s32, 0
115 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
116 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
118 call void @got.func(i32 0)
122 ; Need to wait for the address dependency
; Tail-call variant of call_got_load: a non-kernel function tail-calling
; @got.func via s_setpc_b64. As above, the target address is loaded from the
; GOT, so the checks require s_waitcnt lgkmcnt(0) before the s_setpc_b64.
123 define void @tailcall_got_load(ptr addrspace(1) %ptr, i32) #0 {
124 ; GCN-LABEL: tailcall_got_load:
126 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GCN-NEXT: s_getpc_b64 s[4:5]
128 ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
129 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
130 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
131 ; GCN-NEXT: v_mov_b32_e32 v0, 0
132 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
133 ; GCN-NEXT: s_setpc_b64 s[4:5]
134 tail call void @got.func(i32 0)
138 ; No need to wait for the load.
; The volatile ds_read_b32 produces the outgoing argument in v0, and the
; checks show no s_waitcnt between the load and the s_setpc_b64 tail-call
; branch — presumably the callee's own entry waitcnt covers it (TODO confirm
; against the waitcnt-insertion pass's cross-call assumptions).
139 define void @tail_call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
140 ; GCN-LABEL: tail_call_memory_arg_load:
142 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GCN-NEXT: ds_read_b32 v0, v0
144 ; GCN-NEXT: s_getpc_b64 s[4:5]
145 ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
146 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
147 ; GCN-NEXT: s_setpc_b64 s[4:5]
148 %vgpr = load volatile i32, ptr addrspace(3) %ptr
149 tail call void @func(i32 %vgpr)
; External callees: 'hidden' visibility lets @func / @func.return be reached
; with direct rel32 relocations, while @got.func (default visibility) forces
; an indirect call through the GOT.
153 declare hidden void @func(i32) #0
154 declare hidden i32 @func.return(i32) #0
155 declare void @got.func(i32) #0
; Strip all implicit kernel arguments / system SGPR+VGPR inputs so the check
; lines stay minimal and stable.
157 attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
; Pin the AMDHSA code object ABI version (500 = code object v5).
159 !llvm.module.flags = !{!0}
160 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}