1 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement < %s | FileCheck -check-prefix=SI %s
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; test_if: a switch on the workitem id plus a conditional branch on
; (%tid == 2). The stores visible in this function write the constants
; 13, 17, 19 and 21 to %dst indexed by %b. The checks verify the scalar
; exec-mask save/restore sequence generated for this control flow.
5 ; SI-LABEL: {{^}}test_if:
6 ; Make sure the i1 values created by the cfg structurizer pass are
7 ; moved using VALU instructions
10 ; waitcnt should be inserted after exec modification
11 ; SI: v_cmp_lt_i32_e32 vcc, 1,
12 ; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
13 ; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
14 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
15 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
16 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
17 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
19 ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; The register pair below is matched with the same multi-digit pattern used
; by the other checks, so the test does not depend on the allocator picking
; single-digit scalar registers.
20 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, -1
21 ; SI: s_and_saveexec_b64
22 ; SI-NEXT: ; mask branch
24 ; v_mov should be after exec modification
26 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
27 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
28 ; SI-NEXT: ; mask branch
30 define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
32 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
33 switch i32 %tid, label %default [
39 %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
40 store i32 13, i32 addrspace(1)* %arrayidx1, align 4
44 %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
45 store i32 17, i32 addrspace(1)* %arrayidx5, align 4
49 %cmp8 = icmp eq i32 %tid, 2 ; selects between the %if and %else successors
50 %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b ; shared address for the stores of 19 and 21
51 br i1 %cmp8, label %if, label %else
54 store i32 19, i32 addrspace(1)* %arrayidx10, align 4
58 store i32 21, i32 addrspace(1)* %arrayidx10, align 4
; simple_test_v_if: branch on a condition computed from the workitem id
; (%tid != 0). The checks expect the compare result in vcc to be applied to
; exec with s_and_saveexec_b64, followed by an s_cbranch_execz to the exit
; label that skips the block containing the store.
65 ; SI-LABEL: {{^}}simple_test_v_if:
66 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
67 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
68 ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
69 ; SI-NEXT: s_cbranch_execz [[EXIT]]
71 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
72 ; SI: buffer_store_dword
74 ; SI-NEXT: {{^}}[[EXIT]]:
76 define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
77 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
78 %is.0 = icmp ne i32 %tid, 0 ; false only for the workitem whose id is 0
79 br i1 %is.0, label %then, label %exit
82 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid ; address of %dst[%tid]
83 store i32 999, i32 addrspace(1)* %gep
; simple_test_v_if_ret_else_ret: same branch on (%tid != 0) and same store of
; 999 to %dst[%tid] as visible in simple_test_v_if; the checked instruction
; sequence is identical. The function name suggests both successors return
; separately; the return instructions are not visible in this excerpt.
90 ; FIXME: It would be better to endpgm in the then block.
92 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
93 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
94 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
95 ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
96 ; SI-NEXT: s_cbranch_execz [[EXIT]]
98 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
99 ; SI: buffer_store_dword
101 ; SI-NEXT: {{^}}[[EXIT]]:
103 define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
104 %tid = call i32 @llvm.amdgcn.workitem.id.x()
105 %is.0 = icmp ne i32 %tid, 0 ; false only for the workitem whose id is 0
106 br i1 %is.0, label %then, label %exit
109 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid ; address of %dst[%tid]
110 store i32 999, i32 addrspace(1)* %gep
117 ; Final block has more than a ret to execute. This was miscompiled
118 ; before function exit blocks were unified since the endpgm would
119 ; terminate the then wavefront before reaching the store.
; Expected layout, as checked below: the compare is emitted in inverted form
; (v_cmp_eq, while the IR uses icmp ne), the %exit block is placed first, a
; flow block flips exec with s_or_saveexec_b64 / s_xor_b64, then the %then
; block performs its buffer store before the unified return block.
121 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
122 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
123 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
124 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
125 ; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
127 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
130 ; SI-NEXT: {{^}}[[FLOW]]:
131 ; SI-NEXT: s_or_saveexec_b64
132 ; SI-NEXT: s_xor_b64 exec, exec
133 ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
134 ; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN]]
136 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
138 ; SI-NEXT: buffer_store_dword
140 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
142 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
143 %tid = call i32 @llvm.amdgcn.workitem.id.x()
144 %is.0 = icmp ne i32 %tid, 0 ; false only for the workitem whose id is 0
145 br i1 %is.0, label %then, label %exit
148 %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid ; address of %dst[%tid]
149 store i32 999, i32 addrspace(1)* %gep
153 store volatile i32 7, i32 addrspace(3)* undef ; presumably the extra work in the final block mentioned above - block label not visible here, confirm
; simple_test_v_loop: a loop entered only when (%tid != 0). The induction
; variable starts at %tid and the loop exits once the incremented value equals
; %tid + 64. Each iteration loads a dword and stores it to %dst[%i].
; The checks expect an exec-masked entry branch followed by a loop whose
; back-edge is a scalar s_cbranch_scc0.
157 ; SI-LABEL: {{^}}simple_test_v_loop:
158 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
159 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
160 ; SI-NEXT: ; mask branch
161 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
163 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
165 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
166 ; SI: buffer_load_dword
167 ; SI-DAG: buffer_store_dword
; The scalar register index is matched as one or more digits.
168 ; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
169 ; SI: s_cbranch_scc0 [[LABEL_LOOP]]
170 ; SI: [[LABEL_EXIT]]:
173 define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
175 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
176 %is.0 = icmp ne i32 %tid, 0 ; false only for the workitem whose id is 0
177 %limit = add i32 %tid, 64 ; loop exits when %i.inc reaches this value
178 br i1 %is.0, label %loop, label %exit
181 %i = phi i32 [%tid, %entry], [%i.inc, %loop]
182 %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
183 %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
184 %load = load i32, i32 addrspace(1)* %src ; NOTE(review): loads from %src, not %gep.src; %gep.src has no visible use - confirm this is intended
185 store i32 %load, i32 addrspace(1)* %gep.dst
186 %i.inc = add nsw i32 %i, 1
187 %cmp = icmp eq i32 %limit, %i.inc
188 br i1 %cmp, label %exit, label %loop
; multi_vcond_loop: a loop with two per-workitem exit conditions. The loop is
; entered only when the bound loaded from %arg3 is positive; each iteration
; loads one value from %arg1 and one from %arg2, leaves the loop if either is
; -1, otherwise stores their sum to %arg and continues while the counter is
; below the bound.
194 ; SI-LABEL: {{^}}multi_vcond_loop:
196 ; Load loop limit from buffer
197 ; Branch to exit if uniformly not taken
199 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
200 ; SI: v_cmp_lt_i32_e32 vcc
201 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
202 ; SI-NEXT: ; mask branch
203 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
205 ; Initialize inner condition to false
206 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
207 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
209 ; Clear exec bits for workitems that load -1s
210 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
211 ; SI: buffer_load_dword [[B:v[0-9]+]]
212 ; SI: buffer_load_dword [[A:v[0-9]+]]
213 ; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
214 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; Note: the capture names below say "OR", but the instruction matched is
; s_and_b64, combining the two not-equal-to--1 compare results.
215 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
216 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
217 ; SI: ; mask branch [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
219 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
220 ; SI: buffer_store_dword
; Flow block: restore exec, fold this iteration's exit mask into the
; accumulated condition state, and loop while any lane remains in exec.
222 ; SI: [[LABEL_FLOW]]:
223 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
224 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
225 ; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
226 ; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
227 ; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
228 ; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
229 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
231 ; SI: [[LABEL_EXIT]]:
232 ; SI-NOT: [[COND_STATE]]
235 define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
237 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
238 %tmp4 = sext i32 %tmp to i64
239 %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
240 %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 ; per-workitem loop bound, read from %arg3[workitem id]
241 %tmp7 = icmp sgt i32 %tmp6, 0 ; enter the loop only when the bound is positive
242 %tmp8 = sext i32 %tmp6 to i64
243 br i1 %tmp7, label %bb10, label %bb26
245 bb10: ; preds = %bb, %bb20
246 %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
247 %tmp12 = add nsw i64 %tmp11, %tmp4 ; element index = loop counter + workitem id
248 %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
249 %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
250 %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
251 %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
252 %tmp17 = icmp ne i32 %tmp14, -1
253 %tmp18 = icmp ne i32 %tmp16, -1
254 %tmp19 = and i1 %tmp17, %tmp18 ; true only when neither loaded value is -1
255 br i1 %tmp19, label %bb20, label %bb26
257 bb20: ; preds = %bb10
258 %tmp21 = add nsw i32 %tmp16, %tmp14
259 %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
260 store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
261 %tmp23 = add nuw nsw i64 %tmp11, 1
262 %tmp24 = icmp slt i64 %tmp23, %tmp8 ; keep looping while the counter is below the bound
263 br i1 %tmp24, label %bb10, label %bb26
265 bb26: ; preds = %bb10, %bb20, %bb
269 attributes #0 = { nounwind readnone } ; applied to the workitem id call in multi_vcond_loop
270 attributes #1 = { nounwind } ; applied to every kernel definition in this file