1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
4 ; FIXME: This leaves behind a now-unnecessary AND with exec
6 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
7 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
8 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
9 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
10 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
11 ; GCN: buffer_store_dword [[RESULT]]
; Loads a float from %in and compares it (ordered-equal) against 1.0.
; When the compare is true control goes to %if, otherwise straight to %endif.
; The phi merges the untouched %v (from %entry) with %v + %v (from %if), and
; the merged value is stored to %out.
12 define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
14 %v = load float, float addrspace(1)* %in
15 %cc = fcmp oeq float %v, 1.000000e+00
16 br i1 %cc, label %if, label %endif
; Value produced only on the conditional path.
19 %u = fadd float %v, %v
23 %r = phi float [ %v, %entry ], [ %u, %if ]
24 store float %r, float addrspace(1)* %out
28 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
29 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
30 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
31 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
32 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
33 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
34 ; GCN: buffer_store_dword [[RESULT]]
; Two-sided variant: the branch on (%v == 1.0, ordered) targets either %if or
; %else. One side computes %v + %v and the other %v * %v; the phi selects
; between the two results and the selected value is stored to %out.
35 define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
37 %v = load float, float addrspace(1)* %in
38 %cc = fcmp oeq float %v, 1.000000e+00
39 br i1 %cc, label %if, label %else
; Result contributed by %if.
42 %u0 = fadd float %v, %v
; Result contributed by %else.
46 %u1 = fmul float %v, %v
50 %r = phi float [ %u0, %if ], [ %u1, %else ]
51 store float %r, float addrspace(1)* %out
55 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
57 ; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
58 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
59 ; GCN: s_mov_b64 vcc, [[CMP]]
60 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; Integer data selected by a float condition: %v is an i32 loaded from %in,
; while the branch condition compares the kernel argument %k against 1.0.
; The inline asm is declared to clobber vcc.
61 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
63 %v = load i32, i32 addrspace(1)* %in
64 %cc = fcmp oeq float %k, 1.000000e+00
65 br i1 %cc, label %if, label %endif
; No operands; the only constraint is the "~{vcc}" clobber.
68 call void asm "; clobber $0", "~{vcc}"() #0
73 %r = phi i32 [ %v, %entry ], [ %u, %if ]
74 store i32 %r, i32 addrspace(1)* %out
78 ; Longest chain of cheap instructions to convert
79 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
89 ; GCN: v_cndmask_b32_e32
; Conditional path made of nine dependent fmul instructions (%u.0 .. %u.8);
; each one multiplies %v by the previous result. The phi merges the original
; %v (from %entry) with the end of the chain, %u.8 (from %if).
90 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
92 %v = load float, float addrspace(1)* %in
93 %cc = fcmp oeq float %v, 1.000000e+00
94 br i1 %cc, label %if, label %endif
; Start of the nine-instruction multiply chain.
97 %u.0 = fmul float %v, %v
98 %u.1 = fmul float %v, %u.0
99 %u.2 = fmul float %v, %u.1
100 %u.3 = fmul float %v, %u.2
101 %u.4 = fmul float %v, %u.3
102 %u.5 = fmul float %v, %u.4
103 %u.6 = fmul float %v, %u.5
104 %u.7 = fmul float %v, %u.6
105 %u.8 = fmul float %v, %u.7
109 %r = phi float [ %v, %entry ], [ %u.8, %if ]
110 store float %r, float addrspace(1)* %out
114 ; Shortest chain of cheap instructions that is not converted
115 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
116 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
130 ; GCN: buffer_store_dword
; Same shape as test_vccnz_ifcvt_triangle_max_cheap but with one more
; multiply: the conditional path is ten dependent fmul instructions
; (%u.0 .. %u.9), and the phi merges %v with %u.9.
131 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
133 %v = load float, float addrspace(1)* %in
134 %cc = fcmp oeq float %v, 1.000000e+00
135 br i1 %cc, label %if, label %endif
; Start of the ten-instruction multiply chain.
138 %u.0 = fmul float %v, %v
139 %u.1 = fmul float %v, %u.0
140 %u.2 = fmul float %v, %u.1
141 %u.3 = fmul float %v, %u.2
142 %u.4 = fmul float %v, %u.3
143 %u.5 = fmul float %v, %u.4
144 %u.6 = fmul float %v, %u.5
145 %u.7 = fmul float %v, %u.6
146 %u.8 = fmul float %v, %u.7
147 %u.9 = fmul float %v, %u.8
151 %r = phi float [ %v, %entry ], [ %u.9, %if ]
152 store float %r, float addrspace(1)* %out
156 ; Should still branch over fdiv expansion
157 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
158 ; GCN: v_cmp_neq_f32_e32
159 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
161 ; GCN: v_div_scale_f32
164 ; GCN: buffer_store_dword
; The conditional path is a single fdiv of %v by itself. The phi merges the
; loaded %v (from %entry) with the quotient %u (from %if) before the store.
165 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
167 %v = load float, float addrspace(1)* %in
168 %cc = fcmp oeq float %v, 1.000000e+00
169 br i1 %cc, label %if, label %endif
; The only operation on the conditional path.
172 %u = fdiv float %v, %v
176 %r = phi float [ %v, %entry ], [ %u, %if ]
177 store float %r, float addrspace(1)* %out
181 ; vcc branch with SGPR inputs
182 ; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
183 ; GCN: v_cmp_neq_f32_e64
184 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
189 ; GCN: buffer_store_dword
; The data value %v is an i32 loaded through an addrspace(4) pointer, while
; the branch condition is a float compare of the kernel argument %cnd
; against 1.0. The phi merges %v (from %entry) with %u (from %if) and the
; result is stored to the addrspace(1) pointer %out.
190 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
192 %v = load i32, i32 addrspace(4)* %in
193 %cc = fcmp oeq float %cnd, 1.000000e+00
194 br i1 %cc, label %if, label %endif
201 %r = phi i32 [ %v, %entry ], [ %u, %if ]
202 store i32 %r, i32 addrspace(1)* %out
207 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; Like test_vccnz_ifcvt_triangle, except that %v is loaded through an
; addrspace(4) pointer instead of an addrspace(1) pointer. The loaded value
; feeds both the compare against 1.0 and the conditional %v + %v.
209 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
211 %v = load float, float addrspace(4)* %in
212 %cc = fcmp oeq float %v, 1.000000e+00
213 br i1 %cc, label %if, label %endif
216 %u = fadd float %v, %v
220 %r = phi float [ %v, %entry ], [ %u, %if ]
221 store float %r, float addrspace(1)* %out
225 ; Due to a broken cost heuristic, this is not if-converted like
226 ; test_vccnz_ifcvt_triangle_constant_load, even though it should be.
228 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; Here %v is not loaded in the body: it is the float kernel argument itself.
; It is compared against 1.0, conditionally doubled with fadd, merged by the
; phi and stored to %out.
230 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
232 %cc = fcmp oeq float %v, 1.000000e+00
233 br i1 %cc, label %if, label %endif
236 %u = fadd float %v, %v
240 %r = phi float [ %v, %entry ], [ %u, %if ]
241 store float %r, float addrspace(1)* %out
245 ; Scalar branch and scalar inputs
246 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
247 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
248 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
249 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
250 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
; Integer condition: branches on (%cond == 1). %v is an i32 loaded through an
; addrspace(4) pointer, and the phi merges it (from %entry) with %u (from %if).
; There is no store; the merged value is consumed by inline asm instead.
251 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
253 %v = load i32, i32 addrspace(4)* %in
254 %cc = icmp eq i32 %cond, 1
255 br i1 %cc, label %if, label %endif
262 %r = phi i32 [ %v, %entry ], [ %u, %if ]
; Side-effecting asm that takes %r through an "s" input constraint.
263 call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
267 ; FIXME: Should be able to use VALU compare and select
268 ; Scalar branch but VGPR select operands
269 ; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
271 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
276 ; GCN: buffer_store_dword
; Integer condition with float data: the branch tests the kernel argument
; (%cond == 1), while the merged values are the float %v loaded from the
; addrspace(1) pointer %in and its conditional double %v + %v.
277 define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
279 %v = load float, float addrspace(1)* %in
280 %cc = icmp eq i32 %cond, 1
281 br i1 %cc, label %if, label %endif
284 %u = fadd float %v, %v
288 %r = phi float [ %v, %entry ], [ %u, %if ]
289 store float %r, float addrspace(1)* %out
293 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
296 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
297 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; 64-bit version of test_scc1_sgpr_ifcvt_triangle: %v is an i64 loaded through
; an addrspace(4) pointer, the branch tests (%cond == 1), and the phi result is
; passed to inline asm rather than stored.
298 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
300 %v = load i64, i64 addrspace(4)* %in
301 %cc = icmp eq i32 %cond, 1
302 br i1 %cc, label %if, label %endif
309 %r = phi i64 [ %v, %entry ], [ %u, %if ]
; Side-effecting asm that takes the i64 through an "s" input constraint.
310 call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
314 ; TODO: Can do s_cselect_b64; s_cselect_b32
315 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
319 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
320 ; GCN-NEXT: s_cselect_b32 s
321 ; GCN-NEXT: s_cselect_b32 s
322 ; GCN-NEXT: s_cselect_b32 s
; Three-element vector version: %v is a <3 x i32> loaded through an
; addrspace(4) pointer and conditionally doubled with a vector add when
; (%cond == 1). The phi result is widened to <4 x i32> before being handed to
; inline asm.
323 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
325 %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
326 %cc = icmp eq i32 %cond, 1
327 br i1 %cc, label %if, label %endif
330 %u = add <3 x i32> %v, %v
334 %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
; Mask indices 0-2 copy the lanes of %r; index 3 selects from the undef
; second operand, so the fourth lane is undefined.
335 %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
336 call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
340 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
345 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
346 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
347 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; Four-element vector version: %v is a <4 x i32> loaded through an
; addrspace(4) pointer and conditionally doubled with a vector add when
; (%cond == 1). The phi result is passed directly to inline asm.
348 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
350 %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
351 %cc = icmp eq i32 %cond, 1
352 br i1 %cc, label %if, label %endif
355 %u = add <4 x i32> %v, %v
359 %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
; Side-effecting asm that takes the whole vector through an "s" constraint.
360 call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
364 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
365 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
366 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
; Branches on (%cond == 0) with the targets in swapped order: the true edge
; goes to %else and the false edge to %if. The phi merges two constants
; (0 from %if, 1 from %else) and the result is stored to %out.
; Unlike the kernels above, this definition carries no #0 attribute group.
367 define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
369 %cmp0 = icmp eq i32 %cond, 0
370 br i1 %cmp0, label %else, label %if
379 %value = phi i32 [0, %if], [1, %else]
380 store i32 %value, i32 addrspace(1)* %out
384 ; GCN-LABEL: {{^}}ifcvt_undef_scc:
386 ; GCN-NEXT: s_load_dwordx2
387 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
; Same constant-merging phi as uniform_if_swap_br_targets_scc_constant_select,
; but the branch condition is the literal undef, so the visible lines never
; read the %cond argument.
388 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
390 br i1 undef, label %else, label %if
399 %value = phi i32 [0, %if], [1, %else]
400 store i32 %value, i32 addrspace(1)* %out
404 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
406 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
412 ; GCN: buffer_store_dword
; Eight-element vector version with a float condition: %v is an <8 x i32>
; loaded from the addrspace(1) pointer %in, the branch compares the kernel
; argument %cnd against 1.0, and the conditional path doubles %v with a
; vector add. The phi result is stored to %out.
413 define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
415 %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
416 %cc = fcmp oeq float %cnd, 1.000000e+00
417 br i1 %cc, label %if, label %endif
420 %u = add <8 x i32> %v, %v
424 %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
425 store <8 x i32> %r, <8 x i32> addrspace(1)* %out
429 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
431 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
437 ; GCN: buffer_store_dword
; Sixteen-element vector version of test_vccnz_ifcvt_triangle256: %v is a
; <16 x i32> loaded from %in, the branch compares the kernel argument %cnd
; against 1.0, and the conditional path doubles %v with a vector add. The phi
; result is stored to %out.
438 define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
440 %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
441 %cc = fcmp oeq float %cnd, 1.000000e+00
442 br i1 %cc, label %if, label %endif
445 %u = add <16 x i32> %v, %v
449 %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
450 store <16 x i32> %r, <16 x i32> addrspace(1)* %out
; Attribute group #0, referenced by the kernel definitions and inline asm
; calls in this file that are tagged with #0; it contains only nounwind.
454 attributes #0 = { nounwind }