; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
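
; Tests for the llvm.amdgcn.init.exec and llvm.amdgcn.init.exec.from.input
; intrinsics, which set the initial EXEC mask of a wave. All RUN lines share
; the same GCN checks, covering both SelectionDAG and GlobalISel on wave64
; targets.
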
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

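; A constant partial mask is set with a single s_mov of the immediate
; (123456 == 0x1e240).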
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

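; init.exec.from.input takes a live-lane count packed into an SGPR argument:
; the lowering extracts a 7-bit field at the requested bit offset
; (s_bfe_u32 with 0x70008 = width 7, offset 8), builds the lane mask with
; s_bfm_b64, and handles the full-wave case by comparing the count against 64
; and conditionally setting exec to all ones.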
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

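; Same lowering, with the count taken from the first SGPR argument at bit
; offset 19 (0x70013 = width 7, offset 19).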
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

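; When %count is also used by later IR, the extracted count must not clobber
; the input register: the bitfield goes into s1 while the add still reads s0.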
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

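; Same as reuse_input, but with the intrinsic call placed after the use of
; %count in the IR; the expected code is identical.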
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

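; Only the label is checked here: this verifies that init.exec followed by
; unreachable compiles cleanly.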
; GCN-LABEL: {{^}}init_unreachable:
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

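; The exec write must come before any code materialized for the stack objects
; in the entry block, so s_mov_b64 exec, -1 is expected ahead of the frame
; setup for the two allocas.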
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN: s_mov_b64 exec, -1
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

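; Same as above, but using init.exec.from.input; the bfe/bfm/cmov sequence is
; still expected ahead of the frame setup.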
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

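; Here the intrinsic is called in a non-entry block (%endif), reached after a
; conditional branch, rather than in the entry block.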
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
; GCN-NEXT: s_cmov_b64 exec, -1
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; ideally these allocas would be in %endif, but this causes problems on Windows GlobalISel
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }