1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
3 ; Check that WQM is not triggered by the softwqm intrinsic alone.
5 ;CHECK-LABEL: {{^}}test1:
6 ;CHECK-NOT: s_wqm_b64 exec, exec
7 ;CHECK: buffer_load_dword
8 ;CHECK: buffer_load_dword
10 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
12 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
13 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
14 %out = fadd float %src0, %src1
15 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
19 ; Check that the softwqm intrinsic works correctly for integers.
21 ;CHECK-LABEL: {{^}}test2:
22 ;CHECK-NOT: s_wqm_b64 exec, exec
23 ;CHECK: buffer_load_dword
24 ;CHECK: buffer_load_dword
26 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
28 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
29 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
30 %out = fadd float %src0, %src1
31 %out.0 = bitcast float %out to i32
32 %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
33 %out.2 = bitcast i32 %out.1 to float
37 ; Make sure the transition from Exact to softwqm does not trigger WQM.
39 ;CHECK-LABEL: {{^}}test_softwqm1:
40 ;CHECK-NOT: s_wqm_b64 exec, exec
41 ;CHECK: buffer_load_dword
42 ;CHECK: buffer_load_dword
43 ;CHECK: buffer_store_dword
44 ;CHECK-NOT: s_wqm_b64 exec, exec
46 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
48 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
49 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
50 %temp = fadd float %src0, %src1
51 call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
52 %out = fadd float %temp, %temp
53 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
57 ; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
59 ;CHECK-LABEL: {{^}}test_softwqm2:
60 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
61 ;CHECK: s_wqm_b64 exec, exec
62 ;CHECK: buffer_load_dword
63 ;CHECK: buffer_load_dword
64 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
65 ;CHECK: buffer_store_dword
66 ;CHECK: s_wqm_b64 exec, exec
68 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
70 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
71 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
72 %temp = fadd float %src0, %src1
73 %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
74 call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
75 %out = fadd float %temp, %temp
76 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
80 ; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
82 ;CHECK-LABEL: {{^}}test_wwm1:
83 ;CHECK: buffer_load_dword
84 ;CHECK: buffer_store_dword
85 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
86 ;CHECK: buffer_load_dword
88 ;CHECK: s_mov_b64 exec, [[ORIG]]
90 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
92 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
93 call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
94 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
95 %temp = fadd float %src0, %src1
96 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
97 %out = fadd float %temp.0, %temp.0
98 %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
102 ; Check that softwqm on one side of a branch does not trigger WQM for the shader.
104 ;CHECK-LABEL: {{^}}test_control_flow_0:
105 ;CHECK-NEXT: ; %main_body
106 ;CHECK-NOT: s_wqm_b64 exec, exec
112 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
114 %cmp = icmp eq i32 %z, 0
115 br i1 %cmp, label %IF, label %ELSE
118 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
119 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
120 %out = fadd float %src0, %src1
121 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
125 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
129 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
133 ; Check that softwqm on one side of a branch is treated as WQM in a WQM shader.
135 ;CHECK-LABEL: {{^}}test_control_flow_1:
136 ;CHECK-NEXT: ; %main_body
137 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
138 ;CHECK-NEXT: s_wqm_b64 exec, exec
140 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
142 ;CHECK: s_mov_b64 exec, [[SAVED]]
144 ;CHECK-NOT: s_and_saveexec_b64
145 ;CHECK-NOT: s_and_b64 exec
148 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
150 %c.bc = bitcast i32 %c to float
151 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
152 %tex0 = extractelement <4 x float> %tex, i32 0
153 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
154 %data.sample = extractelement <4 x float> %dtex, i32 0
156 %cmp = icmp eq i32 %z, 0
157 br i1 %cmp, label %IF, label %ELSE
160 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
161 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
162 %out = fadd float %src0, %src1
163 %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
167 call void @llvm.amdgcn.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
171 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
175 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
176 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
177 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
178 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
179 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
180 declare void @llvm.amdgcn.kill(i1) #1
181 declare float @llvm.amdgcn.wqm.f32(float) #3
182 declare float @llvm.amdgcn.softwqm.f32(float) #3
183 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
184 declare float @llvm.amdgcn.wwm.f32(float) #3
186 attributes #1 = { nounwind }
187 attributes #2 = { nounwind readonly }
188 attributes #3 = { nounwind readnone }