; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s

; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
target triple = "amdgcn--"

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1
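
; For reference (not checked by any prefix above): per the LangRef, a call such as
;   %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
; may be selected either as a single fused multiply-add (v_mac_f32 / v_mad_f32 /
; v_fma_f32 / v_fmac_f32) or as the equivalent unfused pair
;   %mul = fmul float %a, %b
;   %r   = fadd float %mul, %c
; The *-STRICT/*-CONTRACT and *-FASTFMA/*-SLOWFMA prefixes in the checks below
; distinguish which form each run configuration is expected to pick; the %a/%b/%c
; names here are illustrative only.
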
; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                       float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }