1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
9 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
12 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s
13 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
15 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
17 ; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
18 ; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
20 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
21 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s
23 ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
; Module-level setup: the AMDGPU target triple plus the intrinsics used by the
; kernels in this file (work-item id, f32 fmuladd, f32 fabs). The f16 fmuladd
; variant is declared but not called by any function visible here.
25 target triple = "amdgcn--"
28 declare i32 @llvm.amdgcn.workitem.id.x() #1
29 declare float @llvm.fmuladd.f32(float, float, float) #1
30 declare half @llvm.fmuladd.f16(half, half, half) #1
31 declare float @llvm.fabs.f32(float) #1
; fmuladd_f32: loads one float from each of %in1, %in2 and %in3, passes them to
; @llvm.fmuladd.f32 and stores the result to %out.
; Expected selection: v_mac_f32_e32 under GCN-FLUSH-MAD, v_fmac_f32_e32 under
; GCN-FLUSH-FMAC, v_fma_f32 under GCN-DENORM-FASTFMA, and a separate
; v_mul_f32_e32 + v_add_f32_e32 pair under GCN-DENORM-SLOWFMA.
33 ; GCN-LABEL: {{^}}fmuladd_f32:
34 ; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
35 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
37 ; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
39 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
40 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
41 define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
42 float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
43 %r0 = load float, float addrspace(1)* %in1
44 %r1 = load float, float addrspace(1)* %in2
45 %r2 = load float, float addrspace(1)* %in3
46 %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
47 store float %r3, float addrspace(1)* %out
; fmul_fadd_f32: computes %r0 * %r1 + %r2 as a plain fmul followed by a plain
; fadd (no fast-math flags) on three volatile loads, storing the sum to %out.
; The GCN-FLUSH runs expect a v_mac_f32. The GCN-DENORM-STRICT runs (all of
; them fp-contract=on) expect the multiply and add to stay separate. The
; *-CONTRACT prefixes are only enabled by the fp-contract=fast runs.
51 ; GCN-LABEL: {{^}}fmul_fadd_f32:
52 ; GCN-FLUSH: v_mac_f32
54 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
56 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
57 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
59 ; GCN-DENORM-STRICT: v_mul_f32_e32
60 ; GCN-DENORM-STRICT: v_add_f32_e32
61 define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
62 float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
63 %r0 = load volatile float, float addrspace(1)* %in1
64 %r1 = load volatile float, float addrspace(1)* %in2
65 %r2 = load volatile float, float addrspace(1)* %in3
66 %mul = fmul float %r0, %r1
67 %add = fadd float %mul, %r2
68 store float %add, float addrspace(1)* %out
; fmul_fadd_contract_f32: same multiply-then-add as fmul_fadd_f32, but both the
; fmul and the fadd carry the per-instruction 'contract' flag.
; Checked outcomes: v_fmac_f32_e32 under GCN-FLUSH-FMAC, v_fma_f32 under
; GCN-DENORM-FASTFMA, and separate mul + add under GCN-DENORM-SLOWFMA-CONTRACT.
; There is no check here for the GCN-FLUSH-MAD configurations.
72 ; GCN-LABEL: {{^}}fmul_fadd_contract_f32:
73 ; GCN-FLUSH-FMAC: v_fmac_f32_e32
75 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
76 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
78 ; GCN-DENORM-FASTFMA: v_fma_f32
79 define amdgpu_kernel void @fmul_fadd_contract_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
80 float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
81 %r0 = load volatile float, float addrspace(1)* %in1
82 %r1 = load volatile float, float addrspace(1)* %in2
83 %r2 = load volatile float, float addrspace(1)* %in3
84 %mul = fmul contract float %r0, %r1
85 %add = fadd contract float %mul, %r2
86 store float %add, float addrspace(1)* %out
; fmuladd_2.0_a_b_f32: fmuladd(2.0, a, b) with the constant as the first
; multiplicand. a and b are volatile loads from %out[tid] and %out[tid + 1];
; the result is stored back to %out[tid]. %in is not referenced by the
; instructions shown.
; Flush configurations expect the accumulate form writing into b's register;
; GCN-DENORM-FASTFMA expects v_fma_f32; GCN-DENORM-SLOWFMA expects a + a
; followed by an add of b.
; The label pattern is terminated with ':' like the other labels in this file
; so it cannot match a longer symbol sharing this prefix. The VI-FLUSH and
; VI-DENORM lines use the same {{global|flat}} store pattern as the sibling
; tests (no run line visible in this file enables those prefixes).
90 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
91 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
92 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
94 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
95 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
96 ; SI-FLUSH: buffer_store_dword [[R2]]
97 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
99 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
101 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
102 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
104 ; SI-DENORM: buffer_store_dword [[RESULT]]
105 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
106 define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
107 %tid = call i32 @llvm.amdgcn.workitem.id.x()
108 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
109 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
110 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
112 %r1 = load volatile float, float addrspace(1)* %gep.0
113 %r2 = load volatile float, float addrspace(1)* %gep.1
115 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
116 store float %r3, float addrspace(1)* %gep.out
; fmuladd_a_2.0_b_f32: fmuladd(a, 2.0, b), i.e. the same computation as
; fmuladd_2.0_a_b_f32 with the constant moved to the second multiplicand.
; a and b are volatile loads from %out[tid] and %out[tid + 1]; the result is
; stored to %out[tid]. %in is not referenced by the instructions shown.
; The expected instructions are identical to the constant-first variant.
; The label pattern is terminated with ':' like the other labels in this file
; so it cannot match a longer symbol sharing this prefix.
120 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
121 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
122 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
124 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
125 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
127 ; SI-FLUSH: buffer_store_dword [[R2]]
128 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
130 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
132 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
133 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
135 ; SI-DENORM: buffer_store_dword [[RESULT]]
136 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
137 define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
138 %tid = call i32 @llvm.amdgcn.workitem.id.x()
139 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
140 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
141 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
143 %r1 = load volatile float, float addrspace(1)* %gep.0
144 %r2 = load volatile float, float addrspace(1)* %gep.1
146 %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
147 store float %r3, float addrspace(1)* %gep.out
; fadd_a_a_b_f32: computes (a + a) + b with two plain fadds (no fast-math
; flags). a and b are volatile loads from %out[tid] and %out[tid + 1]; the sum
; is stored to %out[tid]. %in1 and %in2 are not referenced by the instructions
; shown.
; The GCN-FLUSH runs expect the a + a to be folded into a multiply by 2.0
; inside a v_mac_f32_e32. The GCN-DENORM-STRICT and GCN-DENORM-SLOWFMA-CONTRACT
; runs expect two separate adds.
151 ; GCN-LABEL: {{^}}fadd_a_a_b_f32:
152 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
153 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
155 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
157 ; SI-FLUSH: buffer_store_dword [[R2]]
158 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
160 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
162 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
163 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
165 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
166 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
168 ; SI-DENORM: buffer_store_dword [[RESULT]]
169 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
170 define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
171 float addrspace(1)* %in1,
172 float addrspace(1)* %in2) #0 {
173 %tid = call i32 @llvm.amdgcn.workitem.id.x()
174 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
175 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
176 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
178 %r0 = load volatile float, float addrspace(1)* %gep.0
179 %r1 = load volatile float, float addrspace(1)* %gep.1
181 %add.0 = fadd float %r0, %r0
182 %add.1 = fadd float %add.0, %r1
183 store float %add.1, float addrspace(1)* %gep.out
; fadd_b_a_a_f32: computes b + (a + a), i.e. fadd_a_a_b_f32 with the operands
; of the final fadd swapped. a and b are volatile loads from %out[tid] and
; %out[tid + 1]; the sum is stored to %out[tid]. %in1 and %in2 are not
; referenced by the instructions shown.
; The flush expectation is the same v_mac_f32_e32 as the unswapped variant. In
; the separate-add expectations the second add lists b's register first.
187 ; GCN-LABEL: {{^}}fadd_b_a_a_f32:
188 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
189 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
191 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
193 ; SI-FLUSH: buffer_store_dword [[R2]]
194 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
196 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
198 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
199 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
201 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
202 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
204 ; SI-DENORM: buffer_store_dword [[RESULT]]
205 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
206 define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
207 float addrspace(1)* %in1,
208 float addrspace(1)* %in2) #0 {
209 %tid = call i32 @llvm.amdgcn.workitem.id.x()
210 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
211 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
212 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
214 %r0 = load volatile float, float addrspace(1)* %gep.0
215 %r1 = load volatile float, float addrspace(1)* %gep.1
217 %add.0 = fadd float %r0, %r0
218 %add.1 = fadd float %r1, %add.0
219 store float %add.1, float addrspace(1)* %gep.out
; fmuladd_neg_2.0_a_b_f32: fmuladd(-2.0, a, b). a and b are volatile loads
; from %out[tid] and %out[tid + 1]; the result is stored to %out[tid]. %in is
; not referenced by the instructions shown.
; Flush configurations expect the accumulate form with a -2.0 constant;
; GCN-DENORM-FASTFMA expects v_fma_f32 with -2.0; GCN-DENORM-SLOWFMA expects
; a + a followed by a subtract from b.
; The label pattern is terminated with ':' like the other labels in this file
; so it cannot match a longer symbol sharing this prefix.
223 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
224 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
225 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
226 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
227 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
229 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
231 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
232 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
234 ; SI-DENORM: buffer_store_dword [[RESULT]]
235 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
236 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
237 %tid = call i32 @llvm.amdgcn.workitem.id.x()
238 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
239 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
240 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
242 %r1 = load volatile float, float addrspace(1)* %gep.0
243 %r2 = load volatile float, float addrspace(1)* %gep.1
245 %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
246 store float %r3, float addrspace(1)* %gep.out
; fmuladd_neg_2.0_neg_a_b_f32: fmuladd(-2.0, fneg(a), b). a and b are volatile
; loads from %out[tid] and %out[tid + 1]; the result is stored to %out[tid].
; %in is not referenced by the instructions shown.
; Every expectation uses a positive 2.0 (or a + a) and no negated operand,
; i.e. the two negations are expected to cancel.
; The label pattern is terminated with ':' like the other labels in this file
; so it cannot match a longer symbol sharing this prefix.
251 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
252 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
253 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
255 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
256 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
258 ; SI-FLUSH: buffer_store_dword [[R2]]
259 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
261 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
263 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
264 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
266 ; SI-DENORM: buffer_store_dword [[RESULT]]
267 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
268 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
269 %tid = call i32 @llvm.amdgcn.workitem.id.x()
270 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
271 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
272 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
274 %r1 = load volatile float, float addrspace(1)* %gep.0
275 %r2 = load volatile float, float addrspace(1)* %gep.1
277 %r1.fneg = fneg float %r1
279 %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
280 store float %r3, float addrspace(1)* %gep.out
; fmuladd_2.0_neg_a_b_f32: fmuladd(2.0, fneg(a), b). a and b are volatile loads
; from %out[tid] and %out[tid + 1]; the result is stored to %out[tid]. %in is
; not referenced by the instructions shown.
; The expectations show the negation absorbed into the constant (-2.0) for the
; mac/fmac/fma forms, and a + a followed by a subtract from b under
; GCN-DENORM-SLOWFMA.
284 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
285 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
286 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
288 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
289 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
291 ; SI-FLUSH: buffer_store_dword [[R2]]
292 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
294 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
296 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
297 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
299 ; SI-DENORM: buffer_store_dword [[RESULT]]
300 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
301 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
302 %tid = call i32 @llvm.amdgcn.workitem.id.x()
303 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
304 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
305 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
307 %r1 = load volatile float, float addrspace(1)* %gep.0
308 %r2 = load volatile float, float addrspace(1)* %gep.1
310 %r1.fneg = fneg float %r1
312 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
313 store float %r3, float addrspace(1)* %gep.out
; fmuladd_2.0_a_neg_b_f32: fmuladd(2.0, a, fneg(b)), i.e. the addend is the
; negated value. a and b are volatile loads from %out[tid] and %out[tid + 1];
; the result is stored to %out[tid]. %in is not referenced by the instructions
; shown.
; Unlike the sibling tests, the flush expectations here are the three-operand
; v_mad_f32 / v_fma_f32 forms writing a fresh result register, with b written
; as a negated operand. GCN-DENORM-SLOWFMA expects a + a followed by a
; subtract of b.
317 ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
318 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
319 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
320 ; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
321 ; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
323 ; SI-FLUSH: buffer_store_dword [[RESULT]]
324 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
326 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
328 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
329 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
331 ; SI-DENORM: buffer_store_dword [[RESULT]]
332 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
333 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
334 %tid = call i32 @llvm.amdgcn.workitem.id.x()
335 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
336 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
337 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
339 %r1 = load volatile float, float addrspace(1)* %gep.0
340 %r2 = load volatile float, float addrspace(1)* %gep.1
342 %r2.fneg = fneg float %r2
344 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
345 store float %r3, float addrspace(1)* %gep.out
; mad_sub_f32: computes a * b - c with a plain fmul and fsub (no fast-math
; flags). a, b and c are volatile loads from %ptr[tid], %ptr[tid + 1] and
; %ptr[tid + 2] (tid sign-extended to i64); the result goes to %out[tid].
; The GCN-FLUSH runs expect a single v_mad_f32 with c negated; the
; GCN-DENORM-STRICT and GCN-DENORM-SLOWFMA-CONTRACT runs expect a separate
; multiply and subtract.
349 ; GCN-LABEL: {{^}}mad_sub_f32:
350 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
351 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
352 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
353 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
355 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
357 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
358 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
360 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
361 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
363 ; SI: buffer_store_dword [[RESULT]]
364 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
365 define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
366 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
367 %tid.ext = sext i32 %tid to i64
368 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
369 %add1 = add i64 %tid.ext, 1
370 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
371 %add2 = add i64 %tid.ext, 2
372 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
373 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
374 %a = load volatile float, float addrspace(1)* %gep0, align 4
375 %b = load volatile float, float addrspace(1)* %gep1, align 4
376 %c = load volatile float, float addrspace(1)* %gep2, align 4
377 %mul = fmul float %a, %b
378 %sub = fsub float %mul, %c
379 store float %sub, float addrspace(1)* %outgep, align 4
; mad_sub_inv_f32: computes c - a * b (the operands of mad_sub_f32's subtract
; swapped). a, b and c are volatile loads from %ptr[tid], %ptr[tid + 1] and
; %ptr[tid + 2]; the result goes to %out[tid].
; The fused expectations place the negation on a rather than on c; the
; unfused expectations subtract the product from c.
383 ; GCN-LABEL: {{^}}mad_sub_inv_f32:
384 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
385 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
386 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
388 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
390 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
392 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
393 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
395 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
396 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
398 ; SI: buffer_store_dword [[RESULT]]
399 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
400 define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
401 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
402 %tid.ext = sext i32 %tid to i64
403 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
404 %add1 = add i64 %tid.ext, 1
405 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
406 %add2 = add i64 %tid.ext, 2
407 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
408 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
409 %a = load volatile float, float addrspace(1)* %gep0, align 4
410 %b = load volatile float, float addrspace(1)* %gep1, align 4
411 %c = load volatile float, float addrspace(1)* %gep2, align 4
412 %mul = fmul float %a, %b
413 %sub = fsub float %c, %mul
414 store float %sub, float addrspace(1)* %outgep, align 4
; mad_sub_fabs_f32: computes a * b - fabs(c). a, b and c are volatile loads
; from %ptr[tid], %ptr[tid + 1] and %ptr[tid + 2]; the result goes to
; %out[tid].
; The fused expectations write c as -|c| in the third operand; the unfused
; expectations keep the |c| on the subtract, which is then matched in its
; _e64 form.
418 ; GCN-LABEL: {{^}}mad_sub_fabs_f32:
419 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
420 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
421 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
422 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
424 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
426 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
427 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
429 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
430 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
432 ; SI: buffer_store_dword [[RESULT]]
433 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
434 define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
435 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
436 %tid.ext = sext i32 %tid to i64
437 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
438 %add1 = add i64 %tid.ext, 1
439 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
440 %add2 = add i64 %tid.ext, 2
441 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
442 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
443 %a = load volatile float, float addrspace(1)* %gep0, align 4
444 %b = load volatile float, float addrspace(1)* %gep1, align 4
445 %c = load volatile float, float addrspace(1)* %gep2, align 4
446 %c.abs = call float @llvm.fabs.f32(float %c) #0
447 %mul = fmul float %a, %b
448 %sub = fsub float %mul, %c.abs
449 store float %sub, float addrspace(1)* %outgep, align 4
; mad_sub_fabs_inv_f32: computes fabs(c) - a * b. a, b and c are volatile loads
; from %ptr[tid], %ptr[tid + 1] and %ptr[tid + 2]; the result goes to
; %out[tid].
; The fused expectations negate a and take |c| as the third operand; the
; unfused expectations subtract the product from |c|.
; The flush expectation uses the plain GCN-FLUSH prefix, matching mad_sub_f32,
; mad_sub_inv_f32, mad_sub_fabs_f32 and mad_fabs_sub_f32. It previously had a
; second line under a GCN-FLUSH-FMA prefix that no run line in this file
; enables, so the FMAC flush configurations were not checked here at all.
; NOTE(review): re-run the test on the gfx906 and gfx1030 flush configurations
; to confirm they select v_mad_f32 here as they do for the sibling tests.
453 ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
454 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
455 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
456 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
457 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
460 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
462 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
463 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
465 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
466 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
468 ; SI: buffer_store_dword [[RESULT]]
469 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
470 define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
471 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
472 %tid.ext = sext i32 %tid to i64
473 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
474 %add1 = add i64 %tid.ext, 1
475 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
476 %add2 = add i64 %tid.ext, 2
477 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
478 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
479 %a = load volatile float, float addrspace(1)* %gep0, align 4
480 %b = load volatile float, float addrspace(1)* %gep1, align 4
481 %c = load volatile float, float addrspace(1)* %gep2, align 4
482 %c.abs = call float @llvm.fabs.f32(float %c) #0
483 %mul = fmul float %a, %b
484 %sub = fsub float %c.abs, %mul
485 store float %sub, float addrspace(1)* %outgep, align 4
; neg_neg_mad_f32: computes fneg(a) * fneg(b) + c. a, b and c are volatile
; loads from %ptr[tid], %ptr[tid + 1] and %ptr[tid + 2]; the result goes to
; %out[tid].
; None of the expectations carries a negated operand, i.e. the two negations
; are expected to cancel. The flush form accumulates into c's register.
; Note that the final value is named %sub although it is produced by an fadd.
489 ; GCN-LABEL: {{^}}neg_neg_mad_f32:
490 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
491 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
492 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
494 ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
495 ; SI-FLUSH: buffer_store_dword [[REGC]]
496 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
498 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
500 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
501 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
503 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
504 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
506 ; SI-DENORM: buffer_store_dword [[RESULT]]
507 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
508 define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
509 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
510 %tid.ext = sext i32 %tid to i64
511 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
512 %add1 = add i64 %tid.ext, 1
513 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
514 %add2 = add i64 %tid.ext, 2
515 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
516 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
517 %a = load volatile float, float addrspace(1)* %gep0, align 4
518 %b = load volatile float, float addrspace(1)* %gep1, align 4
519 %c = load volatile float, float addrspace(1)* %gep2, align 4
520 %nega = fneg float %a
521 %negb = fneg float %b
522 %mul = fmul float %nega, %negb
523 %sub = fadd float %mul, %c
524 store float %sub, float addrspace(1)* %outgep, align 4
; mad_fabs_sub_f32: computes a * fabs(b) - c. a, b and c are volatile loads
; from %ptr[tid], %ptr[tid + 1] and %ptr[tid + 2]; the result goes to
; %out[tid].
; The fused expectations take |b| as the second operand and negate c; the
; unfused expectations keep the |b| on the multiply, which is then matched in
; its _e64 form, followed by a plain subtract.
528 ; GCN-LABEL: {{^}}mad_fabs_sub_f32:
529 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
530 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
531 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
532 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
534 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
536 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
537 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
539 ; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
540 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
542 ; SI: buffer_store_dword [[RESULT]]
543 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
544 define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
545 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
546 %tid.ext = sext i32 %tid to i64
547 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
548 %add1 = add i64 %tid.ext, 1
549 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
550 %add2 = add i64 %tid.ext, 2
551 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
552 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
553 %a = load volatile float, float addrspace(1)* %gep0, align 4
554 %b = load volatile float, float addrspace(1)* %gep1, align 4
555 %c = load volatile float, float addrspace(1)* %gep2, align 4
556 %b.abs = call float @llvm.fabs.f32(float %b) #0
557 %mul = fmul float %a, %b.abs
558 %sub = fsub float %mul, %c
559 store float %sub, float addrspace(1)* %outgep, align 4
; fsub_c_fadd_a_a_f32: computes b - (a + a) with a plain fadd and fsub. a and b
; are volatile loads from %out[tid] and %out[tid + 1]; the result is stored to
; %out[tid]. %in is not referenced by the instructions shown.
; The GCN-FLUSH runs expect the a + a to become a multiply by -2.0 accumulated
; into b's register; the unfused expectations are a + a followed by a subtract
; from b.
563 ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
564 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
565 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
566 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
567 ; SI-FLUSH: buffer_store_dword [[R2]]
568 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
570 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
572 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
573 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
575 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
576 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
578 ; SI-DENORM: buffer_store_dword [[RESULT]]
579 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
580 define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
581 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
582 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
583 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
584 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
586 %r1 = load volatile float, float addrspace(1)* %gep.0
587 %r2 = load volatile float, float addrspace(1)* %gep.1
589 %add = fadd float %r1, %r1
590 %r3 = fsub float %r2, %add
592 store float %r3, float addrspace(1)* %gep.out
; fsub_fadd_a_a_c_f32: computes (a + a) - b with a plain fadd and fsub. a and b
; are volatile loads from %out[tid] and %out[tid + 1]; the result is stored to
; %out[tid]. %in is not referenced by the instructions shown.
; The GCN-FLUSH runs expect a three-operand v_mad_f32 of a, 2.0 and negated b
; writing a fresh result register; the unfused expectations are a + a followed
; by a subtract of b.
596 ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
597 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
598 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
599 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
601 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
603 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
604 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
606 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
607 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
609 ; SI: buffer_store_dword [[RESULT]]
610 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
611 define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
612 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
613 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
614 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
615 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
617 %r1 = load volatile float, float addrspace(1)* %gep.0
618 %r2 = load volatile float, float addrspace(1)* %gep.1
620 %add = fadd float %r1, %r1
621 %r3 = fsub float %add, %r2
623 store float %r3, float addrspace(1)* %gep.out
; Attribute groups: #0 is attached to every kernel definition (and to some
; call sites); #1 is attached to the intrinsic declarations.
627 attributes #0 = { nounwind }
628 attributes #1 = { nounwind readnone }