; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]

; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                                float addrspace(1)* %in1,
                                float addrspace(1)* %in2,
                                float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]

; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                                double addrspace(1)* %in1,
                                double addrspace(1)* %in2,
                                double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}
; Make sure negative constant cancels out fneg
; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }