; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; Intrinsics referenced by the kernels below (all nounwind readnone via #0).
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; Define [[D]] before use: FileCheck rejects a use of an undefined pattern variable.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; Define [[D]] before use: FileCheck rejects a use of an undefined pattern variable.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]

; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]

; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2,
                                           float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]

; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                                           double addrspace(1)* %in1,
                                           double addrspace(1)* %in2,
                                           double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }