1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
5 ; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
7 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
8 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
11 declare i32 @llvm.amdgcn.workitem.id.x() #0
12 declare double @llvm.fabs.f64(double) #0
13 declare double @llvm.fma.f64(double, double, double) #0
14 declare float @llvm.fma.f32(float, float, float) #0
15 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
; Test that an f64 fmul feeding an fadd is contracted into a single fused
; multiply-add (loads are volatile so each operand stays distinct for matching).
17 ; (fadd (fmul x, y), z) -> (fma x, y, z)
18 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
19 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
20 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
21 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
22 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
23 ; SI: buffer_store_dwordx2 [[RESULT]]
24 define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
25 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
26 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
27 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
28 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
29 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
31 %a = load volatile double, double addrspace(1)* %gep.0
32 %b = load volatile double, double addrspace(1)* %gep.1
33 %c = load volatile double, double addrspace(1)* %gep.2
35 %mul = fmul double %a, %b
36 %fma = fadd double %mul, %c
37 store double %fma, double addrspace(1)* %gep.out
; Same contraction as above, but the fmul has two fadd users; both adds should
; still become fused multiply-adds (the multiply is duplicated into each).
41 ; (fadd (fmul x, y), z) -> (fma x, y, z)
42 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
43 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
44 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
45 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
46 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
47 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
48 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
49 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
50 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
52 define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
53 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
54 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
55 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
56 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
57 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
58 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
59 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
61 %a = load volatile double, double addrspace(1)* %gep.0
62 %b = load volatile double, double addrspace(1)* %gep.1
63 %c = load volatile double, double addrspace(1)* %gep.2
64 %d = load volatile double, double addrspace(1)* %gep.3
66 %mul = fmul double %a, %b
67 %fma0 = fadd double %mul, %c
68 %fma1 = fadd double %mul, %d
69 store volatile double %fma0, double addrspace(1)* %gep.out.0
70 store volatile double %fma1, double addrspace(1)* %gep.out.1
; Commuted form of the first test: the multiply is the second fadd operand,
; and must still be recognized and contracted into a fused multiply-add.
74 ; (fadd x, (fmul y, z)) -> (fma y, z, x)
75 ; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
76 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
77 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
78 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
79 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
80 ; SI: buffer_store_dwordx2 [[RESULT]]
81 define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
82 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
83 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
84 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
85 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
86 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
88 %a = load volatile double, double addrspace(1)* %gep.0
89 %b = load volatile double, double addrspace(1)* %gep.1
90 %c = load volatile double, double addrspace(1)* %gep.2
92 %mul = fmul double %a, %b
93 %fma = fadd double %c, %mul
94 store double %fma, double addrspace(1)* %gep.out
; fsub of a multiply result: expects a fused multiply-add with the addend
; negated via the source modifier (the -[[C]] operand) rather than a separate neg.
98 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
99 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
100 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
101 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
102 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
103 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
104 ; SI: buffer_store_dwordx2 [[RESULT]]
105 define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
106 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
107 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
108 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
109 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
110 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
112 %a = load volatile double, double addrspace(1)* %gep.0
113 %b = load volatile double, double addrspace(1)* %gep.1
114 %c = load volatile double, double addrspace(1)* %gep.2
116 %mul = fmul double %a, %b
117 %fma = fsub double %mul, %c
118 store double %fma, double addrspace(1)* %gep.out
; Two fsub users of one multiply: each should become a fused multiply-add
; with its own negated addend.
122 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
123 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
124 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
125 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
126 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
127 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
128 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
129 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
130 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
131 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
133 define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
134 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
135 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
136 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
137 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
138 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
139 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
140 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
142 %a = load volatile double, double addrspace(1)* %gep.0
143 %b = load volatile double, double addrspace(1)* %gep.1
144 %c = load volatile double, double addrspace(1)* %gep.2
145 %d = load volatile double, double addrspace(1)* %gep.3
147 %mul = fmul double %a, %b
148 %fma0 = fsub double %mul, %c
149 %fma1 = fsub double %mul, %d
150 store volatile double %fma0, double addrspace(1)* %gep.out.0
151 store volatile double %fma1, double addrspace(1)* %gep.out.1
; Multiply subtracted from a value: the negation moves onto the first
; multiplicand source operand (-[[A]]) instead of a separate instruction.
155 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
156 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
157 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
158 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
159 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
160 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
161 ; SI: buffer_store_dwordx2 [[RESULT]]
162 define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
163 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
164 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
165 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
166 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
167 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
169 %a = load volatile double, double addrspace(1)* %gep.0
170 %b = load volatile double, double addrspace(1)* %gep.1
171 %c = load volatile double, double addrspace(1)* %gep.2
173 %mul = fmul double %a, %b
174 %fma = fsub double %c, %mul
175 store double %fma, double addrspace(1)* %gep.out
; Two-use variant of the previous pattern: both subtractions of the shared
; multiply must fold, each with the negated first multiplicand.
179 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
180 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
181 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
182 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
183 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
184 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
185 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
186 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
187 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
188 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
190 define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
191 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
192 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
193 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
194 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
195 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
196 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
197 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
199 %a = load volatile double, double addrspace(1)* %gep.0
200 %b = load volatile double, double addrspace(1)* %gep.1
201 %c = load volatile double, double addrspace(1)* %gep.2
202 %d = load volatile double, double addrspace(1)* %gep.3
204 %mul = fmul double %a, %b
205 %fma0 = fsub double %c, %mul
206 %fma1 = fsub double %d, %mul
207 store volatile double %fma0, double addrspace(1)* %gep.out.0
208 store volatile double %fma1, double addrspace(1)* %gep.out.1
; A negated multiply (fneg spelled as fsub from -0.0) minus a value: both the
; multiplicand and the addend should carry negate source modifiers.
212 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
213 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
214 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
215 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
216 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
217 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
218 ; SI: buffer_store_dwordx2 [[RESULT]]
219 define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
220 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
221 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
222 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
223 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
224 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
226 %a = load volatile double, double addrspace(1)* %gep.0
227 %b = load volatile double, double addrspace(1)* %gep.1
228 %c = load volatile double, double addrspace(1)* %gep.2
230 %mul = fmul double %a, %b
231 %mul.neg = fsub double -0.0, %mul
232 %fma = fsub double %mul.neg, %c
234 store double %fma, double addrspace(1)* %gep.out
; The negated multiply has two fsub users; both results should use the
; negated forms of the shared operands.
238 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
239 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
240 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
241 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
242 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
243 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
244 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
245 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
246 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
247 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
249 define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
250 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
251 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
252 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
253 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
254 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
255 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
256 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
258 %a = load volatile double, double addrspace(1)* %gep.0
259 %b = load volatile double, double addrspace(1)* %gep.1
260 %c = load volatile double, double addrspace(1)* %gep.2
261 %d = load volatile double, double addrspace(1)* %gep.3
263 %mul = fmul double %a, %b
264 %mul.neg = fsub double -0.0, %mul
265 %fma0 = fsub double %mul.neg, %c
266 %fma1 = fsub double %mul.neg, %d
268 store volatile double %fma0, double addrspace(1)* %gep.out.0
269 store volatile double %fma1, double addrspace(1)* %gep.out.1
; Mixed users: one fsub consumes the negated multiply and one consumes the
; plain multiply, so only the first result negates the multiplicand.
273 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
274 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
275 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
276 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
277 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
278 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
279 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
280 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
281 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
282 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
284 define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
285 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
286 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
287 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
288 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
289 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
290 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
291 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
293 %a = load volatile double, double addrspace(1)* %gep.0
294 %b = load volatile double, double addrspace(1)* %gep.1
295 %c = load volatile double, double addrspace(1)* %gep.2
296 %d = load volatile double, double addrspace(1)* %gep.3
298 %mul = fmul double %a, %b
299 %mul.neg = fsub double -0.0, %mul
300 %fma0 = fsub double %mul.neg, %c
301 %fma1 = fsub double %mul, %d
303 store volatile double %fma0, double addrspace(1)* %gep.out.0
304 store volatile double %fma1, double addrspace(1)* %gep.out.1
308 ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
; Aggressive re-association through an existing fma intrinsic: the safe run
; keeps mul+fma+add, while the unsafe-math run nests two fused multiply-adds.
310 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
311 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
312 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
313 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
314 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
315 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
317 ; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
318 ; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
319 ; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]
321 ; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
322 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
324 ; SI: buffer_store_dwordx2 [[RESULT]]
325 define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
326 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
327 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
328 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
329 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
330 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
331 %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
332 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
334 %x = load volatile double, double addrspace(1)* %gep.0
335 %y = load volatile double, double addrspace(1)* %gep.1
336 %z = load volatile double, double addrspace(1)* %gep.2
337 %u = load volatile double, double addrspace(1)* %gep.3
338 %v = load volatile double, double addrspace(1)* %gep.4
340 %tmp0 = fmul double %u, %v
341 %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
342 %tmp2 = fsub double %tmp1, %z
344 store double %tmp2, double addrspace(1)* %gep.out
; Subtracting an fma-of-mul chain from a value; the nsz fast-math flags on
; the IR permit the sign-of-zero change this re-association can introduce.
348 ; fold (fsub x, (fma y, z, (fmul u, v)))
349 ;   -> (fma (fneg y), z, (fma (fneg u), v, x))
351 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
352 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
353 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
354 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
355 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
356 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
358 ; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
359 ; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
360 ; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]
362 ; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
363 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
365 ; SI: buffer_store_dwordx2 [[RESULT]]
366 define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
367 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
368 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
369 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
370 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
371 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
372 %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
373 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
375 %x = load volatile double, double addrspace(1)* %gep.0
376 %y = load volatile double, double addrspace(1)* %gep.1
377 %z = load volatile double, double addrspace(1)* %gep.2
378 %u = load volatile double, double addrspace(1)* %gep.3
379 %v = load volatile double, double addrspace(1)* %gep.4
381 ; nsz flag is needed since this combine may change sign of zero
382 %tmp0 = fmul nsz double %u, %v
383 %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
384 %tmp2 = fsub nsz double %x, %tmp1
386 store double %tmp2, double addrspace(1)* %gep.out
; The next four tests cover x*(y+1) / x*(y-1) style folds in both operand
; orders; with contraction enabled the add-of-constant-one disappears into a
; fused multiply-add whose addend is (+/-) the other multiplicand.
391 ; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
394 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
395 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
396 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
398 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
399 define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
400 float addrspace(1)* %in1,
401 float addrspace(1)* %in2) {
402 %x = load volatile float, float addrspace(1)* %in1
403 %y = load volatile float, float addrspace(1)* %in2
404 %a = fadd float %x, 1.0
405 %m = fmul float %a, %y
406 store float %m, float addrspace(1)* %out
410 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
411 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
412 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
414 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
415 define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
416 float addrspace(1)* %in1,
417 float addrspace(1)* %in2) {
418 %x = load volatile float, float addrspace(1)* %in1
419 %y = load volatile float, float addrspace(1)* %in2
420 %a = fadd float %x, 1.0
421 %m = fmul float %y, %a
422 store float %m, float addrspace(1)* %out
426 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
427 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
428 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
430 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
431 define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
432 float addrspace(1)* %in1,
433 float addrspace(1)* %in2) {
434 %x = load float, float addrspace(1)* %in1
435 %y = load float, float addrspace(1)* %in2
436 %a = fadd float %x, -1.0
437 %m = fmul float %a, %y
438 store float %m, float addrspace(1)* %out
442 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
443 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
444 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
446 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
447 define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
448 float addrspace(1)* %in1,
449 float addrspace(1)* %in2) {
450 %x = load float, float addrspace(1)* %in1
451 %y = load float, float addrspace(1)* %in2
452 %a = fadd float %x, -1.0
453 %m = fmul float %y, %a
454 store float %m, float addrspace(1)* %out
; These four tests cover (1-x)*y and (-1-x)*y in both operand orders; the
; contracted form negates the multiplicand (and the addend for the -1 cases).
458 ; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
459 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
460 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
462 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
463 define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
464 float addrspace(1)* %in1,
465 float addrspace(1)* %in2) {
466 %x = load float, float addrspace(1)* %in1
467 %y = load float, float addrspace(1)* %in2
468 %s = fsub float 1.0, %x
469 %m = fmul float %s, %y
470 store float %m, float addrspace(1)* %out
474 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
475 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
476 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
478 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
479 define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
480 float addrspace(1)* %in1,
481 float addrspace(1)* %in2) {
482 %x = load float, float addrspace(1)* %in1
483 %y = load float, float addrspace(1)* %in2
484 %s = fsub float 1.0, %x
485 %m = fmul float %y, %s
486 store float %m, float addrspace(1)* %out
490 ; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
491 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
492 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
494 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
495 define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
496 float addrspace(1)* %in1,
497 float addrspace(1)* %in2) {
498 %x = load float, float addrspace(1)* %in1
499 %y = load float, float addrspace(1)* %in2
500 %s = fsub float -1.0, %x
501 %m = fmul float %s, %y
502 store float %m, float addrspace(1)* %out
506 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
507 ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
508 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
510 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
511 define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
512 float addrspace(1)* %in1,
513 float addrspace(1)* %in2) {
514 %x = load float, float addrspace(1)* %in1
515 %y = load float, float addrspace(1)* %in2
516 %s = fsub float -1.0, %x
517 %m = fmul float %y, %s
518 store float %m, float addrspace(1)* %out
; These four tests cover (x-1)*y and (x+1)*y (spelled as x - (-1)) in both
; operand orders; the subtract-of-one canonicalizes to an add of -1.0 first,
; then contracts into a fused multiply-add.
522 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
523 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
524 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
526 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
527 define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
528 float addrspace(1)* %in1,
529 float addrspace(1)* %in2) {
530 %x = load float, float addrspace(1)* %in1
531 %y = load float, float addrspace(1)* %in2
532 %s = fsub float %x, 1.0
533 %m = fmul float %s, %y
534 store float %m, float addrspace(1)* %out
538 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
539 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
540 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
542 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
543 define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
544 float addrspace(1)* %in1,
545 float addrspace(1)* %in2) {
546 %x = load float, float addrspace(1)* %in1
547 %y = load float, float addrspace(1)* %in2
548 %s = fsub float %x, 1.0
549 %m = fmul float %y, %s
550 store float %m, float addrspace(1)* %out
554 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
555 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
556 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
558 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
559 define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
560 float addrspace(1)* %in1,
561 float addrspace(1)* %in2) {
562 %x = load float, float addrspace(1)* %in1
563 %y = load float, float addrspace(1)* %in2
564 %s = fsub float %x, -1.0
565 %m = fmul float %s, %y
566 store float %m, float addrspace(1)* %out
570 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
571 ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
572 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
574 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
575 define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
576 float addrspace(1)* %in1,
577 float addrspace(1)* %in2) {
578 %x = load float, float addrspace(1)* %in1
579 %y = load float, float addrspace(1)* %in2
580 %s = fsub float %x, -1.0
581 %m = fmul float %y, %s
582 store float %m, float addrspace(1)* %out
; Linear-interpolation pattern x*t + y*(1-t), in f32 and f64: the aggressive
; combine rewrites it to two chained fused multiply-adds, eliminating the
; explicit (1-t) computation.
587 ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
590 ; FUNC-LABEL: {{^}}test_f32_interp:
591 ; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
592 ; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
593 ; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
595 ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
596 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
597 define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
598 float addrspace(1)* %in1,
599 float addrspace(1)* %in2,
600 float addrspace(1)* %in3) {
601 %x = load float, float addrspace(1)* %in1
602 %y = load float, float addrspace(1)* %in2
603 %t = load float, float addrspace(1)* %in3
604 %t1 = fsub float 1.0, %t
605 %tx = fmul float %x, %t
606 %ty = fmul float %y, %t1
607 %r = fadd float %tx, %ty
608 store float %r, float addrspace(1)* %out
612 ; FUNC-LABEL: {{^}}test_f64_interp:
613 ; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
614 ; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
615 ; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
617 ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
618 ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
619 define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
620 double addrspace(1)* %in1,
621 double addrspace(1)* %in2,
622 double addrspace(1)* %in3) {
623 %x = load double, double addrspace(1)* %in1
624 %y = load double, double addrspace(1)* %in2
625 %t = load double, double addrspace(1)* %in3
626 %t1 = fsub double 1.0, %t
627 %tx = fmul double %x, %t
628 %ty = fmul double %y, %t1
629 %r = fadd double %tx, %ty
630 store double %r, double addrspace(1)* %out
; Make sure negative constant cancels out fneg
; fma(-2.0, fneg(a), b) should fold to fma(a, 2.0, b): the two negations
; cancel, so no literal -2.0 or v_xor materializes.
; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
; NOTE(review): this kernel is tagged #0 (nounwind readnone) despite doing
; volatile loads and a store -- confirm this is intentional for the test.
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; in/out addressing is based on %out for both loads and the store
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; volatile keeps both loads from being combined or reordered away
  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1
  %r1.fneg = fneg float %r1
  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
; Converse of the test above: fma(2.0, fneg(a), b) folds the negation into
; the constant, giving fma(a, -2.0, b).
; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
; NOTE(review): tagged #0 (nounwind readnone) despite volatile loads and a
; store -- confirm this is intentional for the test.
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; volatile keeps both loads from being combined or reordered away
  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1
  %r1.fneg = fneg float %r1
  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
; Vector case: the fnegs on the second and third fma operands should fold
; into v_fma_f32 source modifiers (the -v operands below), one scalar fma
; per element of the <4 x float>.  Requires nsz (attribute set #2).
; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
  %gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
  %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2
  ; fast flags permit the fneg-into-modifier folds
  %fneg0 = fneg fast <4 x float> %tmp0
  %fneg1 = fneg fast <4 x float> %tmp1
  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
  store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out
; Attribute sets referenced throughout the file:
;   #0: intrinsic declarations (and some kernels) -- nounwind readnone
;   #1: plain nounwind kernels
;   #2: kernels that additionally set no-signed-zeros-fp-math for the
;       fneg-folding tests
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }